├── .github ├── dependabot.yml └── workflows │ ├── release.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── LICENSE ├── Makefile ├── README.rst ├── README.template.rst ├── data_extractor ├── __init__.py ├── contrib │ └── mypy │ │ └── __init__.py ├── core.py ├── exceptions.py ├── item.py ├── json.py ├── lxml.py ├── py.typed └── utils.py ├── default.nix ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ └── custom.css │ ├── api_core.rst │ ├── api_exceptions.rst │ ├── api_item.rst │ ├── api_json.rst │ ├── api_lxml.rst │ ├── api_reference.rst │ ├── api_utils.rst │ ├── changelog.rst │ ├── conf.py │ ├── contributing.rst │ ├── history.rst │ ├── howto │ ├── index.rst │ ├── item.rst │ ├── json.rst │ └── lxml.rst │ ├── index.rst │ ├── installation.rst │ ├── quickstarts.rst │ └── readme.rst ├── flake.lock ├── flake.nix ├── noxfile.py ├── pdm.lock ├── pyproject.toml ├── pytest.ini ├── scripts ├── build_readme.py ├── export_requirements_txt.py └── requirements │ ├── requirements-dev.txt │ ├── requirements-docs.txt │ ├── requirements-mini.txt │ └── requirements.txt ├── setup.cfg └── tests ├── __init__.py ├── assets └── sample-rss-2.xml ├── conftest.py ├── mypy.ini ├── test_exceptions.py ├── test_generic_item.py ├── test_item.py ├── test_json.py ├── test_lxml.py ├── test_utils.py ├── typesafety ├── conftest.py ├── test_extracted_typed_dict.yml └── test_generic.yml └── utils.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | 
steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up PDM 14 | uses: pdm-project/setup-pdm@v2.5 15 | with: 16 | python-version: 3.9 17 | version: 2.19.2 18 | - name: Build release 19 | run: | 20 | pdm build 21 | - name: Upload release 22 | uses: actions/upload-artifact@v4 23 | with: 24 | name: dist 25 | path: dist 26 | - name: Publish release to PYPI 27 | run: | 28 | pip install twine 29 | twine upload -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} --verbose dist/* 30 | - name: Publish release to GitHub Release 31 | uses: softprops/action-gh-release@v2 32 | with: 33 | files: dist/* 34 | env: 35 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Lint&Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - release/* 8 | pull_request: 9 | branches: 10 | - "*" 11 | 12 | jobs: 13 | lint: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v1 17 | - name: Set up PDM 18 | uses: pdm-project/setup-pdm@v3 19 | with: 20 | python-version: 3.13 21 | version: 2.19.2 22 | - name: Cache Nox Virtualenvs 23 | uses: actions/cache@v1 24 | with: 25 | path: .nox 26 | key: ${{ runner.os }}-nox-${{ hashFiles('**/pdm.lock') }} 27 | restore-keys: ${{ runner.os }}-nox 28 | - name: Install nox 29 | run: | 30 | pip install nox 31 | pdm config python.use_venv true 32 | - uses: pre-commit/action@v2.0.0 33 | env: 34 | SKIP: export_requirements_txt 35 | test: 36 | needs: lint 37 | runs-on: ubuntu-latest 38 | strategy: 39 | matrix: 40 | python-version: ["3.10", "3.11", "3.12", "3.13"] 41 | steps: 42 | - uses: actions/checkout@v1 43 | - name: Set up PDM 44 | uses: pdm-project/setup-pdm@v3 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | version: 2.19.2 48 | - name: Cache Nox Virtualenvs 49 | uses: actions/cache@v1 50 | with: 51 | path: .nox 52 
| key: ${{ runner.os }}-${{ matrix.python-version }}-nox-${{ hashFiles('**/pdm.lock') }} 53 | restore-keys: ${{ runner.os }}-${{ matrix.python-version }}-nox 54 | - name: Install nox 55 | run: | 56 | pip install nox 57 | pdm config python.use_venv true 58 | - name: Test with coverage 59 | run: | 60 | make PYTHON=${{ matrix.python-version }} cov 61 | - name: Upload coverage to Codecov 62 | uses: codecov/codecov-action@v2 63 | with: 64 | token: ${{ secrets.CODECOV_TOKEN }} 65 | flags: main,unittest,${{ matrix.python-version }} 66 | fail_ci_if_error: true 67 | test-mypy-plugin: 68 | needs: lint 69 | runs-on: ubuntu-latest 70 | strategy: 71 | matrix: 72 | python-version: ["3.10", "3.11", "3.12", "3.13"] 73 | steps: 74 | - uses: actions/checkout@v1 75 | - name: Set up PDM 76 | uses: pdm-project/setup-pdm@v3 77 | with: 78 | python-version: ${{ matrix.python-version }} 79 | - name: Cache Nox Virtualenvs 80 | uses: actions/cache@v1 81 | with: 82 | path: .nox 83 | key: ${{ runner.os }}-${{ matrix.python-version }}-nox-${{ hashFiles('**/pdm.lock') }} 84 | restore-keys: ${{ runner.os }}-${{ matrix.python-version }}-nox 85 | - name: Install nox 86 | run: | 87 | pip install nox 88 | pdm config python.use_venv true 89 | - name: Test 90 | run: | 91 | make PYTHON=${{ matrix.python-version }} test-mypy-plugin 92 | - name: Upload coverage to Codecov 93 | uses: codecov/codecov-action@v2 94 | with: 95 | token: ${{ secrets.CODECOV_TOKEN }} 96 | flags: plugin-mypy,unittest,${{ matrix.python-version }} 97 | fail_ci_if_error: true 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,emacs 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,emacs 4 | 5 | ### Emacs ### 6 | # -*- mode: gitignore; -*- 7 | *~ 8 | \#*\# 9 | /.emacs.desktop 10 | /.emacs.desktop.lock 11 | 
*.elc 12 | auto-save-list 13 | tramp 14 | .\#* 15 | 16 | # Org-mode 17 | .org-id-locations 18 | *_archive 19 | ltximg/** 20 | 21 | # flymake-mode 22 | *_flymake.* 23 | 24 | # eshell files 25 | /eshell/history 26 | /eshell/lastdir 27 | 28 | # elpa packages 29 | /elpa/ 30 | 31 | # reftex files 32 | *.rel 33 | 34 | # AUCTeX auto folder 35 | /auto/ 36 | 37 | # cask packages 38 | .cask/ 39 | dist/ 40 | 41 | # Flycheck 42 | flycheck_*.el 43 | 44 | # server auth directory 45 | /server/ 46 | 47 | # projectiles files 48 | .projectile 49 | 50 | # directory configuration 51 | .dir-locals.el 52 | 53 | # network security 54 | /network-security.data 55 | 56 | 57 | ### Python ### 58 | # Byte-compiled / optimized / DLL files 59 | __pycache__/ 60 | *.py[cod] 61 | *$py.class 62 | 63 | # C extensions 64 | *.so 65 | 66 | # Distribution / packaging 67 | .Python 68 | build/ 69 | develop-eggs/ 70 | downloads/ 71 | eggs/ 72 | .eggs/ 73 | lib/ 74 | lib64/ 75 | parts/ 76 | sdist/ 77 | var/ 78 | wheels/ 79 | pip-wheel-metadata/ 80 | share/python-wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | MANIFEST 85 | 86 | # PyInstaller 87 | # Usually these files are written by a python script from a template 88 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
89 | *.manifest 90 | *.spec 91 | 92 | # Installer logs 93 | pip-log.txt 94 | pip-delete-this-directory.txt 95 | 96 | # Unit test / coverage reports 97 | htmlcov/ 98 | .tox/ 99 | .nox/ 100 | .coverage 101 | .coverage.* 102 | .cache 103 | nosetests.xml 104 | coverage.xml 105 | *.cover 106 | *.py,cover 107 | .hypothesis/ 108 | .pytest_cache/ 109 | pytestdebug.log 110 | 111 | # Translations 112 | *.mo 113 | *.pot 114 | 115 | # Django stuff: 116 | *.log 117 | local_settings.py 118 | db.sqlite3 119 | db.sqlite3-journal 120 | 121 | # Flask stuff: 122 | instance/ 123 | .webassets-cache 124 | 125 | # Scrapy stuff: 126 | .scrapy 127 | 128 | # Sphinx documentation 129 | docs/_build/ 130 | doc/_build/ 131 | 132 | # PyBuilder 133 | target/ 134 | 135 | # Jupyter Notebook 136 | .ipynb_checkpoints 137 | 138 | # IPython 139 | profile_default/ 140 | ipython_config.py 141 | 142 | # pyenv 143 | .python-version 144 | 145 | # pipenv 146 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 147 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 148 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 149 | # install all needed dependencies. 150 | #Pipfile.lock 151 | 152 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 153 | __pypackages__/ 154 | 155 | # Celery stuff 156 | celerybeat-schedule 157 | celerybeat.pid 158 | 159 | # SageMath parsed files 160 | *.sage.py 161 | 162 | # Environments 163 | .env 164 | .venv 165 | env/ 166 | venv/ 167 | ENV/ 168 | env.bak/ 169 | venv.bak/ 170 | pythonenv* 171 | 172 | # Spyder project settings 173 | .spyderproject 174 | .spyproject 175 | 176 | # Rope project settings 177 | .ropeproject 178 | 179 | # mkdocs documentation 180 | /site 181 | 182 | # mypy 183 | .mypy_cache/ 184 | .dmypy.json 185 | dmypy.json 186 | 187 | # Pyre type checker 188 | .pyre/ 189 | 190 | # pytype static type analyzer 191 | .pytype/ 192 | 193 | # profiling data 194 | .prof 195 | 196 | # End of https://www.toptal.com/developers/gitignore/api/python,emacs 197 | 198 | ### Custom ### 199 | ## IDEA 200 | .vscode 201 | ## Emacs 202 | .persp-confs 203 | ## Makefile 204 | ## pdm 205 | .pdm.toml 206 | ## MacOS 207 | .DS_Store 208 | ## misc 209 | .dream2nix/ 210 | .pdm-python 211 | pdm.toml 212 | result 213 | .envrc 214 | .direnv 215 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/commitizen-tools/commitizen 3 | rev: v3.27.0 4 | hooks: 5 | - id: commitizen 6 | stages: 7 | - commit-msg 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: v4.6.0 10 | hooks: 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-yaml 14 | args: [--unsafe] 15 | - id: detect-private-key 16 | - id: end-of-file-fixer 17 | - id: trailing-whitespace 18 | - id: check-added-large-files 19 | - id: mixed-line-ending 20 | args: [--fix=lf] 21 | - repo: https://github.com/pre-commit/pygrep-hooks 22 | rev: v1.10.0 23 | hooks: 24 | - id: python-check-blanket-noqa 25 | - id: python-check-mock-methods 26 | - id: python-no-eval 27 | - id: python-no-log-warn 28 | - id: 
python-use-type-annotations 29 | - id: rst-backticks 30 | - repo: https://github.com/psf/black 31 | rev: 24.4.2 32 | hooks: 33 | - id: black 34 | - repo: https://github.com/asottile/blacken-docs 35 | rev: 1.16.0 36 | hooks: 37 | - id: blacken-docs 38 | additional_dependencies: [black==23.3.*] 39 | - repo: https://github.com/PyCQA/flake8 40 | rev: 7.0.0 41 | hooks: 42 | - id: flake8 43 | additional_dependencies: ["flake8-bugbear==23.5.*"] 44 | - repo: https://github.com/pre-commit/mirrors-mypy 45 | rev: v1.10.0 46 | hooks: 47 | - id: mypy 48 | files: data_extractor/.+\.py$ 49 | pass_filenames: false 50 | entry: bash -c 'env PYTHONPATH=.:$PYTHONPATH mypy data_extractor --show-traceback' 51 | - repo: https://github.com/pre-commit/mirrors-isort 52 | rev: v5.10.1 53 | hooks: 54 | - id: isort 55 | - repo: https://github.com/PyCQA/doc8 56 | rev: v1.1.1 57 | hooks: 58 | - id: doc8 59 | - repo: local 60 | hooks: 61 | - id: build_readme 62 | name: build_readme 63 | description: Build README.rst 64 | entry: nox -s build_readme 65 | language: system 66 | pass_filenames: false 67 | types: [rst] 68 | - id: export_requirements_txt 69 | name: export_requirements_txt 70 | description: create requirement file for python 71 | entry: python3 scripts/export_requirements_txt.py 72 | language: system 73 | files: pdm.lock 74 | pass_filenames: false 75 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.12" 7 | 8 | python: 9 | install: 10 | - requirements: ./scripts/requirements/requirements-docs.txt 11 | - path: . 
12 | 13 | sphinx: 14 | builder: html 15 | configuration: docs/source/conf.py 16 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at linw1995@icloud.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 林玮 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | help: 2 | @echo "PYTHON=X.Y init setup development environment with specific Python version" 3 | @echo "init setup development environment with default Python version 3.13" 4 | @echo "update-dev update development dependencies via pdm and via pre-commit" 5 | @echo "update update all dependencies via pdm and via pre-commit" 6 | @echo "pre-commit setup git hooks" 7 | @echo "check-all run code quality checkers" 8 | @echo "test run quick tests" 9 | @echo "vtest run quick tests with verbose" 10 | @echo "PYTHON=X.Y cov run tests with coverage and with specific Python version" 11 | @echo "cov run tests with coverage and with default Python version 3.13" 12 | @echo "test-mypy-plugin run mypy plugin tests" 13 | @echo "type-check run static type checking" 14 | 15 | EMPTY := 16 | SPACE := $(EMPTY) $(EMPTY) 17 | 18 | PYTHON = 3.13 19 | EXTRAS = lxml cssselect jsonpath-extractor jsonpath-rw jsonpath-rw-ext 20 | DEV_EXTRAS = test test-mypy-plugin docs 21 | EXTRAS_ARGS = $(if $(EXTRAS),-G,) $(subst $(SPACE),$(SPACE)-G$(SPACE),$(EXTRAS)) 22 | DEV_EXTRAS_ARGS = $(if $(DEV_EXTRAS),-G,) $(subst $(SPACE),$(SPACE)-G$(SPACE),$(DEV_EXTRAS)) 23 | 24 | # Environment setup 25 | init: 26 | @echo ">> installing $(if $(EXTRAS),\"$(EXTRAS)\" ,)$(if $(DEV_EXTRAS),\"$(DEV_EXTRAS)\" ,)dependencies by pdm" 27 | $(if $(PYTHON),pdm use -f $(PYTHON),) 28 | pdm info && pdm info --env 29 | pdm install $(EXTRAS_ARGS) $(DEV_EXTRAS_ARGS) 30 | pdm config -l python.use_venv true 31 | 32 | deinit: 33 | rm -rf .nox 34 | rm -rf __pypackages__ 35 | rm -rf .mypy_cache 36 | rm -rf htmlcov 37 | rm -rf .pytest_cache 38 | rm -rf *.egg-info 39 | 40 | update-dev: 41 | pdm update $(DEV_EXTRAS_ARGS) $(EXTRAS_ARGS) 42 | pre-commit autoupdate 43 | 44 | update: 45 | pdm update 46 | pre-commit autoupdate 47 | 48 | # Environment setup end 49 | 50 | 
pre-commit: 51 | pre-commit install --hook-type commit-msg --hook-type pre-commit --overwrite 52 | 53 | check-all: 54 | pre-commit run --all-files 55 | 56 | type-check: 57 | pre-commit run mypy 58 | 59 | test: 60 | pdm run pytest -q -x --ff --nf --ignore tests/typesafety 61 | 62 | vtest: 63 | pdm run pytest -vv -x --ff --nf --ignore tests/typesafety 64 | 65 | test-mypy-plugin: 66 | rm -rf .coverage 67 | nox -p $(PYTHON) -s test_mypy_plugin coverage_report -- $(TARGET) 68 | 69 | test-mypy-plugin-full: 70 | rm -rf .coverage 71 | nox -s test_mypy_plugin -- $(TARGET) 72 | nox -p 3.10 -s coverage_report 73 | 74 | cov: 75 | rm -rf .coverage 76 | nox -p $(PYTHON) -s coverage_test coverage_report -- $(TARGET) 77 | 78 | cov-full: 79 | rm -rf .coverage 80 | nox -s coverage_test -- $(TARGET) 81 | nox -p 3.10 -s coverage_report 82 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Data Extractor 3 | ============== 4 | 5 | |license| |Pypi Status| |Python version| |Package version| |PyPI - Downloads| 6 | |GitHub last commit| |Code style: black| |Build Status| |codecov| 7 | |Documentation Status| |PDM managed| 8 | 9 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting. 10 | 11 | Quickstarts 12 | <<<<<<<<<<< 13 | 14 | Installation 15 | ~~~~~~~~~~~~ 16 | 17 | Install the stable version from PYPI. 18 | 19 | .. code-block:: shell 20 | 21 | pip install "data-extractor[jsonpath-extractor]" # for extracting JSON data 22 | pip install "data-extractor[lxml]" # for extracting HTML data 23 | 24 | Or install the latest version from Github. 25 | 26 | .. 
code-block:: shell 27 | 28 | pip install "data-extractor[jsonpath-extractor] @ git+https://github.com/linw1995/data_extractor.git@master" 29 | 30 | Extract JSON data 31 | ~~~~~~~~~~~~~~~~~ 32 | 33 | Supported optional dependencies for extracting JSON data: 34 | 35 | - jsonpath-extractor_ 36 | - jsonpath-rw_ 37 | - jsonpath-rw-ext_ 38 | 39 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath 40 | .. _jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw 41 | .. _jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/ 42 | 43 | Install one of them to extract JSON data. 44 | 45 | Extract HTML(XML) data 46 | ~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | Supported optional dependencies for extracting HTML(XML) data: 49 | 50 | - lxml_ for using XPath_ 51 | - cssselect_ for using CSS-Selectors_ 52 | 53 | .. _lxml: https://lxml.de/ 54 | .. _XPath: https://www.w3.org/TR/xpath-10/ 55 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/ 56 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/ 57 | 58 | Usage 59 | ~~~~~ 60 | 61 | .. 
code-block:: python3 62 | 63 | from data_extractor import Field, Item, JSONExtractor 64 | 65 | 66 | class Count(Item): 67 | followings = Field(JSONExtractor("countFollowings")) 68 | fans = Field(JSONExtractor("countFans")) 69 | 70 | 71 | class User(Item): 72 | name_ = Field(JSONExtractor("name"), name="name") 73 | age = Field(JSONExtractor("age"), default=17) 74 | count = Count() 75 | 76 | 77 | assert User(JSONExtractor("data.users[*]"), is_many=True).extract( 78 | { 79 | "data": { 80 | "users": [ 81 | { 82 | "name": "john", 83 | "age": 19, 84 | "countFollowings": 14, 85 | "countFans": 212, 86 | }, 87 | { 88 | "name": "jack", 89 | "description": "", 90 | "countFollowings": 54, 91 | "countFans": 312, 92 | }, 93 | ] 94 | } 95 | } 96 | ) == [ 97 | {"name": "john", "age": 19, "count": {"followings": 14, "fans": 212}}, 98 | {"name": "jack", "age": 17, "count": {"followings": 54, "fans": 312}}, 99 | ] 100 | 101 | Changelog 102 | <<<<<<<<< 103 | 104 | v1.0.1 105 | ~~~~~~ 106 | 107 | **Build** 108 | 109 | - Supports Python 3.13 110 | 111 | 112 | 113 | Contributing 114 | <<<<<<<<<<<< 115 | 116 | 117 | Environment Setup 118 | ~~~~~~~~~~~~~~~~~ 119 | 120 | Clone the source code from GitHub. 121 | 122 | .. code-block:: shell 123 | 124 | git clone https://github.com/linw1995/data_extractor.git 125 | cd data_extractor 126 | 127 | Set up the development environment. 128 | Please make sure you install the pdm_, 129 | pre-commit_ and nox_ CLIs in your environment. 130 | 131 | .. code-block:: shell 132 | 133 | make init 134 | make PYTHON=3.10 init # for a specific Python version 135 | 136 | Linting 137 | ~~~~~~~ 138 | 139 | Use pre-commit_ for installing linters to ensure a good code style. 140 | 141 | .. code-block:: shell 142 | 143 | make pre-commit 144 | 145 | Run linters. Some linters run via CLI nox_, so make sure you install it. 146 | 147 | .. code-block:: shell 148 | 149 | make check-all 150 | 151 | Testing 152 | ~~~~~~~ 153 | 154 | Run quick tests. 155 | 156 | .. 
code-block:: shell 157 | 158 | make test 159 | 160 | Run quick tests with verbose output. 161 | 162 | .. code-block:: shell 163 | 164 | make vtest 165 | 166 | Run tests with coverage. 167 | Testing in multiple Python environments is powered by CLI nox_. 168 | 169 | .. code-block:: shell 170 | 171 | make cov 172 | 173 | .. _pdm: https://github.com/pdm-project/pdm 174 | .. _pre-commit: https://pre-commit.com/ 175 | .. _nox: https://nox.thea.codes/en/stable/ 176 | 177 | .. |license| image:: https://img.shields.io/github/license/linw1995/data_extractor.svg 178 | :target: https://github.com/linw1995/data_extractor/blob/master/LICENSE 179 | 180 | .. |Pypi Status| image:: https://img.shields.io/pypi/status/data_extractor.svg 181 | :target: https://pypi.org/project/data_extractor 182 | 183 | .. |Python version| image:: https://img.shields.io/pypi/pyversions/data_extractor.svg 184 | :target: https://pypi.org/project/data_extractor 185 | 186 | .. |Package version| image:: https://img.shields.io/pypi/v/data_extractor.svg 187 | :target: https://pypi.org/project/data_extractor 188 | 189 | .. |PyPI - Downloads| image:: https://img.shields.io/pypi/dm/data-extractor.svg 190 | :target: https://pypi.org/project/data_extractor 191 | 192 | .. |GitHub last commit| image:: https://img.shields.io/github/last-commit/linw1995/data_extractor.svg 193 | :target: https://github.com/linw1995/data_extractor 194 | 195 | .. |Code style: black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 196 | :target: https://github.com/ambv/black 197 | 198 | .. |Build Status| image:: https://github.com/linw1995/data_extractor/workflows/Lint&Test/badge.svg 199 | :target: https://github.com/linw1995/data_extractor/actions?query=workflow%3ALint%26Test 200 | 201 | .. |codecov| image:: https://codecov.io/gh/linw1995/data_extractor/branch/master/graph/badge.svg 202 | :target: https://codecov.io/gh/linw1995/data_extractor 203 | 204 | .. 
|Documentation Status| image:: https://readthedocs.org/projects/data-extractor/badge/?version=latest 205 | :target: https://data-extractor.readthedocs.io/en/latest/?badge=latest 206 | 207 | .. |PDM managed| image:: https://img.shields.io/badge/pdm-managed-blueviolet 208 | :target: https://pdm.fming.dev 209 | -------------------------------------------------------------------------------- /README.template.rst: -------------------------------------------------------------------------------- 1 | .. include:: docs/source/readme.rst 2 | -------------------------------------------------------------------------------- /data_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`data_extractor` 3 | ===================== 4 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting. 5 | """ 6 | 7 | # Local Folder 8 | from .core import ( 9 | AbstractComplexExtractor, 10 | AbstractExtractors, 11 | AbstractSimpleExtractor, 12 | ComplexExtractorMeta, 13 | ) 14 | from .exceptions import ExprError, ExtractError 15 | from .item import RV, Convertor, Field, Item 16 | from .json import ( 17 | JSONExtractor, 18 | JSONPathExtractor, 19 | JSONPathRWExtExtractor, 20 | JSONPathRWExtractor, 21 | ) 22 | from .lxml import ( 23 | AttrCSSExtractor, 24 | CSSExtractor, 25 | Element, 26 | TextCSSExtractor, 27 | XPathExtractor, 28 | ) 29 | from .utils import ( 30 | LazyStr, 31 | is_complex_extractor, 32 | is_extractor, 33 | is_simple_extractor, 34 | sentinel, 35 | ) 36 | 37 | __all__ = ( 38 | "AbstractComplexExtractor", 39 | "AbstractExtractors", 40 | "AbstractSimpleExtractor", 41 | "AttrCSSExtractor", 42 | "CSSExtractor", 43 | "ComplexExtractorMeta", 44 | "Convertor", 45 | "Element", 46 | "ExprError", 47 | "ExtractError", 48 | "Field", 49 | "Item", 50 | "JSONExtractor", 51 | "JSONPathExtractor", 52 | "JSONPathRWExtExtractor", 53 | "JSONPathRWExtractor", 54 | "LazyStr", 55 | "RV", 56 | "TextCSSExtractor", 57 | 
class RelationshipVisitor(TraverserVisitor):
    """Record which assignment targets receive a ``Field``/``Item`` call.

    While traversing a module, every assignment whose right-hand side is a
    call to a data_extractor class is stored in :attr:`relationships`.  The
    key is ``str((line, column))`` of the call expression and the value is
    the list of ``str((line, column))`` locations of its targets.

    The visitor walks arbitrary user code, so syntactic shapes it does not
    understand are skipped rather than asserted on; an ``AssertionError``
    raised here would abort the whole type-checking run.
    """

    # call-expression location -> locations of the targets it is assigned to
    relationships: Dict[str, List[str]]

    def __init__(self) -> None:
        self.relationships = {}

    def is_data_extractor_cls(self, obj: Optional[SymbolNode]) -> bool:
        """Return whether *obj* is exactly the ``Field`` or ``Item`` class."""
        return obj is not None and obj.fullname in (
            "data_extractor.item.Field",
            "data_extractor.item.Item",
        )

    def is_making_extractor_assignment_stmt(self, stmt: AssignmentStmt) -> bool:
        """Return whether *stmt* assigns the result of a ``Field``/``Item`` call.

        The callee is resolved step by step: a subscripted callee is reduced
        to its base, a variable typed as ``Type[...]`` to the wrapped type,
        and a type alias to its target.

        :param stmt: the assignment statement being visited.
        :returns: ``True`` only when the callee resolves to an instance type
            deriving from ``data_extractor.item.Field`` or to the
            ``Field``/``Item`` class itself; ``False`` for anything else,
            including callees that are not plain references.
        """
        rvalue = stmt.rvalue
        if not isinstance(rvalue, CallExpr):
            return False

        node: Union[Expression, SymbolNode, MypyType] = rvalue.callee
        if isinstance(node, IndexExpr):
            # Subscripted callee: inspect the expression being subscripted.
            logger.debug("node=%s", node)
            node = node.base

        if not isinstance(node, RefExpr):
            # The callee is not a name or attribute reference (for example
            # ``make()()`` or ``table["a"]["b"]()``), so it cannot name a
            # data_extractor class.  This used to be an assert.
            return False

        logger.debug("node=%s", node)
        symbol = node.node
        if symbol is None:
            return False
        node = symbol

        logger.debug("node=%r", node)
        if isinstance(node, Var):
            tt = node.type
            logger.debug("tt=%s", tt)
            if not isinstance(tt, TypeType):
                return False
            node = tt.item

        logger.debug("node=%r", node)
        if isinstance(node, TypeAlias):
            node = node.target

        logger.debug("node=%r", node)
        if isinstance(node, Instance):
            return node.type.has_base("data_extractor.item.Field")

        logger.debug("node=%r", node)
        if isinstance(node, TypeInfo):
            return self.is_data_extractor_cls(node)

        return False

    def locate_field_in_classdef(self, defn: ClassDef, name: str) -> str:
        """Return the location of the annotation declaring *name* in *defn*.

        Only assignment statements directly in the class body are searched.

        :param defn: class definition to search.
        :param name: attribute name to look for.
        :returns: ``str((line, column))`` of the type annotation attached to
            the first class-body assignment whose target is *name*.
        :raises ValueError: if no class-body assignment targets *name*.
        """
        for block in defn.defs.body:
            if not isinstance(block, AssignmentStmt):
                continue

            for lvalue in block.lvalues:
                if not isinstance(lvalue, NameExpr):
                    # Tuple, subscript or attribute targets in a class body
                    # cannot be the declaration searched for; skip them
                    # instead of asserting.
                    continue
                if lvalue.name == name:
                    # The returned location is that of the annotation, so
                    # the declaration is expected to carry one.
                    assert block.type is not None
                    return str((block.type.line, block.type.column))

        # pragma: no cover
        raise ValueError(f"Field name = {name!r} not exists in defn = {defn!s}")

    def _locate_lvalue(self, lvalue: Expression) -> Optional[str]:
        """Return the location string for one target, or ``None`` to skip it."""
        if isinstance(lvalue, MemberExpr):
            owner = lvalue.expr
            if not isinstance(owner, NameExpr):
                return None
            owner_node = owner.node
            if not isinstance(owner_node, TypeInfo):
                # Unresolved name, or an attribute set on something that is
                # not a class (for example ``self.attr = Field(...)``).
                return None
            return self.locate_field_in_classdef(owner_node.defn, lvalue.name)

        if isinstance(lvalue, NameExpr):
            target = lvalue.node
            if not isinstance(target, SymbolNode):
                return None
            return str((target.line, target.column))

        # Subscript, tuple and other targets are not tracked.
        return None

    def anal_assignment_stmt(self, stmt: AssignmentStmt) -> None:
        """Record the targets of *stmt* when it assigns a ``Field``/``Item`` call.

        Targets that cannot be located are skipped, and the remaining
        targets of the same statement are still processed.

        :param stmt: the assignment statement being visited.
        :raises ValueError: propagated from :meth:`locate_field_in_classdef`
            when ``Cls.attr = ...`` names an attribute that is not assigned
            in the body of ``Cls``.
        """
        logger.debug("stmt=%s", stmt)
        if not self.is_making_extractor_assignment_stmt(stmt):
            return

        rvalue_loc = str((stmt.rvalue.line, stmt.rvalue.column))
        logger.debug("stmt=%s, rloc=%r", stmt, rvalue_loc)
        for lvalue in stmt.lvalues:
            logger.debug("lvalue = %s", lvalue)
            lvalue_loc = self._locate_lvalue(lvalue)
            if not lvalue_loc:
                logger.debug("skipped lvalue = %s, stmt = %s", lvalue, stmt)
                continue

            self.relationships.setdefault(rvalue_loc, []).append(lvalue_loc)

    def visit_assignment_stmt(self, o: AssignmentStmt) -> None:
        """Analyse *o*, then continue the default traversal into it."""
        self.anal_assignment_stmt(o)
        super().visit_assignment_stmt(o)
isinstance(rv_type.args[0], UninhabitedType): 180 | return rv_type 181 | 182 | return self.apply_any_generic(type=rv_type) 183 | 184 | def apply_any_generic(self, type: Instance) -> Instance: 185 | any_type = AnyType(TypeOfAny.special_form) 186 | args = [any_type] 187 | return type.copy_modified(args=args) 188 | 189 | def check_is_many(self, ctx: FunctionContext) -> bool: 190 | is_many_idx = ctx.callee_arg_names.index("is_many") 191 | is_many_exprs = ctx.args[is_many_idx] 192 | if is_many_exprs: 193 | return is_true_literal(is_many_exprs[0]) 194 | 195 | return False 196 | 197 | def prepare_type_annotations(self, ctx: FunctionContext, fullname: str) -> MypyType: 198 | logger.debug("fullname=%r", fullname) 199 | 200 | # check parameter "is_many" 201 | expr = ctx.context 202 | assert isinstance(expr, CallExpr) 203 | 204 | callee = expr.callee 205 | if isinstance(callee, IndexExpr): 206 | callee = callee.base 207 | assert isinstance(callee, NameExpr) 208 | 209 | sym_field_class: Union[MypyType, SymbolNode, None] = callee.node 210 | if isinstance(sym_field_class, TypeAlias): 211 | sym_field_class = sym_field_class.target 212 | elif isinstance(sym_field_class, Var): 213 | typetype = sym_field_class.type 214 | assert isinstance(typetype, TypeType) 215 | sym_field_class = typetype.item 216 | 217 | if isinstance(sym_field_class, Instance): 218 | sym_field_class = sym_field_class.type 219 | 220 | assert isinstance(sym_field_class, TypeInfo) 221 | relationship = self.anal_code(self.get_current_code(ctx)) 222 | lvalue_key = str((expr.line, expr.column)) 223 | keys = [lvalue_key] 224 | if lvalue_key in relationship: 225 | keys.extend(relationship[lvalue_key]) 226 | 227 | for key in keys: 228 | logger.debug( 229 | f"lvalue_key = {lvalue_key!r}, " 230 | f"key = {key!r}, relationship = {relationship!r}" 231 | ) 232 | 233 | if self.check_is_many(ctx): 234 | sym_field_class.metadata[key] = {"is_many": True} 235 | else: 236 | sym_field_class.metadata[key] = {"is_many": False} 237 | 
238 | rv_type = self.check_field_generic_type(ctx) 239 | return rv_type 240 | 241 | def is_extractor_cls(self, fullname: str, is_item_subcls=False) -> bool: 242 | node = self.lookup_fully_qualified(fullname) 243 | if node is not None: 244 | typenode = node.node 245 | if isinstance(typenode, TypeInfo): 246 | if is_item_subcls: 247 | return typenode.has_base("data_extractor.item.Item") 248 | else: 249 | return typenode.has_base("data_extractor.item.Field") 250 | 251 | return False 252 | 253 | def get_function_hook( 254 | self, fullname: str 255 | ) -> Optional[Callable[[FunctionContext], MypyType]]: 256 | logger.debug("fullname=%r", fullname) 257 | if self.is_extractor_cls(fullname): 258 | return partial(self.prepare_type_annotations, fullname=fullname) 259 | 260 | return super().get_function_hook(fullname) 261 | 262 | def apply_is_many_on_extract_method( 263 | self, ctx: MethodSigContext, fullname: str 264 | ) -> CallableType: 265 | origin: CallableType = ctx.default_signature 266 | origin_ret_type = origin.ret_type 267 | assert isinstance(origin_ret_type, UnionType) 268 | 269 | self_class = ctx.type 270 | assert isinstance(self_class, Instance) 271 | metadata = self_class.type.metadata 272 | 273 | # in case of stmt `Field().extract(...)` 274 | key = str((ctx.type.line, ctx.type.column)) 275 | if key not in metadata: 276 | expr = ctx.context 277 | assert isinstance(expr, CallExpr) 278 | callee = expr.callee 279 | assert isinstance(callee, MemberExpr) 280 | callee_expr = callee.expr 281 | assert isinstance(callee_expr, NameExpr) 282 | obj = callee_expr.node 283 | assert isinstance(obj, Var) 284 | key = str((obj.line, obj.column)) 285 | 286 | logger.debug("fullname=%r, key=%r, metadata=%r", fullname, key, metadata) 287 | if key in metadata: 288 | is_many = metadata[key]["is_many"] 289 | ret_type = origin_ret_type.items[int(is_many)] 290 | return origin.copy_modified(ret_type=ret_type) 291 | else: 292 | api = ctx.api 293 | assert isinstance(api, TypeChecker) 294 | 
api.fail("Cant determine extract method return type", context=ctx.context) 295 | return origin 296 | 297 | def is_extract_method(self, fullname: str) -> bool: 298 | suffix = ".extract" 299 | if fullname.endswith(suffix): 300 | return self.is_extractor_cls(fullname[: -len(suffix)]) 301 | return False 302 | 303 | def apply_extract_method( 304 | self, ctx: MethodSigContext, fullname: str 305 | ) -> CallableType: 306 | rv = self.apply_is_many_on_extract_method(ctx, fullname) 307 | 308 | # apply item typeddict 309 | item_classname = fullname[: -len(".extract")] 310 | if item_classname in self.item_typeddict_mapping: 311 | logger.debug("fullname=%r, ret_type=%r", fullname, rv.ret_type) 312 | original = rv.ret_type 313 | typeddict = self.item_typeddict_mapping[item_classname] 314 | ret_type: Optional[MypyType] 315 | if isinstance(original, AnyType): # is_many=False 316 | rv = rv.copy_modified(ret_type=typeddict) 317 | else: 318 | assert isinstance(original, Instance) 319 | if original.type.name == "list": # is_many=True 320 | ret_type = original 321 | ret_type.args = (typeddict,) 322 | rv = rv.copy_modified(ret_type=ret_type) 323 | else: # pragma: no cover 324 | api = ctx.api 325 | assert isinstance(api, TypeChecker) 326 | api.fail( 327 | "Cant determine extract method return type", context=ctx.context 328 | ) 329 | ret_type = None 330 | 331 | logger.debug( 332 | "fullname=%r, rv=%r, item_typeddict_mapping=%r", 333 | fullname, 334 | rv, 335 | self.item_typeddict_mapping, 336 | ) 337 | return rv 338 | 339 | def get_method_signature_hook( 340 | self, fullname: str 341 | ) -> Optional[Callable[[MethodSigContext], FunctionLike]]: 342 | if self.is_extract_method(fullname): 343 | return partial(self.apply_extract_method, fullname=fullname) 344 | return super().get_method_signature_hook(fullname) 345 | 346 | def get_name_arg(self, call: CallExpr) -> str: 347 | name = "" 348 | try: 349 | idx = call.arg_names.index("name") 350 | arg = call.args[idx] 351 | assert isinstance(arg, 
StrExpr) 352 | name = arg.value 353 | except ValueError: 354 | pass 355 | return name 356 | 357 | def prepare_typeddict(self, ctx: DynamicClassDefContext, fullname: str) -> None: 358 | logger.debug("fullname=%r", fullname) 359 | if fullname in self.item_typeddict_mapping: 360 | return 361 | 362 | api = ctx.api 363 | assert isinstance(api, SemanticAnalyzerInterface) 364 | analyzer = TypedDictAnalyzer(api.options, api, api.msg) # type: ignore 365 | 366 | items: List[str] = [] 367 | types: List[MypyType] = [] 368 | callee = ctx.call.callee 369 | assert isinstance(callee, NameExpr) 370 | node = callee.node 371 | assert isinstance(node, TypeInfo) 372 | for block in node.defn.defs.body: 373 | if not isinstance(block, AssignmentStmt): 374 | continue 375 | 376 | rvalue = block.rvalue 377 | if not isinstance(rvalue, CallExpr): 378 | continue 379 | 380 | param_name = self.get_name_arg(rvalue) 381 | logger.debug("param_name = %r from rvalue = %s", param_name, rvalue) 382 | 383 | rvalue_type: MypyType 384 | callee = rvalue.callee 385 | if isinstance(callee, IndexExpr): 386 | index = callee.index 387 | assert isinstance(index, NameExpr) 388 | name = index.fullname 389 | assert name is not None 390 | named_type = api.named_type_or_none(name, []) 391 | assert named_type is not None 392 | rvalue_type = named_type 393 | else: 394 | rvalue_type = AnyType(TypeOfAny.special_form) 395 | 396 | if param_name: 397 | items.append(param_name) 398 | types.append(rvalue_type) 399 | else: 400 | for lvalue in block.lvalues: 401 | assert isinstance(lvalue, NameExpr) 402 | items.append(lvalue.name) 403 | types.append(rvalue_type) 404 | 405 | callee = ctx.call.callee 406 | assert isinstance(callee, NameExpr) 407 | typeinfo = analyzer.build_typeddict_typeinfo( 408 | callee.name, 409 | items, 410 | types, 411 | set(items), 412 | -1, 413 | None, 414 | ) 415 | assert typeinfo.typeddict_type is not None 416 | self.item_typeddict_mapping[fullname] = typeinfo.typeddict_type 417 | logger.debug( 418 | 
"fullname=%r, item_typeddict_mapping=%r", 419 | fullname, 420 | self.item_typeddict_mapping, 421 | ) 422 | 423 | def get_dynamic_class_hook( 424 | self, fullname: str 425 | ) -> Optional[Callable[[DynamicClassDefContext], None]]: 426 | logger.debug("fullname=%r", fullname) 427 | if self.is_extractor_cls(fullname, is_item_subcls=True): 428 | return partial(self.prepare_typeddict, fullname=fullname) 429 | 430 | return super().get_dynamic_class_hook(fullname) 431 | 432 | 433 | def plugin(version: str) -> Type[Plugin]: 434 | return DataExtractorPlugin 435 | -------------------------------------------------------------------------------- /data_extractor/core.py: -------------------------------------------------------------------------------- 1 | """ 2 | ===================================== 3 | :mod:`core` -- Abstract Base Classes. 4 | ===================================== 5 | """ 6 | 7 | # Standard Library 8 | import ast 9 | import inspect 10 | 11 | from abc import abstractmethod 12 | from collections import namedtuple 13 | from types import FrameType, FunctionType, MethodType 14 | from typing import Any, Dict, Optional, Tuple, Union 15 | 16 | # Local Folder 17 | from .utils import Property, getframe, sentinel 18 | 19 | _LineInfo = namedtuple("_LineInfo", ["file", "lineno", "offset", "line"]) 20 | 21 | 22 | def _find_line_info_of_attr_in_source( 23 | frame: Optional[FrameType], key: str, attr: "AbstractComplexExtractor" 24 | ) -> _LineInfo: 25 | if frame is None: 26 | return _LineInfo(None, None, None, f"{key}={attr!r}") 27 | 28 | file = frame.f_code.co_filename 29 | firstlineno = frame.f_lineno 30 | firstline_idx = firstlineno - 1 31 | try: 32 | lines, _ = inspect.findsource(frame) 33 | except OSError: 34 | # can't get the source code from python repl 35 | return _LineInfo(None, None, None, f"{key}={attr!r}") 36 | 37 | start_index = inspect.indentsize(lines[firstline_idx]) 38 | for lineno, line in enumerate(lines[firstline_idx + 1 :], start=firstlineno + 1): 39 | # 
iterate line in the code block body 40 | cur_index = inspect.indentsize(line) 41 | if cur_index <= start_index: 42 | # reach end of the code block, 43 | # use code block firstlineno as SyntaxError.lineno 44 | line = lines[firstline_idx] 45 | lineno = firstlineno 46 | break 47 | 48 | if line.lstrip().startswith(key): 49 | # find the line as SyntaxError.text 50 | break 51 | 52 | else: 53 | # reach EOF, 54 | # use code block firstlineno as SyntaxError.lineno 55 | line = lines[firstline_idx] 56 | lineno = firstlineno 57 | 58 | offset = inspect.indentsize(line) 59 | line = line.strip() 60 | return _LineInfo(file, lineno, offset, line) 61 | 62 | 63 | def _check_field_overwrites_bases_property( 64 | cls: object, 65 | name: str, 66 | bases: Tuple[object], 67 | key: str, 68 | attr: "AbstractComplexExtractor", 69 | ) -> None: 70 | attr_from_bases = getattr(bases[-1], key, None) 71 | if isinstance(attr_from_bases, Property) or key == "_field_names": 72 | # Item's attribute overwrites its property. 73 | frame = getframe(2) 74 | exc_args = _find_line_info_of_attr_in_source(frame, key, attr) 75 | *_, line = exc_args 76 | err_msg = ( 77 | f"{line!r} overwriten " 78 | f"the property {key!r} of {name}. " 79 | f"Please using the optional parameter name={key!r} " 80 | f"in {attr!r} to avoid overwriting property." 81 | ) 82 | raise SyntaxError(err_msg, exc_args) 83 | 84 | 85 | def _check_field_overwrites_bases_method( 86 | cls: object, 87 | name: str, 88 | bases: Tuple[object], 89 | key: str, 90 | attr: "AbstractComplexExtractor", 91 | ) -> None: 92 | attr_from_bases = getattr(bases[-1], key, None) 93 | if isinstance(attr_from_bases, (FunctionType, MethodType)): 94 | # Item's attribute overwrites its class bases' method. 95 | frame = getframe(2) 96 | exc_args = _find_line_info_of_attr_in_source(frame, key, attr) 97 | *_, line = exc_args 98 | err_msg = ( 99 | f"{line!r} overwriten " 100 | f"the method {key!r} of {name!r}. 
" 101 | f"Please using the optional parameter name={key!r} " 102 | f"in {attr!r} to avoid overwriting method." 103 | ) 104 | raise SyntaxError(err_msg, exc_args) 105 | 106 | 107 | def _check_field_overwrites_method(cls: object) -> None: 108 | frame = getframe(2) 109 | if frame is None: 110 | return 111 | 112 | filename = frame.f_code.co_filename 113 | firstlineno = frame.f_lineno 114 | try: 115 | lines, _ = inspect.findsource(frame) 116 | except OSError: 117 | # can't get the source code from python repl 118 | return 119 | 120 | source = "".join(lines) 121 | mod = ast.parse(source) 122 | for node in ast.walk(mod): 123 | if isinstance(node, (ast.ClassDef, ast.Call)) and node.lineno == firstlineno: 124 | item_node = node 125 | break 126 | else: # pragma: no cover 127 | assert 0, f"Can't find the source of {cls}." 128 | 129 | if isinstance(item_node, ast.Call): 130 | # There is no point to check if field overwrites method, 131 | # due to item is created by `type` function. 132 | return 133 | 134 | assigns: Dict[str, ast.Assign] = {} 135 | methods: Dict[str, ast.FunctionDef] = {} 136 | for node in item_node.body: 137 | if isinstance(node, ast.Assign): 138 | for target_ in node.targets: 139 | if not isinstance(target_, ast.Name): 140 | continue 141 | 142 | assigns[target_.id] = node 143 | elif isinstance(node, ast.FunctionDef): 144 | methods[node.name] = node 145 | 146 | unions = assigns.keys() & methods.keys() 147 | if not unions: 148 | return 149 | 150 | key = next(iter(unions)) 151 | assign = assigns[key] 152 | method = methods[key] 153 | if assign.lineno > method.lineno: 154 | lineno = assign.lineno 155 | offset = assign.col_offset 156 | line = lines[lineno - 1].strip() 157 | 158 | msg = ( 159 | f"method {lines[method.lineno - 1].strip()!r} " 160 | f"on lineno={method.lineno} " 161 | f"overwrited by assign {line!r}. " 162 | f"Please using the optional parameter name={key!r} " 163 | f"in {line!r} to avoid overwriting." 
class SimpleExtractorMeta(type):
    """
    Simple Extractor Meta Class.
    """


class ComplexExtractorMeta(SimpleExtractorMeta):
    """
    Complex Extractor Meta Class.

    On class creation it validates that no field overwrites a method or
    property, and stores the names of all field attributes (inherited and
    newly declared) in ``cls._field_names``.
    """

    def __init__(
        cls,  # noqa: B902
        name: str,
        bases: Tuple[type],
        attr_dict: Dict[str, Any],
    ):
        super().__init__(name, bases, attr_dict)

        # names declared in this class body, in declaration order
        declared = []
        for key, attr in attr_dict.items():
            if isinstance(type(attr), ComplexExtractorMeta):
                # can't using data_extractor.utils.is_complex_extractor here,
                # because AbstractComplexExtractor which being used in it
                # bases on ComplexExtractorMeta.
                _check_field_overwrites_bases_method(cls, name, bases, key, attr)
                _check_field_overwrites_bases_property(cls, name, bases, key, attr)

                declared.append(key)

        # check field overwrites method
        _check_field_overwrites_method(cls)

        # A dict is used as an ordered set: inherited names first, then the
        # newly declared ones. A plain set made the order of `_field_names`
        # depend on string hashing, so it could differ between interpreter runs.
        ordered: Dict[str, None] = dict.fromkeys(getattr(cls, "_field_names", []))
        ordered.update(dict.fromkeys(declared))

        field_names = []
        for key in ordered:
            attr = getattr(cls, key, None)
            # drop names that no longer resolve to a complex extractor,
            # e.g. an inherited field overridden with a plain value
            if attr and isinstance(type(attr), ComplexExtractorMeta):
                field_names.append(key)

        cls._field_names: Tuple[str, ...] = tuple(field_names)
class AbstractComplexExtractor(metaclass=ComplexExtractorMeta):
    """
    Abstract Complex Extractor Class.

    Its metaclass is :class:`data_extractor.core.ComplexExtractorMeta`
    """

    # NOTE(review): the metaclass derives from ``type``, not ``abc.ABCMeta``, so
    # ``@abstractmethod`` presumably does not block instantiation here; a
    # subclass that forgets to override ``extract`` fails only when it is called.
    @abstractmethod
    def extract(self, element: Any) -> Any:
        """
        Extract the wanted data.

        :param element: The target data node element.
        :type element: Any

        :returns: Data or subelement.
        :rtype: Any

        :raises ~data_extractor.exceptions.ExtractError: \
            Thrown by extractor extracting wrong data.
        """
        raise NotImplementedError
class Field(Generic[RV], AbstractComplexExtractor):
    """
    Extract data by cooperating with extractor.

    :param extractor: The object for data extracting
    :type extractor: :class:`data_extractor.core.AbstractSimpleExtractor`
    :param name: Optional parameter for special field name.
    :type name: str, optional
    :param default: Default value when not found. \
        Default: :data:`data_extractor.utils.sentinel`.
    :type default: Any
    :param is_many: Indicate the data which extractor extracting is more than one.
    :type is_many: bool
    :param type: Callable applied to each extracted value \
        when no ``convertor`` is given.
    :type type: type, optional
    :param convertor: Callable applied to each extracted value; \
        takes precedence over ``type``.
    :type convertor: Callable, optional

    :raises ValueError: Invalid SimpleExtractor.
    :raises ValueError: Can't both set default and is_many=True.
    """

    extractor = Property[Optional[AbstractSimpleExtractor]]()
    name = Property[Optional[str]]()
    default = Property[Any]()
    is_many = Property[bool]()

    type = Property[Optional[Type[RV]]]()
    convertor = Property[Optional[Convertor[RV]]]()

    def __init__(
        self,
        extractor: Optional[AbstractSimpleExtractor] = None,
        name: Optional[str] = None,
        default: Any = sentinel,
        is_many: bool = False,
        type: Optional[Type[RV]] = None,
        convertor: Optional[Convertor[RV]] = None,
    ):
        super().__init__()

        if extractor is not None and not is_simple_extractor(extractor):
            raise ValueError(f"Invalid SimpleExtractor: {extractor!r}")

        if default is not sentinel and is_many:
            raise ValueError(f"Can't both set default={default} and is_many=True")

        self.extractor = extractor
        self.name = name
        self.default = default
        self.is_many = is_many
        self.type = type
        self.convertor = convertor

    def __class_getitem__(cls, rv_type: Type[RV]):
        """
        Support ``Field[T]``.

        Subscripting with the unbound type variable returns ``cls`` itself.
        Any other ``rv_type`` returns a new subclass (same name as ``cls``)
        whose ``__init__`` falls back to ``rv_type`` when ``type`` is not given.
        """

        def new_init(
            self,
            extractor: Optional[AbstractSimpleExtractor] = None,
            name: Optional[str] = None,
            default: Any = sentinel,
            is_many: bool = False,
            type: Optional[Type[RV]] = None,
            convertor: Optional[Convertor[RV]] = None,
        ):
            cls.__init__(
                self,
                extractor=extractor,
                name=name,
                default=default,
                is_many=is_many,
                type=type or rv_type,
                convertor=convertor,
            )

        if rv_type is RV:  # type: ignore
            # it is a type-unbound container class
            return cls
        else:
            return type(cls.__name__, (cls,), {"__init__": new_init})

    def __repr__(self) -> str:
        args = [f"{self.extractor!r}"]
        if self.name is not None:
            args.append(f"name={self.name!r}")

        if self.default is not sentinel:
            args.append(f"default={self.default!r}")

        if self.is_many:
            args.append(f"is_many={self.is_many!r}")

        return f"{self.__class__.__name__}({', '.join(args)})"

    def extract(self, element: Any) -> Union[RV, List[RV]]:
        """
        Extract and convert data from ``element``.

        Without an extractor, ``element`` itself is the extraction result
        (wrapped into a list unless it already is one).

        :param element: The target data node element.
        :type element: Any

        :returns: A list of converted values when ``is_many`` is set; otherwise
            the first converted value, or ``default`` when nothing was found.
        :rtype: Any

        :raises ~data_extractor.exceptions.ExtractError: \
            Nothing was found and no default value was given.
        """
        if self.extractor is None:
            if isinstance(element, list):
                rv = element
            else:
                rv = [element]
        else:
            rv = self.extractor.extract(element)

        if self.is_many:
            return [self._extract(r) for r in rv]

        if not rv:
            if self.default is sentinel:
                raise ExtractError(self, element)

            return self.default

        return self._extract(rv[0])

    def _extract(self, element: Any) -> RV:
        """
        Convert one extracted value: ``convertor`` wins over ``type``; with
        neither, the value is returned unchanged.
        """
        if self.convertor is not None:
            return self.convertor(element)
        else:
            cls = self.type
            if cls is not None and callable(cls):
                # TODO: inspect function signature for supporting better conversion
                return cls(element)  # type: ignore
            else:
                return element

    def __deepcopy__(self, memo: Dict[int, Any]) -> AbstractComplexExtractor:
        """
        Deep-copy the field while keeping ``default`` identical to the shared
        sentinel object when it was the sentinel.

        The generic :func:`copy.deepcopy` machinery is reused by temporarily
        shadowing this method with ``None`` on the instance.
        """
        self.__deepcopy__ = None  # type: ignore
        try:
            cp = copy.deepcopy(self, memo)
        finally:
            # Always drop the shadow, even when copying fails, so the instance
            # goes back to using the class-level method.
            del self.__deepcopy__

        # The shadow lives in the instance state and was therefore copied as
        # well; remove it, otherwise deep-copying the copy would silently skip
        # this method and duplicate the sentinel.
        vars(cp).pop("__deepcopy__", None)

        # avoid duplicating the sentinel object.
        if self.default is sentinel:
            Property.change_internal_value(cp, "default", sentinel)

        return cp
227 | duplicated = copy.deepcopy(self) 228 | # set for fixing in SimpeExtractor.extract method signature 229 | Property.change_internal_value(duplicated, "is_many", True) 230 | 231 | def extract(self: AbstractSimpleExtractor, element: Any) -> List[RV]: 232 | return duplicated.extract(element) # type: ignore 233 | 234 | def getter(self: AbstractSimpleExtractor, name: str) -> Any: 235 | if ( 236 | name not in ("extract", "extract_first") 237 | and not name.startswith("__") 238 | and hasattr(duplicated.extractor, name) 239 | ): 240 | return getattr(duplicated.extractor, name) 241 | return super(type(self), self).__getattribute__(name) 242 | 243 | classname = f"{type(duplicated).__name__}Simplified" 244 | base = AbstractSimpleExtractor 245 | if duplicated.extractor is not None: 246 | base = type(duplicated.extractor) 247 | 248 | new_cls = type( 249 | classname, 250 | (base,), 251 | { 252 | "extract": extract, 253 | "__getattribute__": getter, 254 | }, 255 | ) 256 | # wrapper class no needs for initialization 257 | obj: AbstractSimpleExtractor = base.__new__(new_cls) 258 | if not hasattr(obj, "expr"): 259 | # handle case of Item with extractor=None. 260 | # and its expr property will raise AttributeError, 261 | # so hasattr return False 262 | obj.expr = "" # set to avoid class.__repr__ raising AttributeError 263 | 264 | return obj 265 | 266 | 267 | __all__ = ("Field", "Item", "RV", "Convertor") 268 | -------------------------------------------------------------------------------- /data_extractor/json.py: -------------------------------------------------------------------------------- 1 | """ 2 | =================================================== 3 | :mod:`json` -- Extractors for JSON data extracting. 
# Standard Library
from typing import TYPE_CHECKING, Any, Dict, Optional, Type

# Local Folder
from .core import AbstractSimpleExtractor
from .exceptions import ExprError
from .utils import Property, _missing_dependency

# Renamed backend classes created by ``JSONExtractor.__new__``, keyed by the
# backend class they derive from. Without this cache every
# ``JSONExtractor(expr)`` call would build a brand-new class object.
_renamed_backends: Dict[type, type] = {}


class JSONExtractor(AbstractSimpleExtractor):
    """
    Use JSONPath expression implemented by **jsonpath-extractor**,
    **jsonpath-rw** or **jsonpath-rw-ext** packages for JSON data extracting.
    Change **json_extractor_backend** value to indicate which package to use.

    >>> import data_extractor.json
    >>> from data_extractor.json import JSONPathExtractor
    >>> data_extractor.json.json_extractor_backend = JSONPathExtractor

    Before extracting, should parse the JSON text into Python object.

    :param expr: JSONPath Expression.
    :type expr: str

    :raises RuntimeError: No JSONPath backend is available.
    """

    def __new__(
        cls: Type["JSONExtractor"], *args: Any, **kwargs: Any
    ) -> "JSONExtractor":
        if json_extractor_backend is None:
            raise RuntimeError(
                "'jsonpath-extractor', 'jsonpath-rw' or 'jsonpath-rw-ext' "
                "package is needed, run pip to install it. "
            )

        target: type
        if cls is JSONExtractor:
            # Instantiating the facade itself: create the object from a
            # subclass of the selected backend that is renamed into
            # "JSONExtractor". The renamed class is built once per backend
            # and reused, so changing ``json_extractor_backend`` at runtime
            # still takes effect.
            backend = json_extractor_backend
            try:
                target = _renamed_backends[backend]
            except KeyError:
                target = type("JSONExtractor", (backend,), {})
                _renamed_backends[backend] = target
        else:
            # invoke subclasses directly
            target = cls

        obj: JSONExtractor = super(AbstractSimpleExtractor, cls).__new__(target)
        return obj

    def extract(self, element: Any) -> Any:
        """
        Extract data from JSON data. Implemented by the backend classes.

        :param element: Python object parsed from JSON text.
        :type element: Any

        :raises NotImplementedError: Always, on this base class.
        """
        raise NotImplementedError


try:
    # Third Party Library
    import jsonpath_rw

    _missing_jsonpath_rw = False
except ImportError:
    _missing_jsonpath_rw = True


class JSONPathRWExtractor(JSONExtractor):
    """
    Use JSONPath expression implemented by **jsonpath-rw** package
    for JSON data extracting.

    Before extracting, should parse the JSON text into Python object.

    :param expr: JSONPath Expression.
    :type expr: str

    :raises RuntimeError: The **jsonpath-rw** package is not installed.
    :raises data_extractor.exceptions.ExprError: The expression fails to parse.
    """

    if TYPE_CHECKING:
        # Third Party Library
        from jsonpath_rw import JSONPath

    _jsonpath = Property["JSONPath"]()

    def __init__(self, expr: str) -> None:
        # Deliberately starts the lookup after JSONExtractor in the MRO.
        super(JSONExtractor, self).__init__(expr)
        if _missing_jsonpath_rw:
            _missing_dependency("jsonpath-rw")

        try:
            self._jsonpath = jsonpath_rw.parse(self.expr)
        except Exception as exc:
            # jsonpath_rw.parser.JsonPathParser.p_error raises exc of Exception
            # type; the lexer's JsonPathLexerError is an Exception subclass,
            # so this single clause covers both.
            raise ExprError(extractor=self, exc=exc) from exc

    def extract(self, element: Any) -> Any:
        """
        Extract data from JSON data.

        :param element: Python object parsed from JSON text.
        :type element: Any

        :returns: Data, the ``value`` of every match found.
        :rtype: Any
        """
        return [m.value for m in self._jsonpath.find(element)]


try:
    # Third Party Library
    import jsonpath_rw_ext

    _missing_jsonpath_rw_ext = False
except ImportError:
    _missing_jsonpath_rw_ext = True


class JSONPathRWExtExtractor(JSONPathRWExtractor):
    """
    Use JSONPath expression implemented by **jsonpath-rw-ext** package
    for JSON data extracting.

    Before extracting, should parse the JSON text into Python object.

    :param expr: JSONPath Expression.
    :type expr: str

    :raises RuntimeError: The **jsonpath-rw-ext** package is not installed.
    :raises data_extractor.exceptions.ExprError: The expression fails to parse.
    """

    if TYPE_CHECKING:
        # Third Party Library
        from jsonpath_rw_ext import JSONPath as JSONPathExt

    _jsonpath = Property["JSONPathExt"]()

    def __init__(self, expr: str) -> None:
        # Skips JSONPathRWExtractor.__init__ on purpose: the expression is
        # parsed below by jsonpath_rw_ext instead of jsonpath_rw.
        super(JSONExtractor, self).__init__(expr)
        if _missing_jsonpath_rw_ext:
            _missing_dependency("jsonpath-rw-ext")

        try:
            self._jsonpath = jsonpath_rw_ext.parse(self.expr)
        except Exception as exc:
            # jsonpath_rw.parser.JsonPathParser.p_error raises exc of Exception
            # type; the lexer's JsonPathLexerError is an Exception subclass,
            # so this single clause covers both.
            raise ExprError(extractor=self, exc=exc) from exc


try:
    # Third Party Library
    import jsonpath

    _missing_jsonpath = False
except ImportError:
    _missing_jsonpath = True


class JSONPathExtractor(JSONExtractor):
    """
    Use JSONPath expression implemented by **jsonpath-extractor** package
    for JSON data extracting.

    Before extracting, should parse the JSON text into Python object.

    :param expr: JSONPath Expression.
    :type expr: str

    :raises RuntimeError: The **jsonpath-extractor** package is not installed.
    :raises data_extractor.exceptions.ExprError: The expression fails to parse.
    """

    if TYPE_CHECKING:
        # Third Party Library
        from jsonpath import Expr

    _jsonpath = Property["Expr"]()

    def __init__(self, expr: str) -> None:
        # Deliberately starts the lookup after JSONExtractor in the MRO.
        super(JSONExtractor, self).__init__(expr)

        if _missing_jsonpath:
            _missing_dependency("jsonpath-extractor")

        try:
            self._jsonpath = jsonpath.parse(self.expr)
        except SyntaxError as exc:
            raise ExprError(extractor=self, exc=exc) from exc

    def extract(self, element: Any) -> Any:
        """
        Extract data from JSON data.

        :param element: Python object parsed from JSON text.
        :type element: Any

        :returns: Data.
        :rtype: Any
        """
        return self._jsonpath.find(element)


# Default backend: the first installed package wins, in this order of
# preference: jsonpath-extractor, jsonpath-rw-ext, jsonpath-rw.
json_extractor_backend: Optional[Type[JSONExtractor]] = None
if not _missing_jsonpath:
    json_extractor_backend = JSONPathExtractor
elif not _missing_jsonpath_rw_ext:
    json_extractor_backend = JSONPathRWExtExtractor
elif not _missing_jsonpath_rw:
    json_extractor_backend = JSONPathRWExtractor


__all__ = (
    "JSONExtractor",
    "JSONPathExtractor",
    "JSONPathRWExtExtractor",
    "JSONPathRWExtractor",
    "json_extractor_backend",
)
# Standard Library
from typing import List, Union

# Local Folder
from .core import AbstractSimpleExtractor
from .exceptions import ExprError
from .utils import Property, _missing_dependency

try:
    # Third Party Library
    from lxml.etree import XPath, XPathSyntaxError
    from lxml.etree import _Element as Element

    _missing_lxml = False
except ImportError:
    _missing_lxml = True

    Element = None  # TODO: Find a way to get rid of this. See PEP 562


class XPathExtractor(AbstractSimpleExtractor):
    """
    Simple extractor that evaluates an XPath expression on XML or HTML data.

    The XML or HTML text has to be parsed into a
    :class:`data_extractor.lxml.Element` object before extracting.

    :param expr: XPath Expression.
    :type expr: str

    :raises RuntimeError: The **lxml** package is not installed.
    :raises data_extractor.exceptions.ExprError: XPath Expression Error.
    """

    _find = Property["XPath"]()

    def __init__(self, expr: str):
        super().__init__(expr)

        if _missing_lxml:
            _missing_dependency("lxml")

        try:
            compiled = XPath(self.expr)
        except XPathSyntaxError as exc:
            raise ExprError(extractor=self, exc=exc) from exc

        self._find = compiled

    def extract(self, element: Element) -> Union[List[Element], List[str]]:
        """
        Evaluate the compiled XPath against *element*.

        :param element: Target.
        :type element: :class:`data_extractor.lxml.Element`

        :returns: The evaluation result; a result that is not already
            a list is wrapped into a one-item list.
        :rtype: list

        :raises data_extractor.exceptions.ExprError: XPath Expression Error.
        """
        # Third Party Library
        from lxml.etree import XPathEvalError

        try:
            found = self._find(element)
        except XPathEvalError as exc:
            raise ExprError(extractor=self, exc=exc) from exc

        return found if isinstance(found, list) else [found]


try:
    # Third Party Library
    import cssselect

    del cssselect
    _missing_cssselect = False
except ImportError:
    _missing_cssselect = True


class CSSExtractor(AbstractSimpleExtractor):
    """
    Simple extractor that selects subelements of XML or HTML data
    with a CSS Selector.

    The XML or HTML text has to be parsed into a
    :class:`data_extractor.lxml.Element` object before extracting.

    :param expr: CSS Selector Expression.
    :type expr: str

    :raises RuntimeError: The **cssselect** package is not installed.
    :raises data_extractor.exceptions.ExprError: CSS Selector Expression Error.
    """

    _extractor = Property[XPathExtractor]()

    def __init__(self, expr: str):
        super().__init__(expr)

        if _missing_cssselect:
            _missing_dependency("cssselect")

        # Third Party Library
        from cssselect import GenericTranslator
        from cssselect.parser import SelectorError

        translator = GenericTranslator()
        try:
            translated = translator.css_to_xpath(self.expr)
        except SelectorError as exc:
            raise ExprError(extractor=self, exc=exc) from exc

        # The selector is translated once; extraction is delegated to XPath.
        self._extractor = XPathExtractor(translated)

    def extract(self, element: Element) -> List[Element]:
        """
        Select the subelements of *element* that match the CSS Selector.

        :param element: Target.
        :type element: :class:`data_extractor.lxml.Element`

        :returns: List of :class:`data_extractor.lxml.Element` objects,
            extracted result.
        :rtype: list
        """
        delegate = self._extractor
        return delegate.extract(element)


class TextCSSExtractor(CSSExtractor):
    """
    Simple extractor that returns the text of the subelements
    selected by a CSS Selector.

    The XML or HTML text has to be parsed into a
    :class:`data_extractor.lxml.Element` object before extracting.

    :param expr: CSS Selector Expression.
    :type expr: str
    """

    def extract(self, element: Element) -> List[str]:
        """
        Collect the ``text`` of every subelement matched in *element*.

        :param element: Target.
        :type element: :class:`data_extractor.lxml.Element`

        :returns: List of str, extracted result.
        :rtype: list
        """
        matched = super().extract(element)
        texts = []
        for node in matched:
            texts.append(node.text)
        return texts


class AttrCSSExtractor(CSSExtractor):
    """
    Simple extractor that returns one attribute's value of the subelements
    selected by a CSS Selector.

    The XML or HTML text has to be parsed into a
    :class:`data_extractor.lxml.Element` object before extracting.

    :param expr: CSS Selector Expression.
    :type expr: str
    :param attr: Target attribute name.
    :type attr: str
    """

    attr = Property[str]()

    def __init__(self, expr: str, attr: str):
        super().__init__(expr)
        self.attr = attr

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(expr={self.expr!r}, attr={self.attr!r})"

    def extract(self, element: Element) -> List[str]:
        """
        Collect the target attribute's value of every matched subelement.

        Subelements that do not carry the attribute are left out.

        :param element: Target.
        :type element: :class:`data_extractor.lxml.Element`

        :returns: List of str, extracted result.
        :rtype: list
        """
        values = []
        for node in super().extract(element):
            if self.attr not in node.keys():
                continue
            values.append(node.get(self.attr))
        return values
# Standard Library
import inspect

from types import FrameType
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    NoReturn,
    Optional,
    Type,
    TypeVar,
    Union,
    overload,
)


class __Sentinel:
    """Singleton."""

    def __repr__(self) -> str:
        return "sentinel"


sentinel = __Sentinel()


class LazyStr:
    """
    Lazy String.

    The wrapped function runs every time the object is converted by
    :class:`str`, not when the object is created.

    :param func: Lazy __str__ function.
    """

    def __init__(self, func: Callable[[], str]):
        self.func = func

    def __str__(self) -> str:
        return self.func()


def is_extractor(obj: Any) -> bool:
    """
    Determine the object if it is an extractor, return :obj:`True` if it is.
    """
    # Local Folder
    from .core import AbstractComplexExtractor, AbstractSimpleExtractor

    return isinstance(obj, (AbstractComplexExtractor, AbstractSimpleExtractor))


def is_simple_extractor(obj: Any) -> bool:
    """
    Determine the object if it is a simple extractor, return :obj:`True` if it is.
    """
    # Local Folder
    from .core import AbstractSimpleExtractor

    return isinstance(obj, AbstractSimpleExtractor)


def is_complex_extractor(obj: Any) -> bool:
    """
    Determine the object if it is a complex extractor, return :obj:`True` if it is.
    """
    # Local Folder
    from .core import AbstractComplexExtractor

    return isinstance(obj, AbstractComplexExtractor)


def getframe(depth: int = 0) -> Optional[FrameType]:
    """
    Return a frame of the call stack, counted from the caller.

    :param depth: ``0`` for the caller's frame, ``1`` for the caller's
        caller, and so on.
    :type depth: int

    :returns: The frame, or :obj:`None` if the implementation has no
        Python stack frame support or the walk stops right after
        the outermost frame.
    :rtype: Optional[FrameType]

    :raises ValueError: The walk runs out of frames before reaching *depth*.
    """
    cur = frame = inspect.currentframe()
    if frame is None:
        # If running in an implementation without Python stack frame support,
        return None

    # Count down on a copy so the error message reports the depth
    # that was actually requested.
    remaining = depth
    while remaining > -1:
        if cur is None:
            raise ValueError(f"Invalid depth = {depth!r} for frame = {frame!r}")

        cur = cur.f_back
        remaining -= 1

    return cur


T = TypeVar("T")

if TYPE_CHECKING:
    # Local Folder
    from .core import AbstractExtractors


class Property(Generic[T]):
    """
    Extractor property.

    A descriptor that stores its value on the instance under a private name.
    The value can be assigned once through normal attribute assignment;
    assigning again raises :class:`AttributeError`.
    Use :meth:`change_internal_value` to overwrite it deliberately.
    """

    def __set_name__(self, owner: Any, name: str) -> None:
        """
        Customized names -- Descriptor HowTo Guide
        https://docs.python.org/3/howto/descriptor.html#customized-names
        """
        self.public_name = name
        self.private_name = f"__property_{name}"

    @overload
    def __get__(self, obj: None, cls: Type["AbstractExtractors"]) -> "Property[T]":
        pass

    @overload
    def __get__(self, obj: Any, cls: Type["AbstractExtractors"]) -> T:
        pass

    def __get__(
        self, obj: Any, cls: Type["AbstractExtractors"]
    ) -> Union["Property[T]", T]:
        if obj is None:
            # Accessed on the class: hand back the descriptor itself.
            return self

        try:
            return getattr(obj, self.private_name)
        except AttributeError as exc:
            # raise right AttributeError: report the public name, and drop
            # the private-name error from the traceback context.
            msg: str = exc.args[0]
            raise AttributeError(
                msg.replace(self.private_name, self.public_name)
            ) from None

    def __set__(self, obj: Any, value: T) -> T:
        if hasattr(obj, self.private_name):
            # Already assigned once: the property is immutable from outside.
            raise AttributeError("can't set attribute")
        else:
            setattr(obj, self.private_name, value)
            return value

    @staticmethod
    def change_internal_value(
        obj: "AbstractExtractors", property_name: str, value: T
    ) -> None:
        """
        Overwrite the stored value of a property, bypassing the
        assign-once check of ``__set__``.

        :param obj: The object owning the property.
        :param property_name: Public name of the property.
        :type property_name: str
        :param value: New value.

        :raises AttributeError: The attribute is missing on the type of
            *obj*, or is not a :class:`Property`.
        """
        attr = getattr(type(obj), property_name)
        if not isinstance(attr, Property):
            raise AttributeError(f"Type of attribute {property_name!r} is not Property")

        setattr(obj, attr.private_name, value)


def _missing_dependency(dependency: str) -> NoReturn:
    """
    Raise :class:`RuntimeError` for the extractor class
    that is missing its optional dependency.

    :param dependency: Name of the package that needs to be installed.
    :type dependency: str

    :raises RuntimeError: Always.
    """
    raise RuntimeError(f"{dependency!r} package is needed, run pip to install it. ")
") 158 | 159 | 160 | __all__ = ( 161 | "LazyStr", 162 | "Property", 163 | "getframe", 164 | "is_complex_extractor", 165 | "is_extractor", 166 | "is_simple_extractor", 167 | "sentinel", 168 | ) 169 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | dream2nix, 4 | ... 5 | }: { 6 | imports = [ 7 | dream2nix.modules.dream2nix.WIP-python-pdm 8 | ]; 9 | 10 | mkDerivation = { 11 | src = lib.cleanSourceWith { 12 | src = lib.cleanSource ./.; 13 | filter = name: type: 14 | !(builtins.any (x: x) [ 15 | (lib.hasSuffix ".nix" name) 16 | (lib.hasPrefix "." (builtins.baseNameOf name)) 17 | (lib.hasSuffix "flake.lock" name) 18 | ]); 19 | }; 20 | }; 21 | 22 | pdm.lockfile = ./pdm.lock; 23 | pdm.pyproject = ./pyproject.toml; 24 | 25 | buildPythonPackage = { 26 | pythonImportsCheck = [ 27 | "data_extractor" 28 | ]; 29 | }; 30 | 31 | pdm.editables = lib.mkForce {}; 32 | } 33 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | div.sphinxsidebar { 2 | width: 250px; 3 | } 4 | 5 | dl.class{ 6 | margin-bottom: 1rem; 7 | } 8 | 9 | dl.method, dl.field-list { 10 | margin-bottom: 0.5rem; 11 | } 12 | 13 | pre { 14 | padding: 7px 15px; 15 | } 16 | -------------------------------------------------------------------------------- /docs/source/api_core.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: data_extractor.core 2 | 3 | .. autoclass:: data_extractor.core.SimpleExtractorMeta 4 | 5 | .. autoclass:: data_extractor.core.ComplexExtractorMeta 6 | 7 | .. 
autoclass:: data_extractor.core.AbstractSimpleExtractor 8 | :members: 9 | 10 | .. autoclass:: data_extractor.core.AbstractComplexExtractor 11 | :members: 12 | -------------------------------------------------------------------------------- /docs/source/api_exceptions.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: data_extractor.exceptions 2 | 3 | .. autoexception:: data_extractor.exceptions.ExprError 4 | 5 | .. autoexception:: data_extractor.exceptions.ExtractError 6 | -------------------------------------------------------------------------------- /docs/source/api_item.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: data_extractor.item 2 | 3 | .. autoclass:: data_extractor.item.Field 4 | :show-inheritance: 5 | :inherited-members: 6 | :members: 7 | 8 | .. autoclass:: data_extractor.item.Item 9 | :show-inheritance: 10 | :inherited-members: 11 | :members: extract, field_names, simplify 12 | -------------------------------------------------------------------------------- /docs/source/api_json.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: data_extractor.json 2 | :members: 3 | :inherited-members: 4 | :show-inheritance: 5 | -------------------------------------------------------------------------------- /docs/source/api_lxml.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: data_extractor.lxml 2 | :members: 3 | :inherited-members: 4 | :show-inheritance: 5 | -------------------------------------------------------------------------------- /docs/source/api_reference.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | .. automodule:: data_extractor 6 | 7 | .. 
toctree:: 8 | :name: API Reference 9 | :maxdepth: 2 10 | 11 | api_core 12 | api_exceptions 13 | api_utils 14 | api_lxml 15 | api_json 16 | api_item 17 | -------------------------------------------------------------------------------- /docs/source/api_utils.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: data_extractor.utils 2 | 3 | .. autoclass:: data_extractor.utils.LazyStr 4 | 5 | .. autodata:: data_extractor.utils.sentinel 6 | 7 | .. autofunction:: data_extractor.utils.is_extractor 8 | 9 | .. autofunction:: data_extractor.utils.is_simple_extractor 10 | 11 | .. autofunction:: data_extractor.utils.is_complex_extractor 12 | 13 | .. autoclass:: data_extractor.utils.Property 14 | -------------------------------------------------------------------------------- /docs/source/changelog.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | v1.0.1 6 | ~~~~~~ 7 | 8 | **Build** 9 | 10 | - Supports Python 3.13 11 | 12 | 13 | .. include:: history.rst 14 | :start-line: 4 15 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | # Standard Library 21 | from datetime import date 22 | 23 | project = "Data-Extractor" 24 | year = date.today().year 25 | copyright = f"{year}, 林玮" 26 | author = "林玮" 27 | 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.githubpages"] 35 | autodoc_inherit_docstrings = True 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ["_templates"] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = [] 44 | 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = "alabaster" 52 | html_theme_options = { 53 | "description": "Combine XPath, CSS Selectors and JSONPath for Web data extracting.", 54 | "github_button": True, 55 | "github_type": "star", 56 | "travis_button": True, 57 | "codecov_button": True, 58 | "github_user": "linw1995", 59 | "github_repo": "data_extractor", 60 | "fixed_sidebar": False, 61 | "page_width": "1024px", 62 | "sidebar_width": "230px", 63 | } 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 
67 | html_static_path = ["_static"] 68 | html_sidebars = { 69 | "**": ["about.html", "navigation.html", "relations.html", "searchbox.html"] 70 | } 71 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | 6 | Environment Setup 7 | ~~~~~~~~~~~~~~~~~ 8 | 9 | Clone the source codes from Github. 10 | 11 | .. code-block:: shell 12 | 13 | git clone https://github.com/linw1995/data_extractor.git 14 | cd data_extractor 15 | 16 | Setup the development environment. 17 | Please make sure you install the pdm_, 18 | pre-commit_ and nox_ CLIs in your environment. 19 | 20 | .. code-block:: shell 21 | 22 | make init 23 | make PYTHON=3.7 init # for specific python version 24 | 25 | Linting 26 | ~~~~~~~ 27 | 28 | Use pre-commit_ for installing linters to ensure a good code style. 29 | 30 | .. code-block:: shell 31 | 32 | make pre-commit 33 | 34 | Run linters. Some linters run via CLI nox_, so make sure you install it. 35 | 36 | .. code-block:: shell 37 | 38 | make check-all 39 | 40 | Testing 41 | ~~~~~~~ 42 | 43 | Run quick tests. 44 | 45 | .. code-block:: shell 46 | 47 | make 48 | 49 | Run quick tests with verbose. 50 | 51 | .. code-block:: shell 52 | 53 | make vtest 54 | 55 | Run tests with coverage. 56 | Testing in multiple Python environments is powered by CLI nox_. 57 | 58 | .. code-block:: shell 59 | 60 | make cov 61 | 62 | .. _pdm: https://github.com/pdm-project/pdm 63 | .. _pre-commit: https://pre-commit.com/ 64 | .. 
_nox: https://nox.thea.codes/en/stable/ 65 | -------------------------------------------------------------------------------- /docs/source/history.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | v1.0.0 6 | ~~~~~~ 7 | 8 | **Feature** 9 | 10 | - Generic extractor with convertor (#83) 11 | - mypy plugin for type annotation of extracting result (#83) 12 | 13 | v0.10.2 14 | ~~~~~~~ 15 | 16 | **Build** 17 | 18 | - upgrade jsonpath-extractor to v0.8.0 19 | 20 | 21 | v0.10.1 22 | ~~~~~~~ 23 | 24 | **Fix** 25 | 26 | - typo in .utils.Property 27 | 28 | v0.10.0 29 | ~~~~~~~ 30 | 31 | **Feature** 32 | 33 | - supports PEP 561 -- Distributing and Packaging Type Information 34 | 35 | **Fix** 36 | 37 | - remove LICENSE file from dist files 38 | - duplicated extracting if class attrs overlap happened #67 39 | - remove super class sub-extractors error #68 40 | 41 | **Refactor** 42 | 43 | - remove duplicated module "data_extractor.abc" 44 | - remove the lazy build mechanism of extractors 45 | - JSON backend invoking mechanism 46 | - make all properties of extractors immutable 47 | 48 | **Document** 49 | 50 | - fix wrong docstring of "data_extractor.utils.Property" 51 | 52 | v0.9.0 53 | ~~~~~~ 54 | 55 | **Fix** 56 | 57 | - type annotations #63 #64 58 | 59 | **Refactor** 60 | 61 | - .utils.Property with "Customized names" support #64 62 | - rename .abc to .core and mark elder duplicated #65 63 | 64 | v0.8.0 65 | ~~~~~~ 66 | 67 | - 11bfd2c supports latest jsonpath-extractor package 68 | 69 | v0.7.0 70 | ~~~~~~ 71 | 72 | - 65d1fce Fix:Create JSONExtractor with wrong subtype 73 | - 407cd78 New:Make lxml and cssselect optional (#61) 74 | 75 | v0.6.1 76 | ~~~~~~ 77 | 78 | - d28fff4 Fix:Item created error by ``type`` function.
(Issue #56) 79 | 80 | v0.6.0 81 | ~~~~~~ 82 | 83 | - f1d21fe New:Make different implementations of JSONExtractor optional 84 | - 0175cde New:Add jsonpath-extractor as opitional json extractor backend 85 | - 3b6da8b Chg:Upgrade dependencies 86 | 87 | v0.6.0-alpha.3 88 | ~~~~~~~~~~~~~~ 89 | 90 | - 1982302 Fix:Type annotation error 91 | 92 | v0.6.0.dev2 93 | ~~~~~~~~~~~ 94 | 95 | - b7edbae Dev,New:Use nox test in multi-py-versions, Update workflow 96 | - a043838 Fix:Can't import JSONPathExtractor from root module 97 | - a23ece9 Test,Fix:Missing JSONPathExtractor in simple extractor tests 98 | - 5903ff9 Dev,Fix:Nox changes symlink '.venv' of virtualenv of development 99 | - 57d03ad Dev,Fix:Install unneeded development dependencies 100 | 101 | v0.6.0.dev1 102 | ~~~~~~~~~~~ 103 | 104 | - 2459f7d Dev,New:Add Github Actions for CI 105 | - a151a91 Dev,New:Add scripts/export_requirements_txt.sh 106 | - f7cdaa3 Dev,Chg:Remove travis-ci 107 | - f1d21fe New:Make different implementations of JSONExtractor optional 108 | - 9f74619 Fix:Use __getattr__ on the module in the wrong way 109 | - 25a8bf8 Dev,Fix:Cannot use pytest.mark.usefixtures() in pytest.param 110 | - 8f51603 Dev,Chg:Upgrade poetry version in Makefile 111 | - 21aa08e Dev,Chg:Test in two ways 112 | - 4cb4678 Chg:Upgrade dependencies 113 | - 4177b98 Dev,Fix:remove the venv before pretest installation 114 | - 0175cde New:Add jsonpath-extractor as opitional json extractor backend 115 | 116 | v0.5.4 117 | ~~~~~~ 118 | 119 | - 9552c79 Fix:Simplified item's extract_first method fail to raise ExtractError 120 | - 08167ab Fix:Simplified item's extract_first method 121 | should support param default 122 | - 6e4c269 New:More unittest for testing the simplified items 123 | - a35b85a Chg:Update poetry.lock 124 | - e5ff37b Docs,Chg:Update travis-ci status source in the README.rst 125 | 126 | v0.5.3 127 | ~~~~~~ 128 | 129 | - 6a26be5 Chg:Wrap the single return value as a list 130 | - 0b63927 Fix:Item can not extract the data is 
list type 131 | - 9deeb5f Chg:Update poetry.lock 132 | 133 | v0.5.2 134 | ~~~~~~ 135 | 136 | - 0561672 Fix:Wrong parameter name 137 | 138 | v0.5.1 139 | ~~~~~~ 140 | 141 | - c9b07f4 Fix:Wrong shield placing 142 | - b198712 Dev,Fix:Build travis-ci config validation 143 | 144 | v0.5.0 145 | ~~~~~~ 146 | 147 | - 0056f37 Split AbstractExtractor into AbstractSimpleExtractor and 148 | AbstractComplexExtractor 149 | - c42aeb5 Feature/more friendly development setup (#34) 150 | - 2f9a71c New:Support testing in 3.8 151 | - c8bd593 New:Stash unstaged code before testing 152 | - d2a18a8 New:Best way to raise new exc 153 | - 90fa9c8 New:ExprError ``__str__`` implementation 154 | - d961768 Fix:Update mypy pre-commit config 155 | - e5d59c3 New:Raise SyntaxError when field overwrites method (#38) 156 | - 7720fb9 Feature/avoid field overwriting (#39) 157 | - b722717 Dev,Fix:Black configure not working 158 | - f8f0df8 New:Implement extractors' build method 159 | - 98ada74 Chg:Update docs 160 | 161 | v0.4.1 162 | ~~~~~~ 163 | 164 | - d180992 Add pre-commit support and fix pre-commit check error (#32) 165 | - bd680c1 Update pyproject.toml 166 | - 64f30f7 remove unhappened condtional 167 | 168 | v0.4.0 169 | ~~~~~~ 170 | 171 | - 74f569b Update docs and lint docs (#31) 172 | - 4188634 Support RTD (#30) 173 | - a5b776f Separate dependencies (#29) 174 | - 69079b4 Generate simple extractor from a complex extractor (#28) 175 | - 58a7570 Support JSONPath ext syntax (#26) 176 | - bb7c602 Replace Pipenv with Poetry (#24) 177 | 178 | v0.3.2 179 | ~~~~~~ 180 | 181 | - cd65ad0 Make Parameter extractor Optional 182 | 183 | v0.2.2 184 | ~~~~~~ 185 | 186 | - fca801a Merge pull request #22 from linw1995/hotfix 187 | 188 | + 8bf2a62 Fix name overwritten syntax checking 189 | that includes the ``__init__`` first parameter. 190 | 191 | + 10e2ca0 Fix raise wrong execption from python repl, 192 | oneline code or type() creation. 
193 | 194 | v0.2.1 195 | ~~~~~~ 196 | 197 | - a05b75f Export all from the root module. 198 | - d2900d3 Add Optional Parameter name for special field name. (#19) 199 | - 99a4a7f Raise SyntaxError 200 | when the field name is the same as Item's parameter… (#18) 201 | 202 | v0.2.0 203 | ~~~~~~ 204 | 205 | - 9c2e2cd Rename ExtractFirstMixin into SimpleExtractorBase (#12) 206 | - bac925d Raise ValueError 207 | when misplaced the complex extractor in complex extractor. (#13) 208 | 209 | - 88b9227 Wrap expr exception (#14) 210 | - aeb9520 Deploy Docs on GitHub Pages. (#15) 211 | 212 | + Update docstring. 213 | + Deploy Docs on Github Pages. 214 | + Add Quickstarts.rst 215 | 216 | - Bump into beta 217 | 218 | v0.1.5 219 | ~~~~~~ 220 | 221 | - cabfac3 Add utils.py 222 | - 9e1c005 Make all extractor class inherit the same ABC. 223 | - 7828a1a Make easy to trace exception thrown 224 | by complex extractor extracting data. 225 | 226 | v0.1.4 227 | ~~~~~~ 228 | 229 | - f4267fe Modify docstr 230 | - 6f2f8d1 Add more docstr 231 | 232 | v0.1.3 233 | ~~~~~~ 234 | 235 | - 5f4b0e0 Update README.md 236 | - 1b8bfb9 Add UserWarning when extractor can't extract first item from result 237 | - dd2cd25 Remove the useless _extract call 238 | - 655ec9d Add UserWarning when expr is conflict with parameter is_many=True 239 | - bcade2c No alow user to set is_many=True and default!=sentinel at same time 240 | - 761bd30 Add more unit tests 241 | 242 | v0.1.2 243 | ~~~~~~ 244 | 245 | - Add exceptions.py and ExprError 246 | - Change travis-ci deploy stage condition 247 | - Add travis-ci deploy github release 248 | 249 | v0.1.1 250 | ~~~~~~ 251 | 252 | - Rename ``.html`` to ``.lxml``; 253 | Remove ``fromstring``, ``tostring`` function from ``.lxml`` 254 | 255 | + Rename .html to .lxml 256 | + use ``lxml.html.fromstring`` and ``lxml.html.tostring`` to process HTML 257 | + use ``lxml.etree.fromstring`` and ``lxml.etree.tostring`` to process XML 258 | 259 | - Add check_isort, check_black, check, 260 | 
check_all, fc: format_code into Makefile for development. 261 | 262 | v0.1.0 263 | ~~~~~~ 264 | 265 | - initialize project 266 | - add Extractor to extract data from the text which format is HTML or JSON. 267 | - add complex extractor: Field, Item 268 | -------------------------------------------------------------------------------- /docs/source/howto/index.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | Data-Extractor HOWTOs 3 | ===================== 4 | 5 | Learning how to use data-extractor. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | json 11 | lxml 12 | item 13 | -------------------------------------------------------------------------------- /docs/source/howto/item.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Complex Extracting 3 | ================== 4 | 5 | .. include:: lxml.rst 6 | :start-line: 7 7 | :end-before: Using 8 | 9 | Defining :class:`ChannelItem` class, then extracting the data. 10 | 11 | .. code-block:: python3 12 | 13 | from data_extractor import Field, Item, XPathExtractor 14 | 15 | 16 | class ChannelItem(Item): 17 | title = Field(XPathExtractor("./title/text()"), default="") 18 | link = Field(XPathExtractor("./link/text()"), default="") 19 | description = Field(XPathExtractor("./description/text()")) 20 | publish_date = Field(XPathExtractor("./pubDate/text()")) 21 | guid = Field(XPathExtractor("./guid/text()")) 22 | 23 | Extracting all channel items from file. 24 | 25 | .. code-block:: python3 26 | 27 | from data_extractor import XPathExtractor 28 | 29 | extractor = ChannelItem(XPathExtractor("//channel/item"), is_many=True) 30 | assert extractor.extract(root)[:2] == [ 31 | { 32 | "title": "Star City", 33 | "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp", 34 | "description": 'How do Americans get ready to work with Russians aboard the International Space Station? 
They take a crash course in culture, language and protocol at Russia\'s Star City.', 35 | "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT", 36 | "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573", 37 | }, 38 | { 39 | "title": "", 40 | "link": "", 41 | "description": 'Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a partial eclipse of the Sun on Saturday, May 31st.', 42 | "publish_date": "Fri, 30 May 2003 11:06:42 GMT", 43 | "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572", 44 | }, 45 | ] 46 | 47 | Nested Extractors 48 | ~~~~~~~~~~~~~~~~~ 49 | 50 | Defining :class:`Channel` class with :class:`ChannelItem`. 51 | 52 | .. code-block:: python3 53 | 54 | class Channel(Item): 55 | title = Field(XPathExtractor("./title/text()")) 56 | link = Field(XPathExtractor("./link/text()")) 57 | description = Field(XPathExtractor("./description/text()")) 58 | language = Field(XPathExtractor("./language/text()")) 59 | publish_date = Field(XPathExtractor("./pubDate/text()")) 60 | last_build_date = Field(XPathExtractor("./lastBuildDate/text()")) 61 | docs = Field(XPathExtractor("./docs/text()")) 62 | generator = Field(XPathExtractor("./generator/text()")) 63 | managing_editor = Field(XPathExtractor("./managingEditor/text()")) 64 | web_master = Field(XPathExtractor("./webMaster/text()")) 65 | 66 | items = ChannelItem(XPathExtractor("./item[position()<3]"), is_many=True) 67 | 68 | Extracting the rss channel data from file. 69 | 70 | .. 
code-block:: python3 71 | 72 | from data_extractor import XPathExtractor 73 | 74 | extractor = Channel(XPathExtractor("//channel")) 75 | assert extractor.extract(root) == { 76 | "title": "Liftoff News", 77 | "link": "http://liftoff.msfc.nasa.gov/", 78 | "description": "Liftoff to Space Exploration.", 79 | "language": "en-us", 80 | "publish_date": "Tue, 10 Jun 2003 04:00:00 GMT", 81 | "last_build_date": "Tue, 10 Jun 2003 09:41:01 GMT", 82 | "docs": "http://blogs.law.harvard.edu/tech/rss", 83 | "generator": "Weblog Editor 2.0", 84 | "managing_editor": "editor@example.com", 85 | "web_master": "webmaster@example.com", 86 | "items": [ 87 | { 88 | "title": "Star City", 89 | "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp", 90 | "description": 'How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia\'s Star City.', 91 | "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT", 92 | "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573", 93 | }, 94 | { 95 | "title": "", 96 | "link": "", 97 | "description": 'Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a partial eclipse of the Sun on Saturday, May 31st.', 98 | "publish_date": "Fri, 30 May 2003 11:06:42 GMT", 99 | "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572", 100 | }, 101 | ], 102 | } 103 | 104 | Simplifying Complex Extractor 105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 106 | 107 | A complex extractor can be simplified 108 | into a simple extractor 109 | by using :meth:`data_extractor.item.Item.simplify`. 110 | 111 | .. 
code-block:: python3 112 | 113 | from data_extractor import XPathExtractor 114 | 115 | complex_extractor = ChannelItem(XPathExtractor("//channel/item")) 116 | simple_extractor = complex_extractor.simplify() 117 | 118 | complex_extractor.is_many = False 119 | assert simple_extractor.extract_first(root) == complex_extractor.extract(root) 120 | 121 | complex_extractor.is_many = True 122 | assert simple_extractor.extract(root) == complex_extractor.extract(root) 123 | 124 | Set Parameter Extractor Be None To Extract Data From Root 125 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 126 | 127 | .. code-block:: python3 128 | 129 | from data_extractor import Item, Field, JSONExtractor 130 | 131 | 132 | class User(Item): 133 | nickname = Field(JSONExtractor("name")) 134 | age = Field(JSONExtractor("age")) 135 | raw = Field() 136 | 137 | 138 | assert User().extract({"name": "john", "age": 17, "gender": "male"}) == { 139 | "nickname": "john", 140 | "age": 17, 141 | "raw": {"name": "john", "age": 17, "gender": "male"}, 142 | } 143 | 144 | Avoid Field Overwrites Property Or Method 145 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 146 | 147 | To prevent a complex extractor's field from overwriting its property or method, 148 | use the parameter **name** of the complex extractor. 149 | 150 | .. code-block:: python3 151 | 152 | from data_extractor import Field, Item, JSONExtractor 153 | 154 | 155 | class User(Item): 156 | name_ = Field(JSONExtractor("name"), name="name") 157 | 158 | 159 | assert User().extract({"name": "john", "age": 17}) == {"name": "john"} 160 | -------------------------------------------------------------------------------- /docs/source/howto/json.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Extract JSON Data 3 | ================= 4 | 5 | The function to extract data from the JSON file 6 | powered by python-jsonpath-rw_ and python-jsonpath-rw-ext_ 7 | to support JSONPath_.
8 | Or use a new syntax of JSONPath for extracting 9 | by installing the optional dependency jsonpath-extractor_. 10 | 11 | Run the commands below to install the optional dependencies. 12 | 13 | .. code-block:: shell 14 | 15 | pip install "data_extractor[jsonpath-rw]" 16 | pip install "data_extractor[jsonpath-rw-ext]" 17 | 18 | pip install "data_extractor[jsonpath-extractor]" 19 | 20 | Use the :class:`data_extractor.json.JSONExtractor` to extract data. 21 | 22 | .. code-block:: python3 23 | 24 | import json 25 | from data_extractor import JSONExtractor 26 | 27 | text = '{"foo": [{"baz": 1}, {"baz": 2}]}' 28 | data = json.loads(text) 29 | assert JSONExtractor("foo[*].baz").extract(data) == [1, 2] 30 | 31 | .. _python-jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw 32 | .. _python-jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/ 33 | .. _JSONPath: https://goessner.net/articles/JsonPath/ 34 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath 35 | 36 | Change :data:`json_extractor_backend` 37 | to use a specific backend of the JSON extractor. 38 | See the API reference of :class:`data_extractor.json.JSONExtractor` 39 | for additional details. 40 | -------------------------------------------------------------------------------- /docs/source/howto/lxml.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Extract HTML or XML Data 3 | ======================== 4 | 5 | The function to extract data from the html or xml file 6 | powered by lxml_ to support XPath_, by cssselect_ to support CSS-Selectors_. 7 | 8 | Run the commands below to install the optional dependencies. 9 | 10 | .. code-block:: shell 11 | 12 | pip install "data_extractor[lxml]" # For using XPath 13 | pip install "data_extractor[cssselect]" # For using CSS-Selectors 14 | 15 | Download the RSS sample file for demonstration. 16 | 17 | ..
code-block:: shell 18 | 19 | wget http://www.rssboard.org/files/sample-rss-2.xml 20 | 21 | Parse it into :class:`data_extractor.lxml.Element`. 22 | 23 | .. code-block:: python3 24 | 25 | from pathlib import Path 26 | 27 | from lxml.etree import fromstring 28 | 29 | root = fromstring(Path("sample-rss-2.xml").read_text()) 30 | 31 | Using :class:`data_extractor.lxml.XPathExtractor` to extract rss channel title. 32 | 33 | .. code-block:: python3 34 | 35 | from data_extractor import XPathExtractor 36 | 37 | assert XPathExtractor("//channel/title/text()").extract_first(root) == "Liftoff News" 38 | 39 | Using :class:`data_extractor.lxml.TextCSSExtractor` 40 | to extract all rss item links. 41 | 42 | .. code-block:: python3 43 | 44 | from data_extractor import TextCSSExtractor 45 | 46 | assert TextCSSExtractor("item>link").extract(root) == [ 47 | "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp", 48 | "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp", 49 | "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp", 50 | ] 51 | 52 | Using :class:`data_extractor.lxml.AttrCSSExtractor` to extract rss version. 53 | 54 | .. code-block:: python3 55 | 56 | from data_extractor import AttrCSSExtractor 57 | 58 | assert AttrCSSExtractor("rss", attr="version").extract_first(root) == "2.0" 59 | 60 | .. _lxml: https://lxml.de 61 | .. _XPath: https://www.w3.org/TR/xpath-10/ 62 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/ 63 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/ 64 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ========================================== 2 | Welcome to Data-Extractor's documentation! 3 | ========================================== 4 | 5 | .. include:: readme.rst 6 | :start-line: 4 7 | 8 | Contents 9 | ======== 10 | 11 | .. 
toctree:: 12 | :maxdepth: 4 13 | 14 | quickstarts 15 | howto/index 16 | contributing 17 | api_reference 18 | changelog 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`search` 26 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | Install the stable version from PYPI. 6 | 7 | .. code-block:: shell 8 | 9 | pip install "data-extractor[jsonpath-extractor]" # for extracting JSON data 10 | pip install "data-extractor[lxml]" # for extracting HTML data 11 | 12 | Or install the latest version from Github. 13 | 14 | .. code-block:: shell 15 | 16 | pip install "data-extractor[jsonpath-extractor] @ git+https://github.com/linw1995/data_extractor.git@master" 17 | 18 | Extract JSON data 19 | ~~~~~~~~~~~~~~~~~ 20 | 21 | Currently supports to extract JSON data with below optional dependencies 22 | 23 | - jsonpath-extractor_ 24 | - jsonpath-rw_ 25 | - jsonpath-rw-ext_ 26 | 27 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath 28 | .. _jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw 29 | .. _jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/ 30 | 31 | install one dependency of them to extract JSON data. 32 | 33 | Extract HTML(XML) data 34 | ~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | Currently supports to extract HTML(XML) data with below optional dependencies 37 | 38 | - lxml_ for using XPath_ 39 | - cssselect_ for using CSS-Selectors_ 40 | 41 | .. _lxml: https://lxml.de/ 42 | .. _XPath: https://www.w3.org/TR/xpath-10/ 43 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/ 44 | .. 
_CSS-Selectors: https://www.w3.org/TR/selectors-3/ 45 | -------------------------------------------------------------------------------- /docs/source/quickstarts.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Quickstarts 3 | =========== 4 | 5 | Installation 6 | ~~~~~~~~~~~~ 7 | 8 | .. include:: installation.rst 9 | :start-line: 4 10 | 11 | Usage 12 | ~~~~~ 13 | 14 | .. code-block:: python3 15 | 16 | from data_extractor import Field, Item, JSONExtractor 17 | 18 | 19 | class Count(Item): 20 | followings = Field(JSONExtractor("countFollowings")) 21 | fans = Field(JSONExtractor("countFans")) 22 | 23 | 24 | class User(Item): 25 | name_ = Field(JSONExtractor("name"), name="name") 26 | age = Field(JSONExtractor("age"), default=17) 27 | count = Count() 28 | 29 | 30 | assert User(JSONExtractor("data.users[*]"), is_many=True).extract( 31 | { 32 | "data": { 33 | "users": [ 34 | { 35 | "name": "john", 36 | "age": 19, 37 | "countFollowings": 14, 38 | "countFans": 212, 39 | }, 40 | { 41 | "name": "jack", 42 | "description": "", 43 | "countFollowings": 54, 44 | "countFans": 312, 45 | }, 46 | ] 47 | } 48 | } 49 | ) == [ 50 | {"name": "john", "age": 19, "count": {"followings": 14, "fans": 212}}, 51 | {"name": "jack", "age": 17, "count": {"followings": 54, "fans": 312}}, 52 | ] 53 | -------------------------------------------------------------------------------- /docs/source/readme.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Data Extractor 3 | ============== 4 | 5 | |license| |Pypi Status| |Python version| |Package version| |PyPI - Downloads| 6 | |GitHub last commit| |Code style: black| |Build Status| |codecov| 7 | |Documentation Status| |PDM managed| 8 | 9 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting. 10 | 11 | Quickstarts 12 | <<<<<<<<<<< 13 | 14 | .. 
include:: quickstarts.rst 15 | :start-line: 4 16 | 17 | Changelog 18 | <<<<<<<<< 19 | 20 | .. include:: changelog.rst 21 | :start-line: 4 22 | :end-before: .. include:: history.rst 23 | 24 | Contributing 25 | <<<<<<<<<<<< 26 | 27 | .. include:: contributing.rst 28 | :start-line: 4 29 | 30 | .. |license| image:: https://img.shields.io/github/license/linw1995/data_extractor.svg 31 | :target: https://github.com/linw1995/data_extractor/blob/master/LICENSE 32 | 33 | .. |Pypi Status| image:: https://img.shields.io/pypi/status/data_extractor.svg 34 | :target: https://pypi.org/project/data_extractor 35 | 36 | .. |Python version| image:: https://img.shields.io/pypi/pyversions/data_extractor.svg 37 | :target: https://pypi.org/project/data_extractor 38 | 39 | .. |Package version| image:: https://img.shields.io/pypi/v/data_extractor.svg 40 | :target: https://pypi.org/project/data_extractor 41 | 42 | .. |PyPI - Downloads| image:: https://img.shields.io/pypi/dm/data-extractor.svg 43 | :target: https://pypi.org/project/data_extractor 44 | 45 | .. |GitHub last commit| image:: https://img.shields.io/github/last-commit/linw1995/data_extractor.svg 46 | :target: https://github.com/linw1995/data_extractor 47 | 48 | .. |Code style: black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 49 | :target: https://github.com/ambv/black 50 | 51 | .. |Build Status| image:: https://github.com/linw1995/data_extractor/workflows/Lint&Test/badge.svg 52 | :target: https://github.com/linw1995/data_extractor/actions?query=workflow%3ALint%26Test 53 | 54 | .. |codecov| image:: https://codecov.io/gh/linw1995/data_extractor/branch/master/graph/badge.svg 55 | :target: https://codecov.io/gh/linw1995/data_extractor 56 | 57 | .. |Documentation Status| image:: https://readthedocs.org/projects/data-extractor/badge/?version=latest 58 | :target: https://data-extractor.readthedocs.io/en/latest/?badge=latest 59 | 60 | .. 
|PDM managed| image:: https://img.shields.io/badge/pdm-managed-blueviolet 61 | :target: https://pdm.fming.dev 62 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "dream2nix": { 4 | "inputs": { 5 | "nixpkgs": "nixpkgs", 6 | "purescript-overlay": "purescript-overlay", 7 | "pyproject-nix": "pyproject-nix" 8 | }, 9 | "locked": { 10 | "lastModified": 1728585693, 11 | "narHash": "sha256-rhx5SYpIkPu7d+rjF9FGGBVxS0BwAEkmYIsJg2a3E20=", 12 | "owner": "nix-community", 13 | "repo": "dream2nix", 14 | "rev": "c6935471f7e1a9e190aaa9ac9823dca34e00d92a", 15 | "type": "github" 16 | }, 17 | "original": { 18 | "owner": "nix-community", 19 | "repo": "dream2nix", 20 | "type": "github" 21 | } 22 | }, 23 | "flake-compat": { 24 | "flake": false, 25 | "locked": { 26 | "lastModified": 1696426674, 27 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", 28 | "owner": "edolstra", 29 | "repo": "flake-compat", 30 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", 31 | "type": "github" 32 | }, 33 | "original": { 34 | "owner": "edolstra", 35 | "repo": "flake-compat", 36 | "type": "github" 37 | } 38 | }, 39 | "nixpkgs": { 40 | "locked": { 41 | "lastModified": 1728538411, 42 | "narHash": "sha256-f0SBJz1eZ2yOuKUr5CA9BHULGXVSn6miBuUWdTyhUhU=", 43 | "owner": "NixOS", 44 | "repo": "nixpkgs", 45 | "rev": "b69de56fac8c2b6f8fd27f2eca01dcda8e0a4221", 46 | "type": "github" 47 | }, 48 | "original": { 49 | "owner": "NixOS", 50 | "ref": "nixpkgs-unstable", 51 | "repo": "nixpkgs", 52 | "type": "github" 53 | } 54 | }, 55 | "purescript-overlay": { 56 | "inputs": { 57 | "flake-compat": "flake-compat", 58 | "nixpkgs": [ 59 | "dream2nix", 60 | "nixpkgs" 61 | ], 62 | "slimlock": "slimlock" 63 | }, 64 | "locked": { 65 | "lastModified": 1724504251, 66 | "narHash": "sha256-TIw+sac0NX0FeAneud+sQZT+ql1G/WEb7/Vb436rUXM=", 67 | "owner": "thomashoneyman", 68 | 
"repo": "purescript-overlay", 69 | "rev": "988b09676c2a0e6a46dfa3589aa6763c90476b8a", 70 | "type": "github" 71 | }, 72 | "original": { 73 | "owner": "thomashoneyman", 74 | "repo": "purescript-overlay", 75 | "type": "github" 76 | } 77 | }, 78 | "pyproject-nix": { 79 | "flake": false, 80 | "locked": { 81 | "lastModified": 1702448246, 82 | "narHash": "sha256-hFg5s/hoJFv7tDpiGvEvXP0UfFvFEDgTdyHIjDVHu1I=", 83 | "owner": "davhau", 84 | "repo": "pyproject.nix", 85 | "rev": "5a06a2697b228c04dd2f35659b4b659ca74f7aeb", 86 | "type": "github" 87 | }, 88 | "original": { 89 | "owner": "davhau", 90 | "ref": "dream2nix", 91 | "repo": "pyproject.nix", 92 | "type": "github" 93 | } 94 | }, 95 | "root": { 96 | "inputs": { 97 | "dream2nix": "dream2nix", 98 | "nixpkgs": [ 99 | "dream2nix", 100 | "nixpkgs" 101 | ] 102 | } 103 | }, 104 | "slimlock": { 105 | "inputs": { 106 | "nixpkgs": [ 107 | "dream2nix", 108 | "purescript-overlay", 109 | "nixpkgs" 110 | ] 111 | }, 112 | "locked": { 113 | "lastModified": 1688756706, 114 | "narHash": "sha256-xzkkMv3neJJJ89zo3o2ojp7nFeaZc2G0fYwNXNJRFlo=", 115 | "owner": "thomashoneyman", 116 | "repo": "slimlock", 117 | "rev": "cf72723f59e2340d24881fd7bf61cb113b4c407c", 118 | "type": "github" 119 | }, 120 | "original": { 121 | "owner": "thomashoneyman", 122 | "repo": "slimlock", 123 | "type": "github" 124 | } 125 | } 126 | }, 127 | "root": "root", 128 | "version": 7 129 | } 130 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | inputs = { 3 | dream2nix.url = "github:nix-community/dream2nix"; 4 | nixpkgs.follows = "dream2nix/nixpkgs"; 5 | }; 6 | 7 | outputs = { 8 | self, 9 | dream2nix, 10 | nixpkgs, 11 | }: let 12 | eachSystem = nixpkgs.lib.genAttrs [ 13 | "aarch64-darwin" 14 | "aarch64-linux" 15 | "x86_64-darwin" 16 | "x86_64-linux" 17 | ]; 18 | in { 19 | packages = eachSystem (system: { 20 | default = 
dream2nix.lib.evalModules { 21 | packageSets.nixpkgs = import nixpkgs {inherit system;}; 22 | modules = [ 23 | ./default.nix 24 | { 25 | paths.projectRoot = ./.; 26 | paths.projectRootFile = "flake.nix"; 27 | paths.package = ./.; 28 | } 29 | ]; 30 | }; 31 | }); 32 | devShells = eachSystem (system: let 33 | pkgs = import nixpkgs {inherit system;}; 34 | in { 35 | default = pkgs.mkShell { 36 | inputsFrom = [ 37 | self.packages.${system}.default.devShell 38 | ]; 39 | 40 | packages = with pkgs; [ 41 | pre-commit 42 | python312Packages.nox 43 | 44 | python39 45 | python310 46 | python311 47 | python312 48 | python313 49 | ]; 50 | }; 51 | }); 52 | }; 53 | } 54 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | # Standard Library 2 | import os 3 | 4 | from pathlib import Path 5 | 6 | # Third Party Library 7 | import nox 8 | 9 | nox.options.stop_on_first_error = True 10 | 11 | 12 | pythons = ["3.10", "3.11", "3.12", "3.13"] 13 | 14 | os.environ.update({"PDM_IGNORE_SAVED_PYTHON": "1"}) 15 | os.environ.pop("PYTHONPATH", None) 16 | 17 | 18 | def venv_setup_on_create(session, install): 19 | cwd = os.getcwd() 20 | session.cd(session.create_tmp()) 21 | if session.run( 22 | "python", "-Esc", "import data_extractor", success_codes=(1, 0), silent=True 23 | ): 24 | install(session) 25 | session.cd(cwd) 26 | 27 | 28 | @nox.session(python=pythons, venv_backend="venv") 29 | @nox.parametrize( 30 | "extractor_backend", 31 | [ 32 | None, 33 | "jsonpath-extractor", 34 | "jsonpath-rw", 35 | "jsonpath-rw-ext", 36 | "lxml", 37 | "cssselect", 38 | ], 39 | ) 40 | def coverage_test(session, extractor_backend): 41 | venv_setup_on_create( 42 | session, 43 | lambda s: s.run( 44 | "pdm", 45 | "sync", 46 | "-G", 47 | "test", 48 | *(("-G", extractor_backend) if extractor_backend else tuple()), 49 | external=True, 50 | ), 51 | ) 52 | session.run( 53 | "pytest", 54 | "-vv", 55 
| "--cov=data_extractor", 56 | "--cov-append", 57 | "--ignore", 58 | "tests/typesafety", 59 | *session.posargs, 60 | ) 61 | 62 | 63 | @nox.session(python=pythons, venv_backend="venv") 64 | def coverage_report(session): 65 | venv_setup_on_create( 66 | session, 67 | lambda s: s.run("pdm", "sync", "-G", "test", external=True), 68 | ) 69 | session.run("coverage", "report") 70 | session.run("coverage", "xml") 71 | session.run("coverage", "html") 72 | session.log( 73 | f">> open file:/{(Path() / 'htmlcov/index.html').absolute()} to see coverage" 74 | ) 75 | 76 | 77 | @nox.session(python=pythons, venv_backend="venv") 78 | def test_mypy_plugin(session): 79 | venv_setup_on_create( 80 | session, 81 | lambda s: s.run("pdm", "sync", "-G", "test-mypy-plugin", external=True), 82 | ) 83 | 84 | session.run( 85 | "pytest", 86 | "-vv", 87 | "--cov=data_extractor/contrib/mypy", 88 | "--cov-append", 89 | "--mypy-same-process", 90 | "--mypy-ini-file=./tests/mypy.ini", 91 | "tests/typesafety", 92 | *(session.posargs if session.posargs else tuple()), 93 | ) 94 | 95 | 96 | @nox.session(python=pythons[-1:], venv_backend="venv") 97 | def build_readme(session): 98 | venv_setup_on_create( 99 | session, 100 | lambda s: s.run("pdm", "sync", "-G", "build_readme", external=True), 101 | ) 102 | session.run( 103 | "python", "scripts/build_readme.py", "README.template.rst", "README.rst" 104 | ) 105 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "data_extractor" 3 | authors = [{ name = "林玮 (Jade Lin)", email = "linw1995@icloud.com" }] 4 | description = "Combine XPath, CSS Selectors and JSONPath for Web data extracting." 
5 | readme = "README.rst" 6 | classifiers = [ 7 | "Intended Audience :: Developers", 8 | "License :: OSI Approved :: MIT License", 9 | "Programming Language :: Python", 10 | "Programming Language :: Python :: 3", 11 | "Programming Language :: Python :: 3.10", 12 | "Programming Language :: Python :: 3.11", 13 | "Programming Language :: Python :: 3.12", 14 | "Programming Language :: Python :: 3.13", 15 | "Development Status :: 5 - Production/Stable", 16 | "Operating System :: POSIX", 17 | "Operating System :: MacOS :: MacOS X", 18 | "Operating System :: Microsoft :: Windows", 19 | ] 20 | keywords = [ 21 | "data-extractor", 22 | "data-extraction", 23 | "xpath", 24 | "css-selectors", 25 | "jsonpath", 26 | ] 27 | dependencies = [] 28 | requires-python = ">=3.10" 29 | license = { text = "MIT" } 30 | dynamic = ["version"] 31 | 32 | [project.urls] 33 | homepage = "https://github.com/linw1995/data_extractor" 34 | repository = "https://github.com/linw1995/data_extractor" 35 | documentation = "https://data-extractor.readthedocs.io/en/latest/" 36 | 37 | [project.optional-dependencies] 38 | lxml = ["lxml >= 4.3, < 6"] 39 | cssselect = ["lxml >= 4.3, < 6", "cssselect >= 1.0.3, < 2"] 40 | jsonpath-extractor = ["jsonpath-extractor >= 0.5, < 0.9"] 41 | jsonpath-rw = ["jsonpath-rw >= 1.4, < 2"] 42 | jsonpath-rw-ext = ["jsonpath-rw >= 1.4, < 2", "jsonpath-rw-ext >= 1.2, < 2"] 43 | 44 | [build-system] 45 | requires = ["pdm-pep517[setuptools]"] 46 | build-backend = "pdm.pep517.api" 47 | 48 | [tool.commitizen] 49 | name = "cz_conventional_commits" 50 | version = "0.9.0" 51 | tag_format = "v$version" 52 | 53 | [tool.pdm] 54 | includes = [ 55 | "data_extractor/*.py", 56 | "data_extractor/py.typed", 57 | "data_extractor/contrib/", 58 | ] 59 | version = { use_scm = true } 60 | 61 | [tool.pdm.dev-dependencies] 62 | docs = [ 63 | "lxml >= 4.3, < 6", 64 | "cssselect >= 1.0.3, < 2", 65 | "jsonpath-extractor >= 0.5, < 0.9", 66 | "jsonpath-rw >= 1.4, < 2", 67 | "jsonpath-rw-ext >= 1.2, < 2", 68 | 
"sphinx ~= 7.4", 69 | ] 70 | build_readme = ["click >= 7.1.2, < 8", "docutils >= 0.16", "pygments ~= 2.8"] 71 | test = ["pytest >= 6, < 8", "pytest-cov >= 2.7.1, < 3"] 72 | test-mypy-plugin = [ 73 | "pytest >= 6, < 8", 74 | "pytest-cov >= 2.7.1, < 3", 75 | "pytest-mypy-plugins ~= 1.6", 76 | "mypy~=0.930", 77 | ] 78 | 79 | [[tool.pdm.source]] 80 | name = "pypi" 81 | url = "https://pypi.org/simple" 82 | verify_ssl = true 83 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # https://github.com/pytest-dev/pytest/issues/3062 2 | # Don't move below settings into setup.cfg 3 | [pytest] 4 | testpaths = ./tests 5 | log_format = %(asctime)s - %(name)s - %(levelname)s - %(message)s 6 | xfail_strict=true 7 | -------------------------------------------------------------------------------- /scripts/build_readme.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | .. _issues-172: https://github.com/github/markup/issues/172 4 | 5 | Because Github markup do not render :include: directive. 
(issues-172_) 6 | """ 7 | 8 | # Standard Library 9 | from pathlib import Path 10 | from unittest import mock 11 | 12 | # Third Party Library 13 | import click 14 | import docutils.nodes 15 | import docutils.parsers.rst 16 | import docutils.parsers.rst.directives.misc 17 | import docutils.statemachine 18 | import docutils.utils 19 | 20 | 21 | @click.command() 22 | @click.argument("source_file") 23 | @click.argument("target_file") 24 | def build_readme(source_file, target_file): 25 | old_string2lines = docutils.statemachine.string2lines 26 | old_run = docutils.parsers.rst.directives.misc.Include.run 27 | text = "" 28 | target_text = None 29 | 30 | def string2lines(*args, **kwargs): 31 | nonlocal text, target_text 32 | if target_text is not None: 33 | text = text.replace(target_text, args[0]) 34 | target_text = None 35 | else: 36 | text += args[0] 37 | 38 | rv = old_string2lines(*args, **kwargs) 39 | return rv 40 | 41 | def run(self): 42 | nonlocal target_text 43 | target_text = self.block_text 44 | rv = old_run(self) 45 | return rv 46 | 47 | with ( 48 | mock.patch.object(docutils.statemachine, "string2lines", string2lines), 49 | mock.patch.object(docutils.parsers.rst.directives.misc.Include, "run", run), 50 | ): 51 | source_file_path: Path = Path.cwd() / source_file 52 | target_file_path: Path = Path.cwd() / target_file 53 | parser = docutils.parsers.rst.Parser() 54 | default_settings = docutils.frontend.OptionParser( 55 | components=(docutils.parsers.rst.Parser,) 56 | ).get_default_values() 57 | document = docutils.utils.new_document(source_file_path.name, default_settings) 58 | parser.parse(source_file_path.read_text(encoding="utf-8"), document) 59 | text = text.rstrip() + "\n" 60 | if ( 61 | target_file_path.exists() 62 | and target_file_path.read_text(encoding="utf-8") == text 63 | ): 64 | return 65 | 66 | target_file_path.write_text(text, encoding="utf-8") 67 | 68 | 69 | if __name__ == "__main__": 70 | build_readme() 71 | 
-------------------------------------------------------------------------------- /scripts/export_requirements_txt.py: -------------------------------------------------------------------------------- 1 | # Standard Library 2 | import enum 3 | import shlex 4 | import subprocess 5 | 6 | from pathlib import Path 7 | 8 | Format = enum.Enum("Format", "requirements setuppy") 9 | BASE_DIR = Path(__file__).parent / "requirements" 10 | 11 | 12 | def fix_end_of_file(text): 13 | return text.rstrip() + "\n" 14 | 15 | 16 | def pdm_export(args, filename, format: Format): 17 | output = subprocess.check_output( 18 | shlex.split(f"pdm export -f {format.name} {' '.join(args)}"), encoding="utf-8" 19 | ) 20 | output = fix_end_of_file(output) 21 | if format is Format.setuppy: 22 | output = "\n".join( 23 | ['# This a dummy setup.py to enable GitHub "Used By" stats', output] 24 | ) 25 | p = Path(filename) 26 | if not p.parent.exists(): 27 | p.parent.mkdir(parents=True) 28 | is_new = not p.exists() 29 | if is_new or p.read_text() != output: 30 | p.write_text(output) 31 | if is_new: 32 | raise RuntimeError("Create a new file") 33 | 34 | 35 | pdm_export( 36 | args=["--prod"], 37 | filename=BASE_DIR / "requirements-mini.txt", 38 | format=Format.requirements, 39 | ) 40 | pdm_export( 41 | args=[ 42 | "--prod", 43 | "-G:all", 44 | ], 45 | filename=BASE_DIR / "requirements.txt", 46 | format=Format.requirements, 47 | ) 48 | pdm_export( 49 | args=["-G:all"], 50 | filename=BASE_DIR / "requirements-dev.txt", 51 | format=Format.requirements, 52 | ) 53 | pdm_export( 54 | args=["-G", "docs"], 55 | filename=BASE_DIR / "requirements-docs.txt", 56 | format=Format.requirements, 57 | ) 58 | # pdm_export(args=[], filename=BASE_DIR / "setup.py", format=Format.setuppy) 59 | -------------------------------------------------------------------------------- /scripts/requirements/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | # This file is @generated by 
PDM. 2 | # Please do not edit it manually. 3 | 4 | alabaster==0.7.16 \ 5 | --hash=sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65 \ 6 | --hash=sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92 7 | babel==2.16.0 \ 8 | --hash=sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b \ 9 | --hash=sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316 10 | certifi==2024.12.14 \ 11 | --hash=sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56 \ 12 | --hash=sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db 13 | charset-normalizer==3.4.0 \ 14 | --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ 15 | --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ 16 | --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ 17 | --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ 18 | --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ 19 | --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ 20 | --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ 21 | --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ 22 | --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ 23 | --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ 24 | --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ 25 | --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ 26 | --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ 27 | --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ 28 | --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ 29 | 
--hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ 30 | --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ 31 | --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ 32 | --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ 33 | --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ 34 | --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ 35 | --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ 36 | --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ 37 | --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ 38 | --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ 39 | --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ 40 | --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ 41 | --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ 42 | --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ 43 | --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ 44 | --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ 45 | --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ 46 | --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ 47 | --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ 48 | --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ 49 | --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ 50 | --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ 51 | --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ 52 | 
--hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ 53 | --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ 54 | --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ 55 | --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ 56 | --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ 57 | --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ 58 | --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ 59 | --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ 60 | --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ 61 | --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ 62 | --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ 63 | --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ 64 | --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ 65 | --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ 66 | --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ 67 | --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ 68 | --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ 69 | --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ 70 | --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ 71 | --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ 72 | --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ 73 | --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ 74 | --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ 75 | 
--hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 76 | colorama==0.4.6; sys_platform == "win32" \ 77 | --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \ 78 | --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 79 | cssselect==1.2.0 \ 80 | --hash=sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc \ 81 | --hash=sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e 82 | decorator==5.1.1 \ 83 | --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ 84 | --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 85 | docutils==0.21.2 \ 86 | --hash=sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f \ 87 | --hash=sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 88 | idna==3.10 \ 89 | --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ 90 | --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 91 | imagesize==1.4.1 \ 92 | --hash=sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b \ 93 | --hash=sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a 94 | jinja2==3.1.5 \ 95 | --hash=sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb \ 96 | --hash=sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb 97 | jsonpath-extractor==0.8.0 \ 98 | --hash=sha256:08c53808f981fbd27f3488687940607b6213da38cc8c67e56cb41610acd53783 \ 99 | --hash=sha256:e82fcd6ae89123eb5ea09a2afb76d2884346369d0cd0c9509efff65c49fd15b6 100 | jsonpath-rw==1.4.0 \ 101 | --hash=sha256:05c471281c45ae113f6103d1268ec7a4831a2e96aa80de45edc89b11fac4fbec 102 | jsonpath-rw-ext==1.2.2 \ 103 | --hash=sha256:0947e018c4e6d46f9d04c56487793c702eb225fa252891aa4ed41a9ca26f3d84 \ 104 | --hash=sha256:a9e44e803b6d87d135b09d1e5af0db4d4cf97ba62711a80aa51c8c721980a994 105 | lxml==5.3.0 \ 106 | 
--hash=sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3 \ 107 | --hash=sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002 \ 108 | --hash=sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd \ 109 | --hash=sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832 \ 110 | --hash=sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e \ 111 | --hash=sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30 \ 112 | --hash=sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51 \ 113 | --hash=sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4 \ 114 | --hash=sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4 \ 115 | --hash=sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86 \ 116 | --hash=sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8 \ 117 | --hash=sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f \ 118 | --hash=sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03 \ 119 | --hash=sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e \ 120 | --hash=sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99 \ 121 | --hash=sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7 \ 122 | --hash=sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d \ 123 | --hash=sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22 \ 124 | --hash=sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492 \ 125 | --hash=sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b \ 126 | --hash=sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f \ 127 | --hash=sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a \ 128 | --hash=sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a \ 129 
| --hash=sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4 \ 130 | --hash=sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442 \ 131 | --hash=sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b \ 132 | --hash=sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c \ 133 | --hash=sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1 \ 134 | --hash=sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be \ 135 | --hash=sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367 \ 136 | --hash=sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e \ 137 | --hash=sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16 \ 138 | --hash=sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d \ 139 | --hash=sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83 \ 140 | --hash=sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba \ 141 | --hash=sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763 \ 142 | --hash=sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff \ 143 | --hash=sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b \ 144 | --hash=sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c \ 145 | --hash=sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8 \ 146 | --hash=sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f \ 147 | --hash=sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a \ 148 | --hash=sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce \ 149 | --hash=sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1 \ 150 | --hash=sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330 \ 151 | --hash=sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18 \ 
152 | --hash=sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff \ 153 | --hash=sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c \ 154 | --hash=sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179 \ 155 | --hash=sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080 \ 156 | --hash=sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d \ 157 | --hash=sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32 \ 158 | --hash=sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a \ 159 | --hash=sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79 \ 160 | --hash=sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3 \ 161 | --hash=sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5 \ 162 | --hash=sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f \ 163 | --hash=sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d \ 164 | --hash=sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3 \ 165 | --hash=sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9 \ 166 | --hash=sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957 \ 167 | --hash=sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb \ 168 | --hash=sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656 \ 169 | --hash=sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b \ 170 | --hash=sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d \ 171 | --hash=sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd \ 172 | --hash=sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859 \ 173 | --hash=sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a \ 174 | --hash=sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005 
\ 175 | --hash=sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654 \ 176 | --hash=sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80 \ 177 | --hash=sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec \ 178 | --hash=sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7 \ 179 | --hash=sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965 \ 180 | --hash=sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8 181 | markupsafe==3.0.2 \ 182 | --hash=sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4 \ 183 | --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \ 184 | --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \ 185 | --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \ 186 | --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \ 187 | --hash=sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca \ 188 | --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \ 189 | --hash=sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832 \ 190 | --hash=sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b \ 191 | --hash=sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579 \ 192 | --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \ 193 | --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \ 194 | --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \ 195 | --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \ 196 | --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \ 197 | --hash=sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb \ 198 | 
--hash=sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e \ 199 | --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \ 200 | --hash=sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a \ 201 | --hash=sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d \ 202 | --hash=sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b \ 203 | --hash=sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8 \ 204 | --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \ 205 | --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \ 206 | --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \ 207 | --hash=sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d \ 208 | --hash=sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93 \ 209 | --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \ 210 | --hash=sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158 \ 211 | --hash=sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84 \ 212 | --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \ 213 | --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \ 214 | --hash=sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171 \ 215 | --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \ 216 | --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \ 217 | --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \ 218 | --hash=sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d \ 219 | --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \ 220 | --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \ 221 
| --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \ 222 | --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \ 223 | --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \ 224 | --hash=sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798 \ 225 | --hash=sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c \ 226 | --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \ 227 | --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \ 228 | --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \ 229 | --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \ 230 | --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \ 231 | --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430 \ 232 | --hash=sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50 233 | packaging==24.2 \ 234 | --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \ 235 | --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f 236 | pbr==6.1.0 \ 237 | --hash=sha256:788183e382e3d1d7707db08978239965e8b9e4e5ed42669bf4758186734d5f24 \ 238 | --hash=sha256:a776ae228892d8013649c0aeccbb3d5f99ee15e005a4cbb7e61d55a067b28a2a 239 | ply==3.11 \ 240 | --hash=sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3 \ 241 | --hash=sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce 242 | pygments==2.18.0 \ 243 | --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ 244 | --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a 245 | requests==2.32.3 \ 246 | --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ 247 | 
--hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 248 | six==1.17.0 \ 249 | --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \ 250 | --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81 251 | snowballstemmer==2.2.0 \ 252 | --hash=sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1 \ 253 | --hash=sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a 254 | sphinx==7.4.7 \ 255 | --hash=sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe \ 256 | --hash=sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239 257 | sphinxcontrib-applehelp==2.0.0 \ 258 | --hash=sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1 \ 259 | --hash=sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5 260 | sphinxcontrib-devhelp==2.0.0 \ 261 | --hash=sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad \ 262 | --hash=sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2 263 | sphinxcontrib-htmlhelp==2.1.0 \ 264 | --hash=sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8 \ 265 | --hash=sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9 266 | sphinxcontrib-jsmath==1.0.1 \ 267 | --hash=sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178 \ 268 | --hash=sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8 269 | sphinxcontrib-qthelp==2.0.0 \ 270 | --hash=sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab \ 271 | --hash=sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb 272 | sphinxcontrib-serializinghtml==2.0.0 \ 273 | --hash=sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331 \ 274 | --hash=sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d 275 | tomli==2.2.1; python_version < "3.11" \ 276 | 
--hash=sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6 \ 277 | --hash=sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd \ 278 | --hash=sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c \ 279 | --hash=sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b \ 280 | --hash=sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8 \ 281 | --hash=sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6 \ 282 | --hash=sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77 \ 283 | --hash=sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff \ 284 | --hash=sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea \ 285 | --hash=sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192 \ 286 | --hash=sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249 \ 287 | --hash=sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee \ 288 | --hash=sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4 \ 289 | --hash=sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98 \ 290 | --hash=sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8 \ 291 | --hash=sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4 \ 292 | --hash=sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281 \ 293 | --hash=sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744 \ 294 | --hash=sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69 \ 295 | --hash=sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13 \ 296 | --hash=sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140 \ 297 | --hash=sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e \ 298 | --hash=sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e \ 299 
| --hash=sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc \ 300 | --hash=sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff \ 301 | --hash=sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec \ 302 | --hash=sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2 \ 303 | --hash=sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222 \ 304 | --hash=sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106 \ 305 | --hash=sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272 \ 306 | --hash=sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a \ 307 | --hash=sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7 308 | typing-extensions==3.10.0.2 \ 309 | --hash=sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e \ 310 | --hash=sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34 311 | urllib3==2.3.0 \ 312 | --hash=sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df \ 313 | --hash=sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d 314 | --index-url https://pypi.org/simple 315 | -------------------------------------------------------------------------------- /scripts/requirements/requirements-mini.txt: -------------------------------------------------------------------------------- 1 | # This file is @generated by PDM. 2 | # Please do not edit it manually. 3 | 4 | --index-url https://pypi.org/simple 5 | -------------------------------------------------------------------------------- /scripts/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file is @generated by PDM. 2 | # Please do not edit it manually. 
3 | 4 | cssselect==1.2.0 \ 5 | --hash=sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc \ 6 | --hash=sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e 7 | decorator==5.1.1 \ 8 | --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ 9 | --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 10 | jsonpath-extractor==0.8.0 \ 11 | --hash=sha256:08c53808f981fbd27f3488687940607b6213da38cc8c67e56cb41610acd53783 \ 12 | --hash=sha256:e82fcd6ae89123eb5ea09a2afb76d2884346369d0cd0c9509efff65c49fd15b6 13 | jsonpath-rw==1.4.0 \ 14 | --hash=sha256:05c471281c45ae113f6103d1268ec7a4831a2e96aa80de45edc89b11fac4fbec 15 | jsonpath-rw-ext==1.2.2 \ 16 | --hash=sha256:0947e018c4e6d46f9d04c56487793c702eb225fa252891aa4ed41a9ca26f3d84 \ 17 | --hash=sha256:a9e44e803b6d87d135b09d1e5af0db4d4cf97ba62711a80aa51c8c721980a994 18 | lxml==5.3.0 \ 19 | --hash=sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3 \ 20 | --hash=sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002 \ 21 | --hash=sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd \ 22 | --hash=sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832 \ 23 | --hash=sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e \ 24 | --hash=sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30 \ 25 | --hash=sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51 \ 26 | --hash=sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4 \ 27 | --hash=sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4 \ 28 | --hash=sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86 \ 29 | --hash=sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8 \ 30 | --hash=sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f \ 31 | 
--hash=sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03 \ 32 | --hash=sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e \ 33 | --hash=sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99 \ 34 | --hash=sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7 \ 35 | --hash=sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d \ 36 | --hash=sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22 \ 37 | --hash=sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492 \ 38 | --hash=sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b \ 39 | --hash=sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f \ 40 | --hash=sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a \ 41 | --hash=sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a \ 42 | --hash=sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4 \ 43 | --hash=sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442 \ 44 | --hash=sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b \ 45 | --hash=sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c \ 46 | --hash=sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1 \ 47 | --hash=sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be \ 48 | --hash=sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367 \ 49 | --hash=sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e \ 50 | --hash=sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16 \ 51 | --hash=sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d \ 52 | --hash=sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83 \ 53 | --hash=sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba \ 54 | 
--hash=sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763 \ 55 | --hash=sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff \ 56 | --hash=sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b \ 57 | --hash=sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c \ 58 | --hash=sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8 \ 59 | --hash=sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f \ 60 | --hash=sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a \ 61 | --hash=sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce \ 62 | --hash=sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1 \ 63 | --hash=sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330 \ 64 | --hash=sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18 \ 65 | --hash=sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff \ 66 | --hash=sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c \ 67 | --hash=sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179 \ 68 | --hash=sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080 \ 69 | --hash=sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d \ 70 | --hash=sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32 \ 71 | --hash=sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a \ 72 | --hash=sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79 \ 73 | --hash=sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3 \ 74 | --hash=sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5 \ 75 | --hash=sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f \ 76 | --hash=sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d \ 77 | 
--hash=sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3 \ 78 | --hash=sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9 \ 79 | --hash=sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957 \ 80 | --hash=sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb \ 81 | --hash=sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656 \ 82 | --hash=sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b \ 83 | --hash=sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d \ 84 | --hash=sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd \ 85 | --hash=sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859 \ 86 | --hash=sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a \ 87 | --hash=sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005 \ 88 | --hash=sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654 \ 89 | --hash=sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80 \ 90 | --hash=sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec \ 91 | --hash=sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7 \ 92 | --hash=sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965 \ 93 | --hash=sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8 94 | pbr==6.1.0 \ 95 | --hash=sha256:788183e382e3d1d7707db08978239965e8b9e4e5ed42669bf4758186734d5f24 \ 96 | --hash=sha256:a776ae228892d8013649c0aeccbb3d5f99ee15e005a4cbb7e61d55a067b28a2a 97 | ply==3.11 \ 98 | --hash=sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3 \ 99 | --hash=sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce 100 | six==1.17.0 \ 101 | --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \ 102 | 
--hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81 103 | typing-extensions==3.10.0.2 \ 104 | --hash=sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e \ 105 | --hash=sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34 106 | --index-url https://pypi.org/simple 107 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203, W503 4 | 5 | [isort] 6 | profile=black 7 | lines_between_types=1 8 | 9 | import_heading_stdlib=Standard Library 10 | import_heading_thirdparty=Third Party Library 11 | import_heading_firstparty=First Party Library 12 | import_heading_localfolder=Local Folder 13 | 14 | [coverage:run] 15 | branch = true 16 | omit = 17 | site-packages 18 | 19 | [coverage:report] 20 | precision = 2 21 | # Regexes for lines to exclude from consideration 22 | exclude_lines = 23 | # Have to re-enable the standard pragma 24 | pragma: no cover 25 | 26 | # Don't complain about missing debug-only code: 27 | def __repr__ 28 | if self\.debug 29 | 30 | # Don't complain if tests don't hit defensive assertion code: 31 | raise AssertionError 32 | raise NotImplementedError 33 | 34 | # Don't complain if non-runnable code isn't run: 35 | if 0: 36 | if __name__ == .__main__.: 37 | if TYPE_CHECKING: 38 | 39 | # type annotations 40 | @overload 41 | 42 | 43 | ignore_errors = True 44 | 45 | [mypy] 46 | follow_imports = silent 47 | warn_redundant_casts = true 48 | check_untyped_defs = true 49 | disallow_any_generics = false 50 | no_implicit_optional = true 51 | #disallow_untyped_defs = true 52 | #warn_unused_ignores = true 53 | plugins = data_extractor.contrib.mypy:plugin 54 | 55 | [mypy-lxml.*] 56 | ignore_missing_imports = true 57 | 58 | [mypy-cssselect.*] 59 | ignore_missing_imports = true 60 | 61 | [mypy-jsonpath.*] 62 | 
# Standard Library
import platform
import sys

from pathlib import Path

# "<major>.<minor>" of the running interpreter, built from the first two
# components reported by ``platform.python_version_tuple()``.
current_python_version = ".".join(platform.python_version_tuple()[:2])

# Running the pytest CLI changes ``sys.path``. The ``jsonpath`` module that the
# jsonpath-extractor package provides has the same name as the ``jsonpath.py``
# script installed under f'{sys.prefix}/bin', so that script is removed here to
# avoid importing the wrong module.
_shadowing_scripts = (
    Path(sys.prefix) / "bin" / "jsonpath.py",
    Path("__pypackages__") / current_python_version / "bin" / "jsonpath.py",
)
for p in _shadowing_scripts:
    if p.exists():
        p.unlink()

# pdm
They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. 18 | Tue, 03 Jun 2003 09:39:21 GMT 19 | http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 20 | 21 | 22 | Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st. 23 | Fri, 30 May 2003 11:06:42 GMT 24 | http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 25 | 26 | 27 | The Engine That Does More 28 | http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp 29 | Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that. 30 | Tue, 27 May 2003 08:37:32 GMT 31 | http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 32 | 33 | 34 | Astronauts' Dirty Laundry 35 | http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp 36 | Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options. 
@pytest.fixture(
    params=[
        (
            "jsonpath-extractor",
            "jsonpath",
            data_extractor.json.JSONPathExtractor,
        ),
        ("jsonpath-rw", "jsonpath_rw", data_extractor.json.JSONPathRWExtractor),
        (
            "jsonpath-rw-ext",
            "jsonpath_rw_ext",
            data_extractor.json.JSONPathRWExtExtractor,
        ),
    ],
    # Every entry carries a non-empty module name, so it alone labels the case.
    ids=lambda r: r[1],
)
def json_extractor_backend(request):
    """Select one JSONPath backend for the requesting test.

    Each param is ``(package name, importable module name, extractor class)``.
    The test is skipped when the module cannot be found; otherwise the class
    is installed as ``data_extractor.json.json_extractor_backend`` and
    returned.

    NOTE: the module attribute is assigned without any teardown, so the
    selected backend stays in place after the test finishes.
    """
    package_name, module_name, backend_cls = request.param
    if importlib.util.find_spec(module_name) is None:
        # ``pytest.skip`` raises, so nothing after it runs for this param.
        pytest.skip(f"missing {package_name!r}")

    data_extractor.json.json_extractor_backend = backend_cls
    return backend_cls
def test_no_needed_packages(monkeypatch):
    """``JSONExtractor()`` raises ``RuntimeError`` when no backend is selected."""
    # ``monkeypatch`` puts the previous backend back on teardown, so clearing
    # it here cannot leak into tests that run after this one.
    monkeypatch.setattr(data_extractor.json, "json_extractor_backend", None)
    with pytest.raises(RuntimeError):
        JSONExtractor()
def test_field_with_type():
    """``Field[str]`` exposes ``str`` as its type and extracts ``1`` as ``"1"``."""
    # Parametrized through an alias that is bound to a name first.
    StringField = Field[str]
    aliased = StringField(D())
    assert aliased.type is str
    assert aliased.extract(1) == "1"

    # Parametrized inline, then bound to a name.
    inline = Field[str](D())
    assert inline.type is str
    assert inline.extract(1) == "1"

    # Parametrized inline and used without ever being bound.
    assert Field[str](D()).extract(1) == "1"
@pytest.mark.usefixtures("json_extractor_backend")
@pytest.mark.parametrize(
    "expr,expect",
    [
        ("foo[*].baz", 1),
        ("foo.baz", "default"),
        ("foo[0].baz", 1),
        ("foo[1].baz", 2),
        ("foo[2].baz", "default"),
    ],
    ids=repr,
)
def test_extract_first(element, expr, expect):
    """``extract_first`` yields the first match, or ``default`` when none."""
    first_match = JSONExtractor(expr).extract_first(element, default="default")
    assert expect == first_match
@pytest.mark.usefixtures("json_extractor_backend")
@pytest.mark.parametrize("expr", ["foo..", "a[]", ""], ids=repr)
def test_invalid_jsonpath_expr(element, expr):
    """An invalid JSONPath expression makes the constructor raise ``ExprError``."""
    with pytest.raises(ExprError) as catch:
        JSONExtractor(expr)

    error = catch.value

    active_backend = data_extractor.json.json_extractor_backend
    if active_backend is not data_extractor.json.JSONPathExtractor:
        # Third Party Library
        from jsonpath_rw.lexer import JsonPathLexerError

        # Listing ``Exception`` next to the lexer error means any ordinary
        # exception is accepted as the wrapped cause for these backends.
        expected_cause = (JsonPathLexerError, Exception)
    else:
        # JSONExtractor implemented by 'jsonpath-extractor'
        # only raises SyntaxError
        expected_cause = SyntaxError

    assert isinstance(error.exc, expected_cause)
    assert re.match(r"ExprError with .+? raised by .+? extracting", str(error))
@pytest.mark.parametrize(
    "Extractor,expr",
    [
        pytest.param(TextCSSExtractor, "notexits", marks=need_cssselect),
        (XPathExtractor, "//notexists/text()"),
    ],
    ids=repr,
)
def test_extract_first_without_default(element, Extractor, expr):
    """With nothing matched and no default, ``extract_first`` raises ``ExtractError``."""
    failing = Extractor(expr)
    with pytest.raises(ExtractError) as catch:
        failing.extract_first(element)

    raised = catch.value
    # The error records exactly the extractor that failed and the element it
    # was applied to.
    assert len(raised.extractors) == 1
    assert raised.extractors[0] is failing
    assert raised.element is element
@need_lxml
@pytest.mark.parametrize("expr", ["///", "/text(", ""])
def test_invalid_xpath_expr(expr):
    """A malformed XPath makes ``XPathExtractor`` raise ``ExprError`` on creation."""
    # Third Party Library
    from lxml.etree import XPathError

    with pytest.raises(ExprError) as catch:
        XPathExtractor(expr)

    error = catch.value
    # The original lxml failure is kept on the ``exc`` attribute.
    assert isinstance(error.exc, XPathError)
    assert re.match(r"ExprError with .+? raised by .+? extracting", str(error))
def test_lazy_str():
    """``str()`` of a ``LazyStr`` calls ``func`` anew each time it is rendered."""
    current = {"text": ""}

    lazy = LazyStr(func=lambda: current["text"])
    assert str(lazy) == ""

    # Changing the backing value after construction must show up on the next
    # render.
    current["text"] = "abc"
    assert str(lazy) == "abc"
def _extractor_or_skip(build, missing, reason):
    """Return ``build()``, or a skipped param labelled ``reason`` if ``missing``.

    ``build`` is only called when the dependency is available, so extractor
    classes whose backing package is absent are never instantiated.
    """
    if missing:
        # The reason doubles as the param value (keeping the ``ids=repr``
        # label readable) and as the skip message shown in the report.
        return pytest.param(reason, marks=pytest.mark.skip(reason=reason))
    return build()


@pytest.fixture(
    params=[
        _extractor_or_skip(
            lambda: AttrCSSExtractor(expr="div.class", attr="id"),
            _missing_cssselect,
            "Missing 'cssselect'",
        ),
        _extractor_or_skip(
            lambda: CSSExtractor(expr="div.class"),
            _missing_cssselect,
            "Missing 'cssselect'",
        ),
        _extractor_or_skip(
            lambda: JSONPathExtractor(expr="boo"),
            _missing_jsonpath,
            "Missing 'jsonpath-extractor'",
        ),
        _extractor_or_skip(
            lambda: JSONPathRWExtractor(expr="boo"),
            _missing_jsonpath_rw,
            "Missing 'jsonpath-rw'",
        ),
        _extractor_or_skip(
            lambda: JSONPathRWExtExtractor(expr="boo"),
            _missing_jsonpath_rw_ext,
            "Missing 'jsonpath-rw-ext'",
        ),
        _extractor_or_skip(
            lambda: TextCSSExtractor(expr="div.class"),
            _missing_cssselect,
            "Missing 'cssselect'",
        ),
        _extractor_or_skip(
            lambda: XPathExtractor(expr="//div"),
            _missing_lxml,
            "Missing 'lxml'",
        ),
    ],
    ids=repr,
)
def simple_extractor(request):
    """Provide one instance of every simple extractor whose dependency exists."""
    return request.param
@pytest.mark.skipif(
    importlib.util.find_spec("cssselect") is not None,
    reason="'cssselect' installed",
)
def test_missing_cssselect():
    """Every CSS-based extractor raises ``RuntimeError`` naming 'cssselect'."""
    attempts = (
        (CSSExtractor, ("a>b",)),
        (AttrCSSExtractor, ("a>b", "href")),
        (TextCSSExtractor, ("a>b",)),
    )
    for extractor_cls, args in attempts:
        with pytest.raises(RuntimeError) as catch:
            extractor_cls(*args)

        assert "cssselect" in str(catch.value)
def test_property_accessing_error():
    """Reading a ``Property`` that was never set raises ``AttributeError``."""

    class Bar(AbstractSimpleExtractor):
        unset_attribute = Property[None]()

        def extract(self, element):
            return super().extract(element)

    # Accessed on the class, the descriptor object itself is returned.
    assert isinstance(Bar.unset_attribute, Property)

    # Build the instance outside the ``raises`` block so that only the
    # attribute read can satisfy the expected AttributeError; an error raised
    # by the constructor must fail the test instead of passing it.
    bar = Bar("dummy expr")
    with pytest.raises(AttributeError):
        bar.unset_attribute
def pytest_collection_modifyitems(config, items: List[Node]):
    """Mark each collected item whose name is listed in ``xfail`` as a strict xfail."""
    expected_failures = (node for node in items if node.name in xfail)
    for node in expected_failures:
        # ``strict=True`` turns an unexpected pass into a failure.
        node.add_marker(pytest.mark.xfail(strict=True))
46 | _dummy = Field(D()) 47 | _dummy_val = 1 48 | x = Field[int](D()) 49 | y = Field[int](D()) 50 | 51 | rvs = Point2D(D(), is_many=True).extract([{"x": 1, "y": 3}]) 52 | reveal_type(rvs) 53 | rv = Point2D(D()).extract([{"x": 1, "y": 3}]) 54 | reveal_type(rv) 55 | out: | 56 | main:13: note: Revealed type is "builtins.list[TypedDict({'_dummy': Any, 'x': builtins.int, 'y': builtins.int})]" 57 | main:15: note: Revealed type is "TypedDict({'_dummy': Any, 'x': builtins.int, 'y': builtins.int})" 58 | - case: name_paramater_overwrite_typeddict_type 59 | skip: sys.version_info.minor < 8 60 | main: | 61 | from tests.utils import D 62 | from data_extractor.item import Item, Field 63 | 64 | class NamedPoint(Item): 65 | x = Field[int](D()) 66 | y = Field[int](D()) 67 | name_ = Field[str](D(), name="name") 68 | 69 | p = NamedPoint(D()) 70 | rv = p.extract([{"x": 1, "y": 3, "name": "A"}]) 71 | reveal_type(rv) 72 | out: | 73 | main:11: note: Revealed type is "TypedDict({'x': builtins.int, 'y': builtins.int, 'name': builtins.str})" 74 | -------------------------------------------------------------------------------- /tests/typesafety/test_generic.yml: -------------------------------------------------------------------------------- 1 | - case: field_extract_without_typing 2 | main: | 3 | from tests.utils import D 4 | from data_extractor.item import Field 5 | 6 | f = Field(D()) 7 | rv = f.extract(1) 8 | reveal_type(rv) 9 | out: | 10 | main:6: note: Revealed type is "Any" 11 | - case: field_extract_with_typing 12 | main: | 13 | from tests.utils import D 14 | from data_extractor.item import Field 15 | 16 | f_str = Field[str](D()) 17 | rv_str = f_str.extract(1) 18 | reveal_type(rv_str) 19 | 20 | f_int = Field(D(), type=int) 21 | rv_int = f_int.extract("1") 22 | reveal_type(rv_int) 23 | 24 | f_str_2: Field[str] = Field(D()) 25 | rv_str_2 = f_str_2.extract("1") 26 | reveal_type(rv_str_2) 27 | 28 | reveal_type(Field[str](D()).extract(1)) 29 | out: | 30 | main:6: note: Revealed type is 
"builtins.str" 31 | main:10: note: Revealed type is "builtins.int" 32 | main:14: note: Revealed type is "builtins.str" 33 | main:16: note: Revealed type is "builtins.str" 34 | - case: field_extract_with_typing_alias 35 | main: | 36 | from tests.utils import D 37 | from data_extractor.item import Field 38 | 39 | StrField = Field[str] 40 | f = StrField(D()) 41 | rv = f.extract(1) 42 | reveal_type(rv) 43 | out: | 44 | main:7: note: Revealed type is "builtins.str" 45 | - case: field_type_hinting_conflict_with_type_param 46 | main: | 47 | from tests.utils import D 48 | from data_extractor.item import Field 49 | 50 | f_1 = Field[str](D(), type=int) 51 | reveal_type(f_1) 52 | 53 | f_2: Field[str] = Field(D(), type=int) 54 | reveal_type(f_2) 55 | out: | 56 | main:4: error: Argument "type" to "Field" has incompatible type "Type[int]"; expected "Optional[Type[str]]" [arg-type] 57 | main:5: note: Revealed type is "data_extractor.item.Field[builtins.str]" 58 | main:7: error: Argument "type" to "Field" has incompatible type "Type[int]"; expected "Optional[Type[str]]" [arg-type] 59 | main:8: note: Revealed type is "data_extractor.item.Field[builtins.str]" 60 | - case: field_type_hinting_conflict_with_convertor_param 61 | main: | 62 | from tests.utils import D 63 | from data_extractor.item import Field 64 | 65 | f_1 = Field[str](D(), convertor=int) 66 | reveal_type(f_1) 67 | 68 | f_2: Field[str] = Field(D(), convertor=int) 69 | reveal_type(f_2) 70 | out: | 71 | main:4: error: Argument "convertor" to "Field" has incompatible type "Type[int]"; expected "Optional[Callable[[Any], str]]" [arg-type] 72 | main:5: note: Revealed type is "data_extractor.item.Field[builtins.str]" 73 | main:7: error: Argument "convertor" to "Field" has incompatible type "Type[int]"; expected "Optional[Callable[[Any], str]]" [arg-type] 74 | main:8: note: Revealed type is "data_extractor.item.Field[builtins.str]" 75 | - case: field_extract_with_typing_while_trying_to_change_type 76 | main: | 77 | from 
tests.utils import D 78 | from data_extractor.item import Field 79 | 80 | f = Field[str](D()) 81 | rv = f.extract(1) 82 | reveal_type(rv) 83 | f.type = int # unable to change 84 | rv = f.extract("1") 85 | reveal_type(rv) 86 | out: | 87 | main:6: note: Revealed type is "builtins.str" 88 | main:7: error: Incompatible types in assignment (expression has type "Type[int]", variable has type "Optional[Type[str]]") [assignment] 89 | main:9: note: Revealed type is "builtins.str" 90 | - case: field_extract_with_flag_is_many 91 | main: | 92 | from tests.utils import D 93 | from data_extractor.item import Field 94 | 95 | f = Field[str](D(), is_many=True) 96 | rvs = f.extract([1]) 97 | reveal_type(rvs) 98 | 99 | reveal_type(Field[str](D(), is_many=True).extract([1])) 100 | out: | 101 | main:6: note: Revealed type is "builtins.list[builtins.str]" 102 | main:8: note: Revealed type is "builtins.list[builtins.str]" 103 | - case: field_extract_while_trying_to_change_the_flag_of_is_many 104 | main: | 105 | from tests.utils import D 106 | from data_extractor.item import Field 107 | 108 | f = Field[str](D(), is_many=True) 109 | rvs = f.extract([1]) 110 | reveal_type(rvs) 111 | 112 | f.is_many = False # unable to change 113 | rv = f.extract(1) 114 | reveal_type(rv) 115 | out: | 116 | main:6: note: Revealed type is "builtins.list[builtins.str]" 117 | main:10: note: Revealed type is "builtins.list[builtins.str]" 118 | - case: subclass_field_extract_with_flag_is_many 119 | main: | 120 | from tests.utils import D 121 | from data_extractor import Field, RV 122 | 123 | class CField(Field[RV]): 124 | pass 125 | 126 | f1 = CField[str](D()) 127 | rv = f1.extract([1]) 128 | reveal_type(rv) 129 | f2 = CField[str](D(), is_many=True) 130 | rvs = f2.extract([1]) 131 | reveal_type(rvs) 132 | out: | 133 | main:9: note: Revealed type is "builtins.str" 134 | main:12: note: Revealed type is "builtins.list[builtins.str]" 135 | - case: item_extract_with_flag_is_many 136 | main: | 137 | from tests.utils 
import D 138 | from data_extractor import RV, Item 139 | 140 | class C(Item[RV]): 141 | pass 142 | 143 | f1 = C(D()) 144 | rv = f1.extract([1]) 145 | reveal_type(rv) 146 | f2 = C(D(), is_many=True) 147 | rvs = f2.extract([1]) 148 | reveal_type(rvs) 149 | out: | 150 | main:9: note: Revealed type is "TypedDict({})" 151 | main:12: note: Revealed type is "builtins.list[TypedDict({})]" 152 | - case: field_is_many_work_with_assign_expr 153 | main: | 154 | from tests.utils import D 155 | from data_extractor import Field 156 | 157 | f2 = f1 = Field(D(), is_many=True) 158 | rv1 = f1.extract([1]) 159 | reveal_type(rv1) 160 | rv2 = f2.extract([1]) 161 | reveal_type(rv2) 162 | out: | 163 | main:6: note: Revealed type is "builtins.list[Any]" 164 | main:8: note: Revealed type is "builtins.list[Any]" 165 | - case: field_is_many_work_with_assign_expr_in_classdef 166 | main: | 167 | from tests.utils import D 168 | from data_extractor import Field 169 | 170 | class B: 171 | f2 = f1 = Field(D(), is_many=True) 172 | 173 | rv1 = B.f1.extract([1]) 174 | reveal_type(rv1) 175 | rv2 = B.f2.extract([1]) 176 | reveal_type(rv2) 177 | out: | 178 | main:8: note: Revealed type is "builtins.list[Any]" 179 | main:10: note: Revealed type is "builtins.list[Any]" 180 | - case: field_is_many_work_with_member_assign_expr 181 | main: | 182 | from typing import Any 183 | 184 | from tests.utils import D 185 | from data_extractor import Field 186 | 187 | class C: 188 | def bar(self): 189 | pass 190 | 191 | f: int = 0 192 | f1: Field 193 | f3: Any 194 | 195 | C.f3 = f2 = C.f1 = Field(D(), is_many=True) 196 | rv1 = C.f1.extract([1]) 197 | reveal_type(rv1) 198 | rv2 = f2.extract([1]) 199 | reveal_type(rv2) 200 | rv3 = C.f3.extract([1]) 201 | reveal_type(rv3) 202 | 203 | C.f = Field(D()) 204 | c.f = Field(D()) 205 | out: | 206 | main:16: note: Revealed type is "builtins.list[Any]" 207 | main:18: note: Revealed type is "builtins.list[Any]" 208 | main:20: note: Revealed type is "builtins.list[Any]" 209 | 
class DumyExtractor(AbstractSimpleExtractor):
    """Minimal extractor stub used by the test-suite.

    It ignores its expression and simply hands back whatever element it is
    given, wrapped in a single-item list.
    """

    def __init__(self, expr=""):
        """Initialise the base extractor; ``expr`` defaults to an empty string."""
        super().__init__(expr)

    def extract(self, element):
        """Return a new one-element list containing ``element`` unchanged."""
        wrapped = [element]
        return wrapped


# Short alias so test snippets can simply write ``D()``.
D = DumyExtractor