├── .github
├── dependabot.yml
└── workflows
│ ├── release.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── Makefile
├── README.rst
├── README.template.rst
├── data_extractor
├── __init__.py
├── contrib
│ └── mypy
│ │ └── __init__.py
├── core.py
├── exceptions.py
├── item.py
├── json.py
├── lxml.py
├── py.typed
└── utils.py
├── default.nix
├── docs
├── Makefile
├── make.bat
└── source
│ ├── _static
│ └── custom.css
│ ├── api_core.rst
│ ├── api_exceptions.rst
│ ├── api_item.rst
│ ├── api_json.rst
│ ├── api_lxml.rst
│ ├── api_reference.rst
│ ├── api_utils.rst
│ ├── changelog.rst
│ ├── conf.py
│ ├── contributing.rst
│ ├── history.rst
│ ├── howto
│ ├── index.rst
│ ├── item.rst
│ ├── json.rst
│ └── lxml.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── quickstarts.rst
│ └── readme.rst
├── flake.lock
├── flake.nix
├── noxfile.py
├── pdm.lock
├── pyproject.toml
├── pytest.ini
├── scripts
├── build_readme.py
├── export_requirements_txt.py
└── requirements
│ ├── requirements-dev.txt
│ ├── requirements-docs.txt
│ ├── requirements-mini.txt
│ └── requirements.txt
├── setup.cfg
└── tests
├── __init__.py
├── assets
└── sample-rss-2.xml
├── conftest.py
├── mypy.ini
├── test_exceptions.py
├── test_generic_item.py
├── test_item.py
├── test_json.py
├── test_lxml.py
├── test_utils.py
├── typesafety
├── conftest.py
├── test_extracted_typed_dict.yml
└── test_generic.yml
└── utils.py
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "pip"
4 | directory: "/"
5 | schedule:
6 | interval: "weekly"
7 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 |   push:
5 |     tags:
6 |       - "*"
7 |
8 | jobs:
9 |   release:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       # checkout@v1 ran on the removed Node12 runtime; v4 is the supported release
13 |       - uses: actions/checkout@v4
14 |       - name: Set up PDM
15 |         # setup-pdm@v3 matches the version already used in test.yml
16 |         uses: pdm-project/setup-pdm@v3
17 |         with:
18 |           # quoted so YAML keeps it a string instead of a float
19 |           python-version: "3.9"
20 |           version: 2.19.2
21 |       - name: Build release
22 |         run: |
23 |           pdm build
24 |       - name: Upload release
25 |         uses: actions/upload-artifact@v4
26 |         with:
27 |           name: dist
28 |           path: dist
29 |       - name: Publish release to PYPI
30 |         run: |
31 |           pip install twine
32 |           twine upload -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} --verbose dist/*
33 |       - name: Publish release to GitHub Release
34 |         uses: softprops/action-gh-release@v2
35 |         with:
36 |           files: dist/*
37 |         env:
38 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
36 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Lint&Test
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - master
7 |       - release/*
8 |   pull_request:
9 |     branches:
10 |       - "*"
11 |
12 | jobs:
13 |   lint:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       # checkout@v1 / cache@v1 ran on the removed Node12 runtime; bumped to current majors
17 |       - uses: actions/checkout@v4
18 |       - name: Set up PDM
19 |         uses: pdm-project/setup-pdm@v3
20 |         with:
21 |           # quoted so YAML keeps it a string instead of a float
22 |           python-version: "3.13"
23 |           version: 2.19.2
24 |       - name: Cache Nox Virtualenvs
25 |         uses: actions/cache@v4
26 |         with:
27 |           path: .nox
28 |           key: ${{ runner.os }}-nox-${{ hashFiles('**/pdm.lock') }}
29 |           restore-keys: ${{ runner.os }}-nox
30 |       - name: Install nox
31 |         run: |
32 |           pip install nox
33 |           pdm config python.use_venv true
34 |       - uses: pre-commit/action@v3.0.1
35 |         env:
36 |           SKIP: export_requirements_txt
37 |   test:
38 |     needs: lint
39 |     runs-on: ubuntu-latest
40 |     strategy:
41 |       matrix:
42 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
43 |     steps:
44 |       - uses: actions/checkout@v4
45 |       - name: Set up PDM
46 |         uses: pdm-project/setup-pdm@v3
47 |         with:
48 |           python-version: ${{ matrix.python-version }}
49 |           version: 2.19.2
50 |       - name: Cache Nox Virtualenvs
51 |         uses: actions/cache@v4
52 |         with:
53 |           path: .nox
54 |           key: ${{ runner.os }}-${{ matrix.python-version }}-nox-${{ hashFiles('**/pdm.lock') }}
55 |           restore-keys: ${{ runner.os }}-${{ matrix.python-version }}-nox
56 |       - name: Install nox
57 |         run: |
58 |           pip install nox
59 |           pdm config python.use_venv true
60 |       - name: Test with coverage
61 |         run: |
62 |           make PYTHON=${{ matrix.python-version }} cov
63 |       - name: Upload coverage to Codecov
64 |         # v2 is deprecated; v4 uses the same token/flags inputs
65 |         uses: codecov/codecov-action@v4
66 |         with:
67 |           token: ${{ secrets.CODECOV_TOKEN }}
68 |           flags: main,unittest,${{ matrix.python-version }}
69 |           fail_ci_if_error: true
70 |   test-mypy-plugin:
71 |     needs: lint
72 |     runs-on: ubuntu-latest
73 |     strategy:
74 |       matrix:
75 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
76 |     steps:
77 |       - uses: actions/checkout@v4
78 |       - name: Set up PDM
79 |         uses: pdm-project/setup-pdm@v3
80 |         with:
81 |           python-version: ${{ matrix.python-version }}
82 |       - name: Cache Nox Virtualenvs
83 |         uses: actions/cache@v4
84 |         with:
85 |           path: .nox
86 |           key: ${{ runner.os }}-${{ matrix.python-version }}-nox-${{ hashFiles('**/pdm.lock') }}
87 |           restore-keys: ${{ runner.os }}-${{ matrix.python-version }}-nox
88 |       - name: Install nox
89 |         run: |
90 |           pip install nox
91 |           pdm config python.use_venv true
92 |       - name: Test
93 |         run: |
94 |           make PYTHON=${{ matrix.python-version }} test-mypy-plugin
95 |       - name: Upload coverage to Codecov
96 |         uses: codecov/codecov-action@v4
97 |         with:
98 |           token: ${{ secrets.CODECOV_TOKEN }}
99 |           flags: plugin-mypy,unittest,${{ matrix.python-version }}
100 |           fail_ci_if_error: true
98 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/python,emacs
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,emacs
4 |
5 | ### Emacs ###
6 | # -*- mode: gitignore; -*-
7 | *~
8 | \#*\#
9 | /.emacs.desktop
10 | /.emacs.desktop.lock
11 | *.elc
12 | auto-save-list
13 | tramp
14 | .\#*
15 |
16 | # Org-mode
17 | .org-id-locations
18 | *_archive
19 | ltximg/**
20 |
21 | # flymake-mode
22 | *_flymake.*
23 |
24 | # eshell files
25 | /eshell/history
26 | /eshell/lastdir
27 |
28 | # elpa packages
29 | /elpa/
30 |
31 | # reftex files
32 | *.rel
33 |
34 | # AUCTeX auto folder
35 | /auto/
36 |
37 | # cask packages
38 | .cask/
39 | dist/
40 |
41 | # Flycheck
42 | flycheck_*.el
43 |
44 | # server auth directory
45 | /server/
46 |
47 | # projectiles files
48 | .projectile
49 |
50 | # directory configuration
51 | .dir-locals.el
52 |
53 | # network security
54 | /network-security.data
55 |
56 |
57 | ### Python ###
58 | # Byte-compiled / optimized / DLL files
59 | __pycache__/
60 | *.py[cod]
61 | *$py.class
62 |
63 | # C extensions
64 | *.so
65 |
66 | # Distribution / packaging
67 | .Python
68 | build/
69 | develop-eggs/
70 | downloads/
71 | eggs/
72 | .eggs/
73 | lib/
74 | lib64/
75 | parts/
76 | sdist/
77 | var/
78 | wheels/
79 | pip-wheel-metadata/
80 | share/python-wheels/
81 | *.egg-info/
82 | .installed.cfg
83 | *.egg
84 | MANIFEST
85 |
86 | # PyInstaller
87 | # Usually these files are written by a python script from a template
88 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
89 | *.manifest
90 | *.spec
91 |
92 | # Installer logs
93 | pip-log.txt
94 | pip-delete-this-directory.txt
95 |
96 | # Unit test / coverage reports
97 | htmlcov/
98 | .tox/
99 | .nox/
100 | .coverage
101 | .coverage.*
102 | .cache
103 | nosetests.xml
104 | coverage.xml
105 | *.cover
106 | *.py,cover
107 | .hypothesis/
108 | .pytest_cache/
109 | pytestdebug.log
110 |
111 | # Translations
112 | *.mo
113 | *.pot
114 |
115 | # Django stuff:
116 | *.log
117 | local_settings.py
118 | db.sqlite3
119 | db.sqlite3-journal
120 |
121 | # Flask stuff:
122 | instance/
123 | .webassets-cache
124 |
125 | # Scrapy stuff:
126 | .scrapy
127 |
128 | # Sphinx documentation
129 | docs/_build/
130 | doc/_build/
131 |
132 | # PyBuilder
133 | target/
134 |
135 | # Jupyter Notebook
136 | .ipynb_checkpoints
137 |
138 | # IPython
139 | profile_default/
140 | ipython_config.py
141 |
142 | # pyenv
143 | .python-version
144 |
145 | # pipenv
146 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
147 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
148 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
149 | # install all needed dependencies.
150 | #Pipfile.lock
151 |
152 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
153 | __pypackages__/
154 |
155 | # Celery stuff
156 | celerybeat-schedule
157 | celerybeat.pid
158 |
159 | # SageMath parsed files
160 | *.sage.py
161 |
162 | # Environments
163 | .env
164 | .venv
165 | env/
166 | venv/
167 | ENV/
168 | env.bak/
169 | venv.bak/
170 | pythonenv*
171 |
172 | # Spyder project settings
173 | .spyderproject
174 | .spyproject
175 |
176 | # Rope project settings
177 | .ropeproject
178 |
179 | # mkdocs documentation
180 | /site
181 |
182 | # mypy
183 | .mypy_cache/
184 | .dmypy.json
185 | dmypy.json
186 |
187 | # Pyre type checker
188 | .pyre/
189 |
190 | # pytype static type analyzer
191 | .pytype/
192 |
193 | # profiling data
194 | .prof
195 |
196 | # End of https://www.toptal.com/developers/gitignore/api/python,emacs
197 |
198 | ### Custom ###
199 | ## IDEA
200 | .vscode
201 | ## Emacs
202 | .persp-confs
203 | ## Makefile
204 | ## pdm
205 | .pdm.toml
206 | ## MacOS
207 | .DS_Store
208 | ## misc
209 | .dream2nix/
210 | .pdm-python
211 | pdm.toml
212 | result
213 | .envrc
214 | .direnv
215 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/commitizen-tools/commitizen
3 | rev: v3.27.0
4 | hooks:
5 | - id: commitizen
6 | stages:
7 | - commit-msg
8 | - repo: https://github.com/pre-commit/pre-commit-hooks
9 | rev: v4.6.0
10 | hooks:
11 | - id: check-symlinks
12 | - id: check-toml
13 | - id: check-yaml
14 | args: [--unsafe]
15 | - id: detect-private-key
16 | - id: end-of-file-fixer
17 | - id: trailing-whitespace
18 | - id: check-added-large-files
19 | - id: mixed-line-ending
20 | args: [--fix=lf]
21 | - repo: https://github.com/pre-commit/pygrep-hooks
22 | rev: v1.10.0
23 | hooks:
24 | - id: python-check-blanket-noqa
25 | - id: python-check-mock-methods
26 | - id: python-no-eval
27 | - id: python-no-log-warn
28 | - id: python-use-type-annotations
29 | - id: rst-backticks
30 | - repo: https://github.com/psf/black
31 | rev: 24.4.2
32 | hooks:
33 | - id: black
34 | - repo: https://github.com/asottile/blacken-docs
35 | rev: 1.16.0
36 | hooks:
37 | - id: blacken-docs
38 | additional_dependencies: [black==23.3.*]
39 | - repo: https://github.com/PyCQA/flake8
40 | rev: 7.0.0
41 | hooks:
42 | - id: flake8
43 | additional_dependencies: ["flake8-bugbear==23.5.*"]
44 | - repo: https://github.com/pre-commit/mirrors-mypy
45 | rev: v1.10.0
46 | hooks:
47 | - id: mypy
48 | files: data_extractor/.+\.py$
49 | pass_filenames: false
50 | entry: bash -c 'env PYTHONPATH=.:$PYTHONPATH mypy data_extractor --show-traceback'
51 | - repo: https://github.com/pre-commit/mirrors-isort
52 | rev: v5.10.1
53 | hooks:
54 | - id: isort
55 | - repo: https://github.com/PyCQA/doc8
56 | rev: v1.1.1
57 | hooks:
58 | - id: doc8
59 | - repo: local
60 | hooks:
61 | - id: build_readme
62 | name: build_readme
63 | description: Build README.rst
64 | entry: nox -s build_readme
65 | language: system
66 | pass_filenames: false
67 | types: [rst]
68 | - id: export_requirements_txt
69 | name: export_requirements_txt
70 | description: create requirement file for python
71 | entry: python3 scripts/export_requirements_txt.py
72 | language: system
73 | files: pdm.lock
74 | pass_filenames: false
75 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-22.04
5 | tools:
6 | python: "3.12"
7 |
8 | python:
9 | install:
10 | - requirements: ./scripts/requirements/requirements-docs.txt
11 | - path: .
12 |
13 | sphinx:
14 | builder: html
15 | configuration: docs/source/conf.py
16 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at linw1995@icloud.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 林玮
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Print a short usage summary for every supported target.
2 | help:
3 | 	@echo "PYTHON=X.Y init   setup development environment with specific Python version"
4 | 	@echo "init              setup development environment with default Python version 3.13"
5 | 	@echo "update-dev        update development dependencies via pdm and via pre-commit"
6 | 	@echo "update            update all dependencies via pdm and via pre-commit"
7 | 	@echo "pre-commit        setup git hooks"
8 | 	@echo "check-all         run code quality checkers"
9 | 	@echo "test              run quick tests"
10 | 	@echo "vtest             run quick tests with verbose"
11 | 	@echo "PYTHON=X.Y cov    run tests with coverage and with specific Python version"
12 | 	@echo "cov               run tests with coverage and with default Python version 3.13"
13 | 	@echo "test-mypy-plugin  run mypy plugin tests"
14 | 	@echo "type-check        run static type checking"
14 |
15 | EMPTY :=
16 | SPACE := $(EMPTY) $(EMPTY)
17 |
18 | PYTHON = 3.13
19 | EXTRAS = lxml cssselect jsonpath-extractor jsonpath-rw jsonpath-rw-ext
20 | DEV_EXTRAS = test test-mypy-plugin docs
21 | EXTRAS_ARGS = $(if $(EXTRAS),-G,) $(subst $(SPACE),$(SPACE)-G$(SPACE),$(EXTRAS))
22 | DEV_EXTRAS_ARGS = $(if $(DEV_EXTRAS),-G,) $(subst $(SPACE),$(SPACE)-G$(SPACE),$(DEV_EXTRAS))
23 |
24 | # Environment setup
25 | init:
26 | @echo ">> installing $(if $(EXTRAS),\"$(EXTRAS)\" ,)$(if $(DEV_EXTRAS),\"$(DEV_EXTRAS)\" ,)dependencies by pdm"
27 | $(if $(PYTHON),pdm use -f $(PYTHON),)
28 | pdm info && pdm info --env
29 | pdm install $(EXTRAS_ARGS) $(DEV_EXTRAS_ARGS)
30 | pdm config -l python.use_venv true
31 |
32 | deinit:
33 | rm -rf .nox
34 | rm -rf __pypackages__
35 | rm -rf .mypy_cache
36 | rm -rf htmlcov
37 | rm -rf .pytest_cache
38 | rm -rf *.egg-info
39 |
40 | update-dev:
41 | pdm update $(DEV_EXTRAS_ARGS) $(EXTRAS_ARGS)
42 | pre-commit autoupdate
43 |
44 | update:
45 | pdm update
46 | pre-commit autoupdate
47 |
48 | # Environment setup end
49 |
50 | pre-commit:
51 | pre-commit install --hook-type commit-msg --hook-type pre-commit --overwrite
52 |
53 | check-all:
54 | pre-commit run --all-files
55 |
56 | type-check:
57 | pre-commit run mypy
58 |
59 | test:
60 | pdm run pytest -q -x --ff --nf --ignore tests/typesafety
61 |
62 | vtest:
63 | pdm run pytest -vv -x --ff --nf --ignore tests/typesafety
64 |
65 | test-mypy-plugin:
66 | rm -rf .coverage
67 | nox -p $(PYTHON) -s test_mypy_plugin coverage_report -- $(TARGET)
68 |
69 | test-mypy-plugin-full:
70 | rm -rf .coverage
71 | nox -s test_mypy_plugin -- $(TARGET)
72 | nox -p 3.10 -s coverage_report
73 |
74 | cov:
75 | rm -rf .coverage
76 | nox -p $(PYTHON) -s coverage_test coverage_report -- $(TARGET)
77 |
78 | cov-full:
79 | rm -rf .coverage
80 | nox -s coverage_test -- $(TARGET)
81 | nox -p 3.10 -s coverage_report
82 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ==============
2 | Data Extractor
3 | ==============
4 |
5 | |license| |Pypi Status| |Python version| |Package version| |PyPI - Downloads|
6 | |GitHub last commit| |Code style: black| |Build Status| |codecov|
7 | |Documentation Status| |PDM managed|
8 |
9 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting.
10 |
11 | Quickstarts
12 | <<<<<<<<<<<
13 |
14 | Installation
15 | ~~~~~~~~~~~~
16 |
17 | Install the stable version from PYPI.
18 |
19 | .. code-block:: shell
20 |
21 | pip install "data-extractor[jsonpath-extractor]" # for extracting JSON data
22 | pip install "data-extractor[lxml]" # for extracting HTML data
23 |
24 | Or install the latest version from Github.
25 |
26 | .. code-block:: shell
27 |
28 | pip install "data-extractor[jsonpath-extractor] @ git+https://github.com/linw1995/data_extractor.git@master"
29 |
30 | Extract JSON data
31 | ~~~~~~~~~~~~~~~~~
32 |
33 | Currently supports to extract JSON data with below optional dependencies
34 |
35 | - jsonpath-extractor_
36 | - jsonpath-rw_
37 | - jsonpath-rw-ext_
38 |
39 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath
40 | .. _jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw
41 | .. _jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/
42 |
43 | install one dependency of them to extract JSON data.
44 |
45 | Extract HTML(XML) data
46 | ~~~~~~~~~~~~~~~~~~~~~~
47 |
48 | Currently supports to extract HTML(XML) data with below optional dependencies
49 |
50 | - lxml_ for using XPath_
51 | - cssselect_ for using CSS-Selectors_
52 |
53 | .. _lxml: https://lxml.de/
54 | .. _XPath: https://www.w3.org/TR/xpath-10/
55 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/
56 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/
57 |
58 | Usage
59 | ~~~~~
60 |
61 | .. code-block:: python3
62 |
63 | from data_extractor import Field, Item, JSONExtractor
64 |
65 |
66 | class Count(Item):
67 | followings = Field(JSONExtractor("countFollowings"))
68 | fans = Field(JSONExtractor("countFans"))
69 |
70 |
71 | class User(Item):
72 | name_ = Field(JSONExtractor("name"), name="name")
73 | age = Field(JSONExtractor("age"), default=17)
74 | count = Count()
75 |
76 |
77 | assert User(JSONExtractor("data.users[*]"), is_many=True).extract(
78 | {
79 | "data": {
80 | "users": [
81 | {
82 | "name": "john",
83 | "age": 19,
84 | "countFollowings": 14,
85 | "countFans": 212,
86 | },
87 | {
88 | "name": "jack",
89 | "description": "",
90 | "countFollowings": 54,
91 | "countFans": 312,
92 | },
93 | ]
94 | }
95 | }
96 | ) == [
97 | {"name": "john", "age": 19, "count": {"followings": 14, "fans": 212}},
98 | {"name": "jack", "age": 17, "count": {"followings": 54, "fans": 312}},
99 | ]
100 |
101 | Changelog
102 | <<<<<<<<<
103 |
104 | v1.0.1
105 | ~~~~~~
106 |
107 | **Build**
108 |
109 | - Supports Python 3.13
110 |
111 |
112 |
113 | Contributing
114 | <<<<<<<<<<<<
115 |
116 |
117 | Environment Setup
118 | ~~~~~~~~~~~~~~~~~
119 |
120 | Clone the source codes from Github.
121 |
122 | .. code-block:: shell
123 |
124 | git clone https://github.com/linw1995/data_extractor.git
125 | cd data_extractor
126 |
127 | Setup the development environment.
128 | Please make sure you install the pdm_,
129 | pre-commit_ and nox_ CLIs in your environment.
130 |
131 | .. code-block:: shell
132 |
133 | make init
134 |     make PYTHON=3.10 init  # for specific python version
135 |
136 | Linting
137 | ~~~~~~~
138 |
139 | Use pre-commit_ for installing linters to ensure a good code style.
140 |
141 | .. code-block:: shell
142 |
143 | make pre-commit
144 |
145 | Run linters. Some linters run via CLI nox_, so make sure you install it.
146 |
147 | .. code-block:: shell
148 |
149 | make check-all
150 |
151 | Testing
152 | ~~~~~~~
153 |
154 | Run quick tests.
155 |
156 | .. code-block:: shell
157 |
158 |     make test
159 |
160 | Run quick tests with verbose.
161 |
162 | .. code-block:: shell
163 |
164 | make vtest
165 |
166 | Run tests with coverage.
167 | Testing in multiple Python environments is powered by CLI nox_.
168 |
169 | .. code-block:: shell
170 |
171 | make cov
172 |
173 | .. _pdm: https://github.com/pdm-project/pdm
174 | .. _pre-commit: https://pre-commit.com/
175 | .. _nox: https://nox.thea.codes/en/stable/
176 |
177 | .. |license| image:: https://img.shields.io/github/license/linw1995/data_extractor.svg
178 | :target: https://github.com/linw1995/data_extractor/blob/master/LICENSE
179 |
180 | .. |Pypi Status| image:: https://img.shields.io/pypi/status/data_extractor.svg
181 | :target: https://pypi.org/project/data_extractor
182 |
183 | .. |Python version| image:: https://img.shields.io/pypi/pyversions/data_extractor.svg
184 | :target: https://pypi.org/project/data_extractor
185 |
186 | .. |Package version| image:: https://img.shields.io/pypi/v/data_extractor.svg
187 | :target: https://pypi.org/project/data_extractor
188 |
189 | .. |PyPI - Downloads| image:: https://img.shields.io/pypi/dm/data-extractor.svg
190 | :target: https://pypi.org/project/data_extractor
191 |
192 | .. |GitHub last commit| image:: https://img.shields.io/github/last-commit/linw1995/data_extractor.svg
193 | :target: https://github.com/linw1995/data_extractor
194 |
195 | .. |Code style: black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
196 | :target: https://github.com/ambv/black
197 |
198 | .. |Build Status| image:: https://github.com/linw1995/data_extractor/workflows/Lint&Test/badge.svg
199 | :target: https://github.com/linw1995/data_extractor/actions?query=workflow%3ALint%26Test
200 |
201 | .. |codecov| image:: https://codecov.io/gh/linw1995/data_extractor/branch/master/graph/badge.svg
202 | :target: https://codecov.io/gh/linw1995/data_extractor
203 |
204 | .. |Documentation Status| image:: https://readthedocs.org/projects/data-extractor/badge/?version=latest
205 | :target: https://data-extractor.readthedocs.io/en/latest/?badge=latest
206 |
207 | .. |PDM managed| image:: https://img.shields.io/badge/pdm-managed-blueviolet
208 | :target: https://pdm.fming.dev
209 |
--------------------------------------------------------------------------------
/README.template.rst:
--------------------------------------------------------------------------------
1 | .. include:: docs/source/readme.rst
2 |
--------------------------------------------------------------------------------
/data_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | :mod:`data_extractor`
3 | =====================
4 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting.
5 | """
6 |
7 | # Local Folder
8 | from .core import (
9 | AbstractComplexExtractor,
10 | AbstractExtractors,
11 | AbstractSimpleExtractor,
12 | ComplexExtractorMeta,
13 | )
14 | from .exceptions import ExprError, ExtractError
15 | from .item import RV, Convertor, Field, Item
16 | from .json import (
17 | JSONExtractor,
18 | JSONPathExtractor,
19 | JSONPathRWExtExtractor,
20 | JSONPathRWExtractor,
21 | )
22 | from .lxml import (
23 | AttrCSSExtractor,
24 | CSSExtractor,
25 | Element,
26 | TextCSSExtractor,
27 | XPathExtractor,
28 | )
29 | from .utils import (
30 | LazyStr,
31 | is_complex_extractor,
32 | is_extractor,
33 | is_simple_extractor,
34 | sentinel,
35 | )
36 |
37 | __all__ = (
38 | "AbstractComplexExtractor",
39 | "AbstractExtractors",
40 | "AbstractSimpleExtractor",
41 | "AttrCSSExtractor",
42 | "CSSExtractor",
43 | "ComplexExtractorMeta",
44 | "Convertor",
45 | "Element",
46 | "ExprError",
47 | "ExtractError",
48 | "Field",
49 | "Item",
50 | "JSONExtractor",
51 | "JSONPathExtractor",
52 | "JSONPathRWExtExtractor",
53 | "JSONPathRWExtractor",
54 | "LazyStr",
55 | "RV",
56 | "TextCSSExtractor",
57 | "XPathExtractor",
58 | "is_complex_extractor",
59 | "is_extractor",
60 | "is_simple_extractor",
61 | "sentinel",
62 | )
63 |
--------------------------------------------------------------------------------
/data_extractor/contrib/mypy/__init__.py:
--------------------------------------------------------------------------------
1 | # Standard Library
2 | import logging
3 |
4 | from functools import partial
5 | from typing import Callable, Dict, List, Optional, Type, Union
6 |
7 | # Third Party Library
8 | from mypy.checker import TypeChecker, is_true_literal
9 | from mypy.nodes import (
10 | AssignmentStmt,
11 | CallExpr,
12 | ClassDef,
13 | Expression,
14 | IndexExpr,
15 | MemberExpr,
16 | MypyFile,
17 | NameExpr,
18 | RefExpr,
19 | StrExpr,
20 | SymbolNode,
21 | TypeAlias,
22 | TypeInfo,
23 | Var,
24 | )
25 | from mypy.options import Options
26 | from mypy.plugin import (
27 | DynamicClassDefContext,
28 | FunctionContext,
29 | MethodSigContext,
30 | Plugin,
31 | )
32 | from mypy.semanal import SemanticAnalyzerInterface
33 | from mypy.semanal_typeddict import TypedDictAnalyzer
34 | from mypy.traverser import TraverserVisitor
35 | from mypy.types import AnyType, CallableType, FunctionLike, Instance
36 | from mypy.types import Type as MypyType
37 | from mypy.types import TypedDictType, TypeOfAny, TypeType, UninhabitedType, UnionType
38 |
39 | logger = logging.getLogger(__name__)
40 |
41 |
42 | class RelationshipVisitor(TraverserVisitor):
43 | relationships: Dict[str, List[str]]
44 |
45 | def __init__(self) -> None:
46 | self.relationships = {}
47 |
48 | def is_data_extractor_cls(self, obj: Optional[SymbolNode]) -> bool:
49 | return obj is not None and obj.fullname in (
50 | "data_extractor.item.Field",
51 | "data_extractor.item.Item",
52 | )
53 |
54 | def is_making_extractor_assignment_stmt(self, stmt: AssignmentStmt) -> bool:
55 | rvalue = stmt.rvalue
56 | if not isinstance(rvalue, CallExpr):
57 | return False
58 |
59 | node: Union[Expression, SymbolNode, MypyType] = rvalue.callee
60 | if isinstance(node, IndexExpr):
61 | logger.debug("node=%s", node)
62 | base = node.base
63 | assert base is not None
64 | node = base
65 |
66 | assert isinstance(node, RefExpr)
67 | logger.debug("node=%s", node)
68 | node_ = node.node
69 | if node_ is None:
70 | return False
71 | node = node_
72 |
73 | logger.debug("node=%r", node)
74 | if isinstance(node, Var):
75 | tt = node.type
76 | logger.debug("tt=%s", tt)
77 | if not isinstance(tt, TypeType):
78 | return False
79 | node = tt.item
80 |
81 | logger.debug("node=%r", node)
82 | if isinstance(node, TypeAlias):
83 | node = node.target
84 |
85 | logger.debug("node=%r", node)
86 | if isinstance(node, Instance):
87 | return node.type.has_base("data_extractor.item.Field")
88 |
89 | logger.debug("node=%r", node)
90 | if isinstance(node, TypeInfo):
91 | return self.is_data_extractor_cls(node)
92 |
93 | return False
94 |
95 | def locate_field_in_classdef(self, defn: ClassDef, name: str) -> str:
96 | for block in defn.defs.body:
97 | if not isinstance(block, AssignmentStmt):
98 | continue
99 |
100 | for lvalue in block.lvalues:
101 | assert isinstance(lvalue, NameExpr)
102 | if lvalue.name == name:
103 | assert block.type is not None
104 | return str((block.type.line, block.type.column))
105 | else: # pragma: no cover
106 | raise ValueError(f"Field name = {name!r} not exists in defn = {defn!s}")
107 |
    def anal_assignment_stmt(self, stmt: AssignmentStmt) -> None:
        """
        For an extractor-creating assignment, record in
        ``self.relationships`` the mapping from the rvalue's source
        location to the location(s) of the lvalue(s) it is bound to.
        """
        logger.debug("stmt=%s", stmt)
        if self.is_making_extractor_assignment_stmt(stmt):
            rvalue_loc = str((stmt.rvalue.line, stmt.rvalue.column))
            logger.debug("stmt=%s, rloc=%r", stmt, rvalue_loc)
            for lvalue in stmt.lvalues:
                lvalue_loc = ""
                logger.debug(f"lvalue = {lvalue!s}")
                assert isinstance(lvalue, RefExpr)
                if isinstance(lvalue, MemberExpr):
                    # e.g. ``Cls.attr = Field(...)``: locate the annotated
                    # field inside the class definition body
                    expr = lvalue.expr
                    assert isinstance(expr, NameExpr)
                    node = expr.node
                    if node is None:
                        return
                    assert isinstance(node, TypeInfo)
                    lvalue_loc = self.locate_field_in_classdef(node.defn, lvalue.name)
                elif isinstance(lvalue, NameExpr):
                    # plain name assignment: use the name's own location
                    node = lvalue.node
                    assert isinstance(node, SymbolNode)
                    lvalue_loc = str((node.line, node.column))

                if not lvalue_loc:  # pragma: no cover
                    logger.debug(f"n = {node!s}, stmt = {stmt!s}")
                    continue

                self.relationships.setdefault(rvalue_loc, []).append(lvalue_loc)
135 |
    def visit_assignment_stmt(self, o: AssignmentStmt) -> None:
        """Analyze the assignment, then continue the base-class traversal."""
        self.anal_assignment_stmt(o)
        super().visit_assignment_stmt(o)
139 |
140 |
class DataExtractorPlugin(Plugin):
    """
    Mypy plugin that tracks ``is_many`` for every ``Field``/``Item``
    creation site and narrows the return type of ``extract`` accordingly;
    for ``Item`` subclasses it also synthesizes a TypedDict from the
    declared fields.
    """

    # module fullname -> relationships produced by RelationshipVisitor
    cache: Dict[str, Dict[str, List[str]]]
    # Item subclass fullname -> synthesized TypedDict for its fields
    item_typeddict_mapping: Dict[str, TypedDictType]

    def __init__(self, options: Options) -> None:
        super().__init__(options)
        self.cache = {}
        self.item_typeddict_mapping = {}

    def get_current_code(self, ctx: FunctionContext) -> MypyFile:
        """Return the MypyFile of the module currently being type-checked."""
        api = ctx.api
        assert isinstance(api, TypeChecker)
        module_name = api.tscope.module
        assert module_name is not None
        return api.modules[module_name]

    def anal_code(self, code: MypyFile) -> Dict[str, List[str]]:
        """
        Run RelationshipVisitor over *code* once and cache the resulting
        rvalue-location -> lvalue-location mapping per module.
        """
        logger.debug(f"code.fullname = {code.fullname!r}, self.cache = {self.cache!r}")
        if code.fullname not in self.cache:
            try:
                visitor = RelationshipVisitor()
            except TypeError:  # pragma: no cover
                # Only supports versions that are bigger than 0.820
                return {}

            code.accept(visitor)
            self.cache[code.fullname] = visitor.relationships

        return self.cache[code.fullname]

    def check_field_generic_type(self, ctx: FunctionContext) -> MypyType:
        """
        When a Field is created without an explicit type parameter, fill
        the generic argument with ``Any`` (unless disallow_any_generics).
        """
        rv_type = ctx.default_return_type
        if self.options.disallow_any_generics:
            return rv_type

        self.anal_code(self.get_current_code(ctx))

        assert isinstance(rv_type, Instance)
        if rv_type.args and not isinstance(rv_type.args[0], UninhabitedType):
            return rv_type

        return self.apply_any_generic(type=rv_type)

    def apply_any_generic(self, type: Instance) -> Instance:
        """Return *type* with a single ``Any`` generic argument applied."""
        any_type = AnyType(TypeOfAny.special_form)
        args = [any_type]
        return type.copy_modified(args=args)

    def check_is_many(self, ctx: FunctionContext) -> bool:
        """Return True when the call passes a literal ``is_many=True``."""
        is_many_idx = ctx.callee_arg_names.index("is_many")
        is_many_exprs = ctx.args[is_many_idx]
        if is_many_exprs:
            return is_true_literal(is_many_exprs[0])

        return False

    def prepare_type_annotations(self, ctx: FunctionContext, fullname: str) -> MypyType:
        """
        Record the call site's ``is_many`` flag in the extractor class
        metadata (keyed by source location) so that the later
        ``extract`` signature hook can narrow the return type.
        """
        logger.debug("fullname=%r", fullname)

        # check parameter "is_many"
        expr = ctx.context
        assert isinstance(expr, CallExpr)

        callee = expr.callee
        if isinstance(callee, IndexExpr):
            callee = callee.base
        assert isinstance(callee, NameExpr)

        sym_field_class: Union[MypyType, SymbolNode, None] = callee.node
        if isinstance(sym_field_class, TypeAlias):
            sym_field_class = sym_field_class.target
        elif isinstance(sym_field_class, Var):
            typetype = sym_field_class.type
            assert isinstance(typetype, TypeType)
            sym_field_class = typetype.item

        if isinstance(sym_field_class, Instance):
            sym_field_class = sym_field_class.type

        assert isinstance(sym_field_class, TypeInfo)
        relationship = self.anal_code(self.get_current_code(ctx))
        lvalue_key = str((expr.line, expr.column))
        keys = [lvalue_key]
        if lvalue_key in relationship:
            keys.extend(relationship[lvalue_key])

        for key in keys:
            logger.debug(
                f"lvalue_key = {lvalue_key!r}, "
                f"key = {key!r}, relationship = {relationship!r}"
            )

            if self.check_is_many(ctx):
                sym_field_class.metadata[key] = {"is_many": True}
            else:
                sym_field_class.metadata[key] = {"is_many": False}

        rv_type = self.check_field_generic_type(ctx)
        return rv_type

    def is_extractor_cls(self, fullname: str, is_item_subcls: bool = False) -> bool:
        """
        Return True when *fullname* names a subclass of
        ``data_extractor.item.Field`` (or of ``Item`` when
        *is_item_subcls* is set).
        """
        node = self.lookup_fully_qualified(fullname)
        if node is not None:
            typenode = node.node
            if isinstance(typenode, TypeInfo):
                if is_item_subcls:
                    return typenode.has_base("data_extractor.item.Item")
                else:
                    return typenode.has_base("data_extractor.item.Field")

        return False

    def get_function_hook(
        self, fullname: str
    ) -> Optional[Callable[[FunctionContext], MypyType]]:
        """Hook every extractor-class constructor call."""
        logger.debug("fullname=%r", fullname)
        if self.is_extractor_cls(fullname):
            return partial(self.prepare_type_annotations, fullname=fullname)

        return super().get_function_hook(fullname)

    def apply_is_many_on_extract_method(
        self, ctx: MethodSigContext, fullname: str
    ) -> CallableType:
        """
        Narrow ``extract``'s Union return type to the single member
        selected by the ``is_many`` flag recorded for this call site.
        """
        origin: CallableType = ctx.default_signature
        origin_ret_type = origin.ret_type
        assert isinstance(origin_ret_type, UnionType)

        self_class = ctx.type
        assert isinstance(self_class, Instance)
        metadata = self_class.type.metadata

        # in case of stmt `Field().extract(...)`
        key = str((ctx.type.line, ctx.type.column))
        if key not in metadata:
            # fall back to the location of the variable the extractor
            # instance was assigned to, e.g. ``f = Field(); f.extract(...)``
            expr = ctx.context
            assert isinstance(expr, CallExpr)
            callee = expr.callee
            assert isinstance(callee, MemberExpr)
            callee_expr = callee.expr
            assert isinstance(callee_expr, NameExpr)
            obj = callee_expr.node
            assert isinstance(obj, Var)
            key = str((obj.line, obj.column))

        logger.debug("fullname=%r, key=%r, metadata=%r", fullname, key, metadata)
        if key in metadata:
            # Union is ordered so index 0 is the single-value type and
            # index 1 the list type; pick by the boolean flag
            is_many = metadata[key]["is_many"]
            ret_type = origin_ret_type.items[int(is_many)]
            return origin.copy_modified(ret_type=ret_type)
        else:
            api = ctx.api
            assert isinstance(api, TypeChecker)
            api.fail("Cant determine extract method return type", context=ctx.context)
            return origin

    def is_extract_method(self, fullname: str) -> bool:
        """Return True when *fullname* is ``<extractor class>.extract``."""
        suffix = ".extract"
        if fullname.endswith(suffix):
            return self.is_extractor_cls(fullname[: -len(suffix)])
        return False

    def apply_extract_method(
        self, ctx: MethodSigContext, fullname: str
    ) -> CallableType:
        """
        Apply ``is_many`` narrowing, then substitute the synthesized
        TypedDict for the return type of an Item's ``extract``.
        """
        rv = self.apply_is_many_on_extract_method(ctx, fullname)

        # apply item typeddict
        item_classname = fullname[: -len(".extract")]
        if item_classname in self.item_typeddict_mapping:
            logger.debug("fullname=%r, ret_type=%r", fullname, rv.ret_type)
            original = rv.ret_type
            typeddict = self.item_typeddict_mapping[item_classname]
            ret_type: Optional[MypyType]
            if isinstance(original, AnyType):  # is_many=False
                rv = rv.copy_modified(ret_type=typeddict)
            else:
                assert isinstance(original, Instance)
                if original.type.name == "list":  # is_many=True
                    ret_type = original
                    ret_type.args = (typeddict,)
                    rv = rv.copy_modified(ret_type=ret_type)
                else:  # pragma: no cover
                    api = ctx.api
                    assert isinstance(api, TypeChecker)
                    api.fail(
                        "Cant determine extract method return type", context=ctx.context
                    )
                    ret_type = None

            logger.debug(
                "fullname=%r, rv=%r, item_typeddict_mapping=%r",
                fullname,
                rv,
                self.item_typeddict_mapping,
            )
        return rv

    def get_method_signature_hook(
        self, fullname: str
    ) -> Optional[Callable[[MethodSigContext], FunctionLike]]:
        """Hook ``extract`` method calls on extractor classes."""
        if self.is_extract_method(fullname):
            return partial(self.apply_extract_method, fullname=fullname)
        return super().get_method_signature_hook(fullname)

    def get_name_arg(self, call: CallExpr) -> str:
        """Return the literal ``name=`` keyword argument of *call*, or ""."""
        name = ""
        try:
            idx = call.arg_names.index("name")
            arg = call.args[idx]
            assert isinstance(arg, StrExpr)
            name = arg.value
        except ValueError:
            # no ``name`` keyword present
            pass
        return name

    def prepare_typeddict(self, ctx: DynamicClassDefContext, fullname: str) -> None:
        """
        Build a TypedDict from the Item subclass's field assignments and
        memoize it in ``item_typeddict_mapping``.
        """
        logger.debug("fullname=%r", fullname)
        if fullname in self.item_typeddict_mapping:
            return

        api = ctx.api
        assert isinstance(api, SemanticAnalyzerInterface)
        analyzer = TypedDictAnalyzer(api.options, api, api.msg)  # type: ignore

        items: List[str] = []
        types: List[MypyType] = []
        callee = ctx.call.callee
        assert isinstance(callee, NameExpr)
        node = callee.node
        assert isinstance(node, TypeInfo)
        for block in node.defn.defs.body:
            if not isinstance(block, AssignmentStmt):
                continue

            rvalue = block.rvalue
            if not isinstance(rvalue, CallExpr):
                continue

            param_name = self.get_name_arg(rvalue)
            logger.debug("param_name = %r from rvalue = %s", param_name, rvalue)

            rvalue_type: MypyType
            callee = rvalue.callee
            if isinstance(callee, IndexExpr):
                # subscripted field like ``Field[int](...)``: use the
                # subscript as the TypedDict value type
                index = callee.index
                assert isinstance(index, NameExpr)
                name = index.fullname
                assert name is not None
                named_type = api.named_type_or_none(name, [])
                assert named_type is not None
                rvalue_type = named_type
            else:
                rvalue_type = AnyType(TypeOfAny.special_form)

            if param_name:
                # explicit ``name=`` overrides the attribute name
                items.append(param_name)
                types.append(rvalue_type)
            else:
                for lvalue in block.lvalues:
                    assert isinstance(lvalue, NameExpr)
                    items.append(lvalue.name)
                    types.append(rvalue_type)

        callee = ctx.call.callee
        assert isinstance(callee, NameExpr)
        typeinfo = analyzer.build_typeddict_typeinfo(
            callee.name,
            items,
            types,
            set(items),
            -1,
            None,
        )
        assert typeinfo.typeddict_type is not None
        self.item_typeddict_mapping[fullname] = typeinfo.typeddict_type
        logger.debug(
            "fullname=%r, item_typeddict_mapping=%r",
            fullname,
            self.item_typeddict_mapping,
        )

    def get_dynamic_class_hook(
        self, fullname: str
    ) -> Optional[Callable[[DynamicClassDefContext], None]]:
        """Hook dynamic class creation from Item subclasses."""
        logger.debug("fullname=%r", fullname)
        if self.is_extractor_cls(fullname, is_item_subcls=True):
            return partial(self.prepare_typeddict, fullname=fullname)

        return super().get_dynamic_class_hook(fullname)
431 |
432 |
def plugin(version: str) -> Type[Plugin]:
    """Mypy plugin entry point; *version* is the running mypy version."""
    return DataExtractorPlugin
435 |
--------------------------------------------------------------------------------
/data_extractor/core.py:
--------------------------------------------------------------------------------
1 | """
2 | =====================================
3 | :mod:`core` -- Abstract Base Classes.
4 | =====================================
5 | """
6 |
7 | # Standard Library
8 | import ast
9 | import inspect
10 |
11 | from abc import abstractmethod
12 | from collections import namedtuple
13 | from types import FrameType, FunctionType, MethodType
14 | from typing import Any, Dict, Optional, Tuple, Union
15 |
16 | # Local Folder
17 | from .utils import Property, getframe, sentinel
18 |
19 | _LineInfo = namedtuple("_LineInfo", ["file", "lineno", "offset", "line"])
20 |
21 |
def _find_line_info_of_attr_in_source(
    frame: Optional[FrameType], key: str, attr: "AbstractComplexExtractor"
) -> _LineInfo:
    """
    Locate the source line that assigns attribute *key* inside the class
    body currently executing in *frame*.

    Returns a ``_LineInfo``; file/lineno/offset are ``None`` when the
    source is unavailable (e.g. defined in a REPL).
    """
    if frame is None:
        return _LineInfo(None, None, None, f"{key}={attr!r}")

    file = frame.f_code.co_filename
    firstlineno = frame.f_lineno
    firstline_idx = firstlineno - 1
    try:
        lines, _ = inspect.findsource(frame)
    except OSError:
        # can't get the source code from python repl
        return _LineInfo(None, None, None, f"{key}={attr!r}")

    start_index = inspect.indentsize(lines[firstline_idx])
    for lineno, line in enumerate(lines[firstline_idx + 1 :], start=firstlineno + 1):
        # iterate line in the code block body
        cur_index = inspect.indentsize(line)
        if cur_index <= start_index:
            # reach end of the code block,
            # use code block firstlineno as SyntaxError.lineno
            line = lines[firstline_idx]
            lineno = firstlineno
            break

        if line.lstrip().startswith(key):
            # find the line as SyntaxError.text
            break

    else:
        # reach EOF,
        # use code block firstlineno as SyntaxError.lineno
        line = lines[firstline_idx]
        lineno = firstlineno

    offset = inspect.indentsize(line)
    line = line.strip()
    return _LineInfo(file, lineno, offset, line)
61 |
62 |
def _check_field_overwrites_bases_property(
    cls: object,
    name: str,
    bases: Tuple[object],
    key: str,
    attr: "AbstractComplexExtractor",
) -> None:
    """
    Raise SyntaxError when extractor attribute *key* on class *name*
    shadows a ``Property`` (or the reserved ``_field_names``) inherited
    from the last base class.
    """
    attr_from_bases = getattr(bases[-1], key, None)
    if isinstance(attr_from_bases, Property) or key == "_field_names":
        # Item's attribute overwrites its property.
        frame = getframe(2)
        exc_args = _find_line_info_of_attr_in_source(frame, key, attr)
        *_, line = exc_args
        # message fixes: "overwriten" -> "overwrites", "Please using" ->
        # "Please use", and {name!r} for consistency with the method check
        err_msg = (
            f"{line!r} overwrites "
            f"the property {key!r} of {name!r}. "
            f"Please use the optional parameter name={key!r} "
            f"in {attr!r} to avoid overwriting property."
        )
        raise SyntaxError(err_msg, exc_args)
83 |
84 |
def _check_field_overwrites_bases_method(
    cls: object,
    name: str,
    bases: Tuple[object],
    key: str,
    attr: "AbstractComplexExtractor",
) -> None:
    """
    Raise SyntaxError when extractor attribute *key* on class *name*
    shadows a function or method inherited from the last base class.
    """
    attr_from_bases = getattr(bases[-1], key, None)
    if isinstance(attr_from_bases, (FunctionType, MethodType)):
        # Item's attribute overwrites its class bases' method.
        frame = getframe(2)
        exc_args = _find_line_info_of_attr_in_source(frame, key, attr)
        *_, line = exc_args
        # message fixes: "overwriten" -> "overwrites", "Please using" ->
        # "Please use"
        err_msg = (
            f"{line!r} overwrites "
            f"the method {key!r} of {name!r}. "
            f"Please use the optional parameter name={key!r} "
            f"in {attr!r} to avoid overwriting method."
        )
        raise SyntaxError(err_msg, exc_args)
105 |
106 |
def _check_field_overwrites_method(cls: object) -> None:
    """
    Raise SyntaxError when, inside the class body being created, the same
    name is bound both by an assignment (a field) and by a ``def``
    (a method). Silently returns when the source is unavailable or the
    class was created dynamically via ``type(...)``.
    """
    frame = getframe(2)
    if frame is None:
        return

    filename = frame.f_code.co_filename
    firstlineno = frame.f_lineno
    try:
        lines, _ = inspect.findsource(frame)
    except OSError:
        # can't get the source code from python repl
        return

    source = "".join(lines)
    mod = ast.parse(source)
    for node in ast.walk(mod):
        if isinstance(node, (ast.ClassDef, ast.Call)) and node.lineno == firstlineno:
            item_node = node
            break
    else:  # pragma: no cover
        assert 0, f"Can't find the source of {cls}."

    if isinstance(item_node, ast.Call):
        # There is no point to check if field overwrites method,
        # due to item is created by `type` function.
        return

    # map each bound name to the last assignment / function def using it
    assigns: Dict[str, ast.Assign] = {}
    methods: Dict[str, ast.FunctionDef] = {}
    for node in item_node.body:
        if isinstance(node, ast.Assign):
            for target_ in node.targets:
                if not isinstance(target_, ast.Name):
                    continue

                assigns[target_.id] = node
        elif isinstance(node, ast.FunctionDef):
            methods[node.name] = node

    # names bound both by an assignment and by a method definition
    # (renamed from the misleading "unions": this is an intersection)
    overlapped = assigns.keys() & methods.keys()
    if not overlapped:
        return

    key = next(iter(overlapped))
    assign = assigns[key]
    method = methods[key]
    if assign.lineno > method.lineno:
        # the assignment appears later, so it shadows the method
        lineno = assign.lineno
        offset = assign.col_offset
        line = lines[lineno - 1].strip()

        msg = (
            f"method {lines[method.lineno - 1].strip()!r} "
            f"on lineno={method.lineno} "
            f"overwritten by assign {line!r}. "
            f"Please use the optional parameter name={key!r} "
            f"in {line!r} to avoid overwriting."
        )
    else:
        # the method appears later, so it shadows the assignment
        lineno = method.lineno
        offset = method.col_offset
        line = lines[lineno - 1].strip()
        msg = (
            f"assign {lines[assign.lineno - 1].strip()!r} "
            f"on lineno={assign.lineno} "
            f"overwritten by method {line!r}. "
            f"Please use the optional parameter name={key!r} "
            f"in {lines[assign.lineno - 1].strip()!r} to avoid overwriting."
        )

    raise SyntaxError(msg, (filename, lineno, offset, line))
178 |
179 |
class SimpleExtractorMeta(type):
    """
    Simple Extractor Meta Class.

    Plain ``type`` subclass; extractor kinds are distinguished by
    metaclass checks (see ``ComplexExtractorMeta``).
    """
184 |
185 |
class ComplexExtractorMeta(SimpleExtractorMeta):
    """
    Complex Extractor Meta Class.

    At class-creation time: rejects extractor attributes that shadow
    inherited methods/properties, rejects a field and a method sharing a
    name, and collects extractor attribute names into ``_field_names``.
    """

    def __init__(
        cls,  # noqa: B902
        name: str,
        bases: Tuple[type],
        attr_dict: Dict[str, Any],
    ):
        super().__init__(name, bases, attr_dict)

        field_names = set()
        for key, attr in attr_dict.items():
            if isinstance(type(attr), ComplexExtractorMeta):
                # can't use data_extractor.utils.is_complex_extractor here,
                # because AbstractComplexExtractor which being used in it
                # bases on ComplexExtractorMeta.
                _check_field_overwrites_bases_method(cls, name, bases, key, attr)
                _check_field_overwrites_bases_property(cls, name, bases, key, attr)

                field_names.add(key)

        # check field overwrites method
        _check_field_overwrites_method(cls)

        # merge inherited field names, then drop any that are no longer
        # complex extractors on this class
        field_names |= set(getattr(cls, "_field_names", []))
        for key in field_names.copy():
            attr = getattr(cls, key, None)
            if not attr or not isinstance(type(attr), ComplexExtractorMeta):
                field_names.remove(key)

        cls._field_names: Tuple[str, ...] = tuple(field_names)
220 |
221 |
class AbstractSimpleExtractor(metaclass=SimpleExtractorMeta):
    """
    Abstract Simple Extractor Class.

    Its metaclass is :class:`data_extractor.core.SimpleExtractorMeta`

    :param expr: Extractor selector expression.
    :type expr: str
    """

    # descriptor holding the selector expression
    expr = Property[str]()

    def __init__(self, expr: str):
        self.expr = expr

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.expr!r})"

    @abstractmethod
    def extract(self, element: Any) -> Any:
        """
        Extract data or subelement from element.

        :param element: The target data node element.
        :type element: Any

        :returns: Data or subelement.
        :rtype: Any

        :raises ~data_extractor.exceptions.ExprError: Extractor Expression Error.
        """
        raise NotImplementedError

    def extract_first(self, element: Any, default: Any = sentinel) -> Any:
        """
        Extract the first data or subelement from `extract` method call result.

        :param element: The target data node element.
        :type element: Any
        :param default: Default value when not found. \
            Default: :data:`data_extractor.utils.sentinel`.
        :type default: Any, optional

        :returns: Data or subelement.
        :rtype: Any

        :raises ~data_extractor.exceptions.ExtractError: \
            Thrown by extractor extracting wrong data.
        """
        rv = self.extract(element)
        if not rv:
            # nothing extracted: raise unless a default was supplied
            if default is sentinel:
                # Local Folder
                from .exceptions import ExtractError

                raise ExtractError(self, element)

            return default

        return rv[0]
282 |
283 |
class AbstractComplexExtractor(metaclass=ComplexExtractorMeta):
    """
    Abstract Complex Extractor Class.

    Its metaclass is :class:`data_extractor.core.ComplexExtractorMeta`
    """

    @abstractmethod
    def extract(self, element: Any) -> Any:
        """
        Extract the wanted data.

        :param element: The target data node element.
        :type element: Any

        :returns: Data or subelement.
        :rtype: Any

        :raises ~data_extractor.exceptions.ExtractError: \
            Thrown by extractor extracting wrong data.
        """
        raise NotImplementedError
306 |
307 |
# Union of both extractor families, for APIs that accept either kind.
AbstractExtractors = Union[AbstractSimpleExtractor, AbstractComplexExtractor]

# Public API of this module.
__all__ = (
    "AbstractComplexExtractor",
    "AbstractExtractors",
    "AbstractSimpleExtractor",
    "ComplexExtractorMeta",
    "SimpleExtractorMeta",
)
317 |
--------------------------------------------------------------------------------
/data_extractor/exceptions.py:
--------------------------------------------------------------------------------
1 | """
2 | ===========================================
3 | :mod:`exceptions` -- Extracting Exceptions.
4 | ===========================================
5 | """
6 |
7 | # Standard Library
8 | import reprlib
9 |
10 | from typing import Any
11 |
12 | # Local Folder
13 | from .core import AbstractExtractors, AbstractSimpleExtractor
14 | from .utils import LazyStr
15 |
16 |
class ExprError(Exception):
    """
    Invalid Expr.

    :param extractor: The object for data extracting.
    :type extractor: :class:`data_extractor.core.AbstractSimpleExtractor`
    :param exc: The actual exception is thrown when extracting.
    :type exc: Exception
    """

    def __init__(self, extractor: AbstractSimpleExtractor, exc: Exception):
        # keep both the failing extractor and the root-cause exception
        # so callers can inspect either
        self.exc = exc
        self.extractor = extractor

    def __str__(self) -> str:
        return (
            f"ExprError with {self.exc!r} "
            f"raised by {self.extractor!r} extracting"
        )

    def __repr__(self) -> str:
        classname = self.__class__.__name__
        return f"{classname}({self.extractor!r}, exc={self.exc!r})"
36 |
37 |
class ExtractError(Exception):
    """
    Thrown by extractor extracting wrong data.

    :param extractor: The object for data extracting.
    :type extractor: :class:`data_extractor.core.AbstractSimpleExtractor`, \
        :class:`data_extractor.core.AbstractComplexExtractor`
    :param element: The target data node element.
    :type element: Any
    """

    def __init__(self, extractor: AbstractExtractors, element: Any):
        # LazyStr defers building the (potentially large) trace message
        # until the exception text is actually rendered
        super().__init__(LazyStr(func=lambda: self._trace_repr))
        self.element = element
        # extractors is appended to as the error bubbles up (see _append)
        self.extractors = [extractor]

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}"
            f"({self.extractors[0]!r}, element={reprlib.repr(self.element)})"
        )

    def _append(self, extractor: AbstractExtractors) -> None:
        # record an enclosing extractor while the error propagates upward
        self.extractors.append(extractor)

    @property
    def _trace_repr(self) -> str:
        # render the extractor chain outermost-first, indented one step
        # per level, ending with the offending element
        return f"{self.__repr__()}\n" + "\n".join(
            " " * idx + "|-" + repr(extractor)
            for idx, extractor in enumerate([*self.extractors[::-1], self.element])
        )
69 |
70 |
71 | __all__ = ("ExprError", "ExtractError")
72 |
--------------------------------------------------------------------------------
/data_extractor/item.py:
--------------------------------------------------------------------------------
1 | """
2 | =====================================================
3 | :mod:`item` -- Complex Extractor for data extracting.
4 | =====================================================
5 | """
6 |
7 | # Standard Library
8 | import copy
9 |
10 | from typing import (
11 | Any,
12 | Callable,
13 | Dict,
14 | Generic,
15 | Iterator,
16 | List,
17 | Optional,
18 | Type,
19 | TypeVar,
20 | Union,
21 | )
22 |
23 | # Local Folder
24 | from .core import AbstractComplexExtractor, AbstractSimpleExtractor
25 | from .exceptions import ExtractError
26 | from .utils import Property, is_simple_extractor, sentinel
27 |
# Type variable for the value type a Field extracts / converts to.
RV = TypeVar("RV")
# A convertor turns a raw extracted value into an RV instance.
Convertor = Callable[[Any], RV]
30 |
31 |
class Field(Generic[RV], AbstractComplexExtractor):
    """
    Extract data by cooperating with extractor.

    :param extractor: The object for data extracting
    :type extractor: :class:`data_extractor.core.AbstractSimpleExtractor`
    :param name: Optional parameter for special field name.
    :type name: str, optional
    :param default: Default value when not found. \
        Default: :data:`data_extractor.utils.sentinel`.
    :type default: Any
    :param is_many: Indicate the data which extractor extracting is more than one.
    :type is_many: bool

    :raises ValueError: Invalid SimpleExtractor.
    :raises ValueError: Can't both set default and is_many=True.
    """

    # descriptor-backed configuration attributes
    extractor = Property[Optional[AbstractSimpleExtractor]]()
    name = Property[Optional[str]]()
    default = Property[Any]()
    is_many = Property[bool]()

    type = Property[Optional[Type[RV]]]()
    convertor = Property[Optional[Convertor[RV]]]()

    def __init__(
        self,
        extractor: Optional[AbstractSimpleExtractor] = None,
        name: Optional[str] = None,
        default: Any = sentinel,
        is_many: bool = False,
        type: Optional[Type[RV]] = None,
        convertor: Optional[Convertor[RV]] = None,
    ):
        super().__init__()

        if extractor is not None and not is_simple_extractor(extractor):
            raise ValueError(f"Invalid SimpleExtractor: {extractor!r}")

        # a default only makes sense when a single value is extracted
        if default is not sentinel and is_many:
            raise ValueError(f"Can't both set default={default} and is_many=True")

        self.extractor = extractor
        self.name = name
        self.default = default
        self.is_many = is_many
        self.type = type
        self.convertor = convertor

    def __class_getitem__(cls, rv_type: Type[RV]):
        # ``Field[X]`` returns a subclass whose __init__ defaults the
        # ``type`` parameter to X
        def new_init(
            self,
            extractor: Optional[AbstractSimpleExtractor] = None,
            name: Optional[str] = None,
            default: Any = sentinel,
            is_many: bool = False,
            type: Optional[Type[RV]] = None,
            convertor: Optional[Convertor[RV]] = None,
        ):
            cls.__init__(
                self,
                extractor=extractor,
                name=name,
                default=default,
                is_many=is_many,
                type=type or rv_type,
                convertor=convertor,
            )

        if rv_type is RV:  # type: ignore
            # it is a type-unbound container class
            return cls
        else:
            # ``type`` here is the builtin, not the class attribute above
            return type(cls.__name__, (cls,), {"__init__": new_init})

    def __repr__(self) -> str:
        args = [f"{self.extractor!r}"]
        if self.name is not None:
            args.append(f"name={self.name!r}")

        if self.default is not sentinel:
            args.append(f"default={self.default!r}")

        if self.is_many:
            args.append(f"is_many={self.is_many!r}")

        return f"{self.__class__.__name__}({', '.join(args)})"

    def extract(self, element: Any) -> Union[RV, List[RV]]:
        """
        Extract value(s) from *element*; a list when ``is_many`` is set,
        otherwise a single converted value or ``default``.
        """
        if self.extractor is None:
            # no inner extractor: operate on the element itself
            if isinstance(element, list):
                rv = element
            else:
                rv = [element]
        else:
            rv = self.extractor.extract(element)

        if self.is_many:
            return [self._extract(r) for r in rv]

        if not rv:
            if self.default is sentinel:
                raise ExtractError(self, element)

            return self.default

        return self._extract(rv[0])

    def _extract(self, element: Any) -> RV:
        # convert a single raw value via convertor, then type, else as-is
        if self.convertor is not None:
            return self.convertor(element)
        else:
            cls = self.type
            if cls is not None and callable(cls):
                # TODO: inspect function signature for supporting better conversion
                return cls(element)  # type: ignore
            else:
                return element

    def __deepcopy__(self, memo: Dict[int, Any]) -> AbstractComplexExtractor:
        # temporarily hide __deepcopy__ so copy.deepcopy uses the default
        # protocol for everything else
        deepcopy_method = self.__deepcopy__
        self.__deepcopy__ = None  # type: ignore
        cp = copy.deepcopy(self, memo)
        self.__deepcopy__ = deepcopy_method  # type: ignore

        # avoid duplicating the sentinel object.
        if self.default is sentinel:
            Property.change_internal_value(cp, "default", sentinel)

        return cp
163 |
164 |
class Item(Field[RV]):
    """
    Extract data by cooperating with extractors, fields and items.
    """

    def __init__(
        self,
        extractor=None,
        name=None,
        default=sentinel,
        is_many=False,
        type=None,
        convertor=None,
    ):
        super().__init__(
            extractor=extractor,
            name=name,
            default=default,
            is_many=is_many,
            # fall back to building the result from the field mapping
            convertor=convertor or self.default_convertor,
            type=type,
        )

    def default_convertor(self, rv: Dict[str, Any]) -> RV:
        """Build the typed result from the extracted field mapping."""
        cls = self.type
        if cls is not None and callable(cls):
            # TODO: inspect function signature for supporting better conversion
            return cls(**rv)  # type: ignore

        return rv  # type: ignore

    def _extract(self, element: Any) -> RV:
        # extract every declared field into a dict, honoring per-field
        # ``name`` overrides; extend the error trace on failure
        rv = {}
        for field in self.field_names():
            try:
                extractor = getattr(self, field)
                if extractor.name is not None:
                    field = extractor.name

                rv[field] = extractor.extract(element)
            except ExtractError as exc:
                exc._append(extractor=self)
                raise exc

        return super()._extract(rv)

    @classmethod
    def field_names(cls) -> Iterator[str]:
        """
        Iterate all `Item` or `Field` type attributes' name.
        """
        yield from cls._field_names

    def simplify(self) -> AbstractSimpleExtractor:
        """
        Create an extractor that has compatible API like SimpleExtractor's.

        :returns: A simple extractor.
        :rtype: :class:`data_extractor.core.AbstractSimpleExtractor`
        """
        # duplication seems to be useless due to the properties of Item
        # being unchangeable, but it may need to change the is_many
        # property of Item.
        duplicated = copy.deepcopy(self)
        # set for fixing in SimpleExtractor.extract method signature
        Property.change_internal_value(duplicated, "is_many", True)

        def extract(self: AbstractSimpleExtractor, element: Any) -> List[RV]:
            return duplicated.extract(element)  # type: ignore

        def getter(self: AbstractSimpleExtractor, name: str) -> Any:
            # delegate attribute access to the wrapped inner extractor,
            # except the extract methods and dunders
            if (
                name not in ("extract", "extract_first")
                and not name.startswith("__")
                and hasattr(duplicated.extractor, name)
            ):
                return getattr(duplicated.extractor, name)
            return super(type(self), self).__getattribute__(name)

        classname = f"{type(duplicated).__name__}Simplified"
        base = AbstractSimpleExtractor
        if duplicated.extractor is not None:
            base = type(duplicated.extractor)

        new_cls = type(
            classname,
            (base,),
            {
                "extract": extract,
                "__getattribute__": getter,
            },
        )
        # wrapper class no needs for initialization
        obj: AbstractSimpleExtractor = base.__new__(new_cls)
        if not hasattr(obj, "expr"):
            # handle case of Item with extractor=None.
            # and its expr property will raise AttributeError,
            # so hasattr return False
            obj.expr = ""  # set to avoid class.__repr__ raising AttributeError

        return obj
265 |
266 |
267 | __all__ = ("Field", "Item", "RV", "Convertor")
268 |
--------------------------------------------------------------------------------
/data_extractor/json.py:
--------------------------------------------------------------------------------
1 | """
2 | ===================================================
3 | :mod:`json` -- Extractors for JSON data extracting.
4 | ===================================================
5 | """
6 |
7 | # Standard Library
8 | from typing import TYPE_CHECKING, Any, Optional, Type
9 |
10 | # Local Folder
11 | from .core import AbstractSimpleExtractor
12 | from .exceptions import ExprError
13 | from .utils import Property, _missing_dependency
14 |
15 |
class JSONExtractor(AbstractSimpleExtractor):
    """
    Use JSONPath expression implementated by **jsonpath-extractor**,
    **jsonpath-rw** or **jsonpath-rw-ext** packages for JSON data extracting.
    Change **json_extractor_backend** value to indicate which package to use.

    >>> import data_extractor.json
    >>> from data_extractor.json import JSONPathExtractor
    >>> data_extractor.json.json_extractor_backend = JSONPathExtractor

    Before extracting, should parse the JSON text into Python object.

    :param expr: JSONPath Expression.
    :type expr: str
    """

    def __new__(
        cls: Type["JSONExtractor"], *args: Any, **kwargs: Any
    ) -> "JSONExtractor":
        # json_extractor_backend is a module-level setting (assigned
        # elsewhere in this module); None means no backend is installed
        if json_extractor_backend is None:
            raise RuntimeError(
                "'jsonpath-extractor', 'jsonpath-rw' or 'jsonpath-rw-ext' "
                "package is needed, run pip to install it. "
            )

        obj: JSONExtractor
        if cls is JSONExtractor:
            # invoke the json extractor backend for object creation
            # TODO: cache renamed type
            obj = super(AbstractSimpleExtractor, cls).__new__(
                type(
                    "JSONExtractor", (json_extractor_backend,), {}
                )  # rename into JSONExtractor
            )
        else:
            # invoke subclasses directly
            obj = super(AbstractSimpleExtractor, cls).__new__(cls)

        return obj

    def extract(self, element: Any) -> Any:
        # concrete behavior is provided by the selected backend subclass
        raise NotImplementedError
58 |
59 |
60 | try:
61 | # Third Party Library
62 | import jsonpath_rw
63 |
64 | _missing_jsonpath_rw = False
65 | except ImportError:
66 | _missing_jsonpath_rw = True
67 |
68 |
69 | class JSONPathRWExtractor(JSONExtractor):
70 | """
71 | Use JSONPath expression implementated by **jsonpath-rw** package
72 | for JSON data extracting.
73 |
74 | Before extracting, should parse the JSON text into Python object.
75 |
76 | :param expr: JSONPath Expression.
77 | :type expr: str
78 | """
79 |
80 | if TYPE_CHECKING:
81 | # Third Party Library
82 | from jsonpath_rw import JSONPath
83 | _jsonpath = Property["JSONPath"]()
84 |
85 | def __init__(self, expr: str) -> None:
86 | super(JSONExtractor, self).__init__(expr)
87 | if _missing_jsonpath_rw:
88 | _missing_dependency("jsonpath-rw")
89 |
90 | # Third Party Library
91 | from jsonpath_rw.lexer import JsonPathLexerError
92 |
93 | try:
94 | self._jsonpath = jsonpath_rw.parse(self.expr)
95 | except (JsonPathLexerError, Exception) as exc:
96 | # jsonpath_rw.parser.JsonPathParser.p_error raises exc of Exception type
97 | raise ExprError(extractor=self, exc=exc) from exc
98 |
99 | def extract(self, element: Any) -> Any:
100 | """
101 | Extract data from JSON data.
102 |
103 | :param element: Python object parsed from JSON text.
104 | :type element: Any
105 |
106 | :returns: Data.
107 | :rtype: Any
108 | """
109 | return [m.value for m in self._jsonpath.find(element)]
110 |
111 |
112 | try:
113 | # Third Party Library
114 | import jsonpath_rw_ext
115 |
116 | _missing_jsonpath_rw_ext = False
117 | except ImportError:
118 | _missing_jsonpath_rw_ext = True
119 |
120 |
121 | class JSONPathRWExtExtractor(JSONPathRWExtractor):
122 | """
123 | Use JSONPath expression implementated by **jsonpath-rw-ext** package
124 | for JSON data extracting.
125 |
126 | Before extracting, should parse the JSON text into Python object.
127 |
128 | :param expr: JSONPath Expression.
129 | :type expr: str
130 | """
131 |
132 | if TYPE_CHECKING:
133 | # Third Party Library
134 | from jsonpath_rw_ext import JSONPath as JSONPathExt
135 | _jsonpath = Property["JSONPathExt"]()
136 |
137 | def __init__(self, expr: str) -> None:
138 | super(JSONExtractor, self).__init__(expr)
139 | if _missing_jsonpath_rw_ext:
140 | _missing_dependency("jsonpath-rw-ext")
141 |
142 | # Third Party Library
143 | from jsonpath_rw.lexer import JsonPathLexerError
144 |
145 | try:
146 | self._jsonpath = jsonpath_rw_ext.parse(self.expr)
147 | except (JsonPathLexerError, Exception) as exc:
148 | # jsonpath_rw.parser.JsonPathParser.p_error raises exc of Exception type
149 | raise ExprError(extractor=self, exc=exc) from exc
150 |
151 |
152 | try:
153 | # Third Party Library
154 | import jsonpath
155 |
156 | _missing_jsonpath = False
157 | except ImportError:
158 | _missing_jsonpath = True
159 |
160 |
161 | class JSONPathExtractor(JSONExtractor):
162 |     """
163 |     Use JSONPath expression implemented by **jsonpath-extractor** package
164 |     for JSON data extracting.
165 |
166 |     Before extracting, should parse the JSON text into Python object.
167 |
168 |     :param expr: JSONPath Expression.
169 |     :type expr: str
170 |     """
171 |
172 |     if TYPE_CHECKING:
173 |         # Third Party Library
174 |         from jsonpath import Expr
175 |
176 |     _jsonpath = Property["Expr"]()
177 |
178 |     def __init__(self, expr: str) -> None:
179 |         # Skip JSONExtractor in the MRO: backend dispatch already happened
180 |         # in JSONExtractor.__new__.
181 |         super(JSONExtractor, self).__init__(expr)
182 |
183 |         if _missing_jsonpath:
184 |             _missing_dependency("jsonpath-extractor")
185 |
186 |         try:
187 |             self._jsonpath = jsonpath.parse(self.expr)
188 |         except SyntaxError as exc:
189 |             # wrap the parse-time SyntaxError into the library's ExprError
190 |             raise ExprError(extractor=self, exc=exc) from exc
191 |
192 |     def extract(self, element: Any) -> Any:
193 |         """
194 |         Extract data from JSON data.
195 |
196 |         :param element: Python object parsed from JSON text.
197 |         :type element: Any
198 |
199 |         :returns: Data.
200 |         :rtype: Any
201 |         """
202 |         return self._jsonpath.find(element)
200 |
201 |
202 | # Pick the default backend by priority: jsonpath-extractor first, then
203 | # jsonpath-rw-ext, then jsonpath-rw; stays None when none is installed.
204 | json_extractor_backend: Optional[Type[JSONExtractor]] = None
205 | if not _missing_jsonpath:
206 |     json_extractor_backend = JSONPathExtractor
207 | elif not _missing_jsonpath_rw_ext:
208 |     json_extractor_backend = JSONPathRWExtExtractor
209 | elif not _missing_jsonpath_rw:
210 |     json_extractor_backend = JSONPathRWExtractor
211 |
212 |
213 | # Names exported via ``from data_extractor.json import *``.
214 | __all__ = (
215 |     "JSONExtractor",
216 |     "JSONPathExtractor",
217 |     "JSONPathRWExtExtractor",
218 |     "JSONPathRWExtractor",
219 |     "json_extractor_backend",
220 | )
218 |
--------------------------------------------------------------------------------
/data_extractor/lxml.py:
--------------------------------------------------------------------------------
1 | """
2 | :mod:`lxml` -- Extractors for XML or HTML data extracting.
3 | ==========================================================
4 | """
5 |
6 | # Standard Library
7 | from typing import List, Union
8 |
9 | # Local Folder
10 | from .core import AbstractSimpleExtractor
11 | from .exceptions import ExprError
12 | from .utils import Property, _missing_dependency
13 |
14 | try:
15 | # Third Party Library
16 | from lxml.etree import XPath, XPathSyntaxError
17 | from lxml.etree import _Element as Element
18 |
19 | _missing_lxml = False
20 | except ImportError:
21 | _missing_lxml = True
22 |
23 | Element = None # TODO: Find a way to get rid of this. See PEP 562
24 |
25 |
26 | class XPathExtractor(AbstractSimpleExtractor):
27 |     """
28 |     Use XPath for XML or HTML data extracting.
29 |
30 |     Before extracting, should parse the XML or HTML text \
31 |     into :class:`data_extractor.lxml.Element` object.
32 |
33 |     :param expr: XPath Expression.
34 |     :type expr: str
35 |     """
36 |
37 |     _find = Property["XPath"]()
38 |
39 |     def __init__(self, expr: str):
40 |         super().__init__(expr)
41 |
42 |         if _missing_lxml:
43 |             _missing_dependency("lxml")
44 |
45 |         try:
46 |             # compile the expression once; reused on every extract() call
47 |             self._find = XPath(self.expr)
48 |         except XPathSyntaxError as exc:
49 |             raise ExprError(extractor=self, exc=exc) from exc
50 |
51 |     def extract(self, element: Element) -> Union[List[Element], List[str]]:
52 |         """
53 |         Extract subelements or data from XML or HTML data.
54 |
55 |         :param element: Target.
56 |         :type element: :class:`data_extractor.lxml.Element`
57 |
58 |         :returns: List of :class:`data_extractor.lxml.Element` objects, \
59 |             List of str, or str.
60 |         :rtype: list
61 |
62 |         :raises data_extractor.exceptions.ExprError: XPath Expression Error.
63 |         """
64 |         # Third Party Library
65 |         # (kept as a local import because lxml is an optional dependency)
66 |         from lxml.etree import XPathEvalError
67 |
68 |         try:
69 |             rv = self._find(element)
70 |             # a non-list result is possible (see the isinstance check below);
71 |             # normalize so callers always receive a list
72 |             if not isinstance(rv, list):
73 |                 return [rv]
74 |             else:
75 |                 return rv
76 |         except XPathEvalError as exc:
77 |             raise ExprError(extractor=self, exc=exc) from exc
74 |
75 |
76 | try:
77 | # Third Party Library
78 | import cssselect
79 |
80 | del cssselect
81 | _missing_cssselect = False
82 | except ImportError:
83 | _missing_cssselect = True
84 |
85 |
86 | class CSSExtractor(AbstractSimpleExtractor):
87 | """
88 | Use CSS Selector for XML or HTML data subelements extracting.
89 |
90 | Before extracting, should parse the XML or HTML text \
91 | into :class:`data_extractor.lxml.Element` object.
92 |
93 | :param expr: CSS Selector Expression.
94 | :type expr: str
95 | """
96 |
97 | _extractor = Property[XPathExtractor]()
98 |
99 | def __init__(self, expr: str):
100 | super().__init__(expr)
101 |
102 | if _missing_cssselect:
103 | _missing_dependency("cssselect")
104 |
105 | # Third Party Library
106 | from cssselect import GenericTranslator
107 | from cssselect.parser import SelectorError
108 |
109 | try:
110 | xpath_expr = GenericTranslator().css_to_xpath(self.expr)
111 | except SelectorError as exc:
112 | raise ExprError(extractor=self, exc=exc) from exc
113 |
114 | self._extractor = XPathExtractor(xpath_expr)
115 |
116 | def extract(self, element: Element) -> List[Element]:
117 | """
118 | Extract subelements from XML or HTML data.
119 |
120 | :param element: Target.
121 | :type element: :class:`data_extractor.lxml.Element`
122 |
123 | :returns: List of :class:`data_extractor.lxml.Element` objects, \
124 | extracted result.
125 | :rtype: list
126 | """
127 | return self._extractor.extract(element)
128 |
129 |
130 | class TextCSSExtractor(CSSExtractor):
131 | """
132 | Use CSS Selector for XML or HTML data subelements' text extracting.
133 |
134 | Before extracting, should parse the XML or HTML text \
135 | into :class:`data_extractor.lxml.Element` object.
136 |
137 | :param expr: CSS Selector Expression.
138 | :type expr: str
139 | """
140 |
141 | def extract(self, element: Element) -> List[str]:
142 | """
143 | Extract subelements' text from XML or HTML data.
144 |
145 | :param element: Target.
146 | :type element: :class:`data_extractor.lxml.Element`
147 |
148 | :returns: List of str, extracted result.
149 | :rtype: list
150 |
151 | :raises ~data_extractor.exceptions.ExprError: CSS Selector Expression Error.
152 | """
153 | return [ele.text for ele in super().extract(element)]
154 |
155 |
156 | class AttrCSSExtractor(CSSExtractor):
157 | """
158 | Use CSS Selector for XML or HTML data subelements' attribute value extracting.
159 |
160 | Before extracting, should parse the XML or HTML text \
161 | into :class:`data_extractor.lxml.Element` object.
162 |
163 | :param expr: CSS Selector Expression.
164 | :type expr: str
165 | :param attr: Target attribute name.
166 | :type attr: str
167 | """
168 |
169 | attr = Property[str]()
170 |
171 | def __init__(self, expr: str, attr: str):
172 | super().__init__(expr)
173 | self.attr = attr
174 |
175 | def __repr__(self) -> str:
176 | return f"{self.__class__.__name__}(expr={self.expr!r}, attr={self.attr!r})"
177 |
178 | def extract(self, element: Element) -> List[str]:
179 | """
180 | Extract subelements' attribute value from XML or HTML data.
181 |
182 | :param element: Target.
183 | :type element: :class:`data_extractor.lxml.Element`
184 |
185 | :returns: List of str, extracted result.
186 | :rtype: list
187 |
188 | :raises ~data_extractor.exceptions.ExprError: CSS Selector Expression Error.
189 | """
190 | return [
191 | ele.get(self.attr)
192 | for ele in super().extract(element)
193 | if self.attr in ele.keys()
194 | ]
195 |
196 |
197 | # Names exported via ``from data_extractor.lxml import *``.
198 | __all__ = (
199 |     "AttrCSSExtractor",
200 |     "CSSExtractor",
201 |     "Element",
202 |     "TextCSSExtractor",
203 |     "XPathExtractor",
204 | )
204 |
--------------------------------------------------------------------------------
/data_extractor/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linw1995/data_extractor/ca1a4c4dacec7852590ad7bbf1bee421a3ab1e4a/data_extractor/py.typed
--------------------------------------------------------------------------------
/data_extractor/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | =================================
3 | :mod:`utils` -- Extracting Utils.
4 | =================================
5 | """
6 |
7 | # Standard Library
8 | import inspect
9 |
10 | from types import FrameType
11 | from typing import (
12 | TYPE_CHECKING,
13 | Any,
14 | Callable,
15 | Generic,
16 | Optional,
17 | Type,
18 | TypeVar,
19 | Union,
20 | overload,
21 | )
22 |
23 |
24 | class __Sentinel:
25 |     """Singleton marker type; see :data:`sentinel`."""
26 |
27 |     def __repr__(self) -> str:
28 |         return "sentinel"
29 |
30 |
31 | # Module-level singleton used as a "no value given" marker, so that None
32 | # stays usable as a real value.
33 | sentinel = __Sentinel()
32 |
33 |
34 | class LazyStr:
35 |     """
36 |     Lazy String.
37 |
38 |     Defers building the string until :meth:`__str__` is invoked.
39 |
40 |     :param func: Lazy __str__ function.
41 |     """
42 |
43 |     def __init__(self, func: Callable[[], str]):
44 |         # called on every __str__ invocation; the result is not cached
45 |         self.func = func
46 |
47 |     def __str__(self) -> str:
48 |         return self.func()
46 |
47 |
48 | def is_extractor(obj: Any) -> bool:
49 | """
50 | Determine the object if it is an extractor, return :obj:`True` if it is.
51 | """
52 | # Local Folder
53 | from .core import AbstractComplexExtractor, AbstractSimpleExtractor
54 |
55 | return isinstance(obj, (AbstractComplexExtractor, AbstractSimpleExtractor))
56 |
57 |
58 | def is_simple_extractor(obj: Any) -> bool:
59 |     """
60 |     Determine the object if it is a simple extractor, return :obj:`True` if it is.
61 |     """
62 |     # Local Folder
63 |     # NOTE(review): import kept local, presumably to avoid a cycle with .core — confirm
64 |     from .core import AbstractSimpleExtractor
65 |
66 |     return isinstance(obj, AbstractSimpleExtractor)
66 |
67 |
68 | def is_complex_extractor(obj: Any) -> bool:
69 |     """
70 |     Determine the object if it is a complex extractor, return :obj:`True` if it is.
71 |     """
72 |     # Local Folder
73 |     # NOTE(review): import kept local, presumably to avoid a cycle with .core — confirm
74 |     from .core import AbstractComplexExtractor
75 |
76 |     return isinstance(obj, AbstractComplexExtractor)
76 |
77 |
78 | def getframe(depth: int = 0) -> Optional[FrameType]:
79 | cur = frame = inspect.currentframe()
80 | if frame is None:
81 | # If running in an implementation without Python stack frame support,
82 | return None
83 |
84 | while depth > -1:
85 | if cur is None:
86 | raise ValueError(f"Invalid depth = {depth!r} for frame = {frame!r}")
87 |
88 | cur = cur.f_back
89 | depth -= 1
90 |
91 | return cur
92 |
93 |
94 | T = TypeVar("T")
95 |
96 | if TYPE_CHECKING:
97 | # Local Folder
98 | from .core import AbstractExtractors
99 |
100 |
101 | class Property(Generic[T]):
102 |     """
103 |     Extractor property.
104 |
105 |     A write-once data descriptor: the first assignment stores the value
106 |     under a private name on the instance; any later assignment raises
107 |     :class:`AttributeError` (see :meth:`__set__`).
108 |     """
109 |
110 |     def __set_name__(self, owner: Any, name: str) -> None:
111 |         """
112 |         Customized names -- Descriptor HowTo Guide
113 |         https://docs.python.org/3/howto/descriptor.html#customized-names
114 |         """
115 |         self.public_name = name
116 |         self.private_name = f"__property_{name}"
117 |
118 |     @overload
119 |     def __get__(self, obj: None, cls: Type["AbstractExtractors"]) -> "Property[T]":
120 |         pass
121 |
122 |     @overload
123 |     def __get__(self, obj: Any, cls: Type["AbstractExtractors"]) -> T:
124 |         pass
125 |
126 |     def __get__(
127 |         self, obj: Any, cls: Type["AbstractExtractors"]
128 |     ) -> Union["Property[T]", T]:
129 |         # accessed on the class itself: return the descriptor object
130 |         if obj is None:
131 |             return self
132 |
133 |         try:
134 |             return getattr(obj, self.private_name)
135 |         except AttributeError as exc:
136 |             # raise right AttributeError
137 |             # (report the public name instead of the private storage name)
138 |             msg: str = exc.args[0]
139 |             raise AttributeError(msg.replace(self.private_name, self.public_name))
140 |
141 |     def __set__(self, obj: Any, value: T) -> T:
142 |         # write-once: reject reassignment after the first set
143 |         if hasattr(obj, self.private_name):
144 |             raise AttributeError("can't set attribute")
145 |         else:
146 |             setattr(obj, self.private_name, value)
147 |             return value
148 |
149 |     @staticmethod
150 |     def change_internal_value(
151 |         obj: "AbstractExtractors", property_name: str, value: T
152 |     ) -> None:
153 |         """
154 |         Bypass the write-once restriction by writing the private storage
155 |         name directly.
156 |         """
157 |         attr = getattr(type(obj), property_name)
158 |         if not isinstance(attr, Property):
159 |             raise AttributeError(f"Type of attribute {property_name!r} is not Property")
160 |
161 |         setattr(obj, attr.private_name, value)
151 |
152 |
153 | def _missing_dependency(dependency: str) -> None:
154 |     """
155 |     Raise :class:`RuntimeError` for the extractor class
156 |     that missing optional dependency.
157 |
158 |     :param dependency: Name of the missing distribution package.
159 |     """
160 |     raise RuntimeError(f"{dependency!r} package is needed, run pip to install it. ")
158 |
159 |
160 | # Names exported via ``from data_extractor.utils import *``.
161 | __all__ = (
162 |     "LazyStr",
163 |     "Property",
164 |     "getframe",
165 |     "is_complex_extractor",
166 |     "is_extractor",
167 |     "is_simple_extractor",
168 |     "sentinel",
169 | )
169 |
--------------------------------------------------------------------------------
/default.nix:
--------------------------------------------------------------------------------
1 | {
2 | lib,
3 | dream2nix,
4 | ...
5 | }: {
6 | imports = [
7 | dream2nix.modules.dream2nix.WIP-python-pdm
8 | ];
9 |
10 | mkDerivation = {
11 | src = lib.cleanSourceWith {
12 | src = lib.cleanSource ./.;
13 | filter = name: type:
14 | !(builtins.any (x: x) [
15 | (lib.hasSuffix ".nix" name)
16 | (lib.hasPrefix "." (builtins.baseNameOf name))
17 | (lib.hasSuffix "flake.lock" name)
18 | ]);
19 | };
20 | };
21 |
22 | pdm.lockfile = ./pdm.lock;
23 | pdm.pyproject = ./pyproject.toml;
24 |
25 | buildPythonPackage = {
26 | pythonImportsCheck = [
27 | "data_extractor"
28 | ];
29 | };
30 |
31 | pdm.editables = lib.mkForce {};
32 | }
33 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = source
8 | BUILDDIR = build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/_static/custom.css:
--------------------------------------------------------------------------------
1 | div.sphinxsidebar {
2 | width: 250px;
3 | }
4 |
5 | dl.class{
6 | margin-bottom: 1rem;
7 | }
8 |
9 | dl.method, dl.field-list {
10 | margin-bottom: 0.5rem;
11 | }
12 |
13 | pre {
14 | padding: 7px 15px;
15 | }
16 |
--------------------------------------------------------------------------------
/docs/source/api_core.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.core
2 |
3 | .. autoclass:: data_extractor.core.SimpleExtractorMeta
4 |
5 | .. autoclass:: data_extractor.core.ComplexExtractorMeta
6 |
7 | .. autoclass:: data_extractor.core.AbstractSimpleExtractor
8 | :members:
9 |
10 | .. autoclass:: data_extractor.core.AbstractComplexExtractor
11 | :members:
12 |
--------------------------------------------------------------------------------
/docs/source/api_exceptions.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.exceptions
2 |
3 | .. autoexception:: data_extractor.exceptions.ExprError
4 |
5 | .. autoexception:: data_extractor.exceptions.ExtractError
6 |
--------------------------------------------------------------------------------
/docs/source/api_item.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.item
2 |
3 | .. autoclass:: data_extractor.item.Field
4 | :show-inheritance:
5 | :inherited-members:
6 | :members:
7 |
8 | .. autoclass:: data_extractor.item.Item
9 | :show-inheritance:
10 | :inherited-members:
11 | :members: extract, field_names, simplify
12 |
--------------------------------------------------------------------------------
/docs/source/api_json.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.json
2 | :members:
3 | :inherited-members:
4 | :show-inheritance:
5 |
--------------------------------------------------------------------------------
/docs/source/api_lxml.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.lxml
2 | :members:
3 | :inherited-members:
4 | :show-inheritance:
5 |
--------------------------------------------------------------------------------
/docs/source/api_reference.rst:
--------------------------------------------------------------------------------
1 | =============
2 | API Reference
3 | =============
4 |
5 | .. automodule:: data_extractor
6 |
7 | .. toctree::
8 | :name: API Reference
9 | :maxdepth: 2
10 |
11 | api_core
12 | api_exceptions
13 | api_utils
14 | api_lxml
15 | api_json
16 | api_item
17 |
--------------------------------------------------------------------------------
/docs/source/api_utils.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.utils
2 |
3 | .. autoclass:: data_extractor.utils.LazyStr
4 |
5 | .. autodata:: data_extractor.utils.sentinel
6 |
7 | .. autofunction:: data_extractor.utils.is_extractor
8 |
9 | .. autofunction:: data_extractor.utils.is_simple_extractor
10 |
11 | .. autofunction:: data_extractor.utils.is_complex_extractor
12 |
13 | .. autoclass:: data_extractor.utils.Property
14 |
--------------------------------------------------------------------------------
/docs/source/changelog.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 |
5 | v1.0.1
6 | ~~~~~~
7 |
8 | **Build**
9 |
10 | - Supports Python 3.13
11 |
12 |
13 | .. include:: history.rst
14 | :start-line: 4
15 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # http://www.sphinx-doc.org/en/master/config
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | # Standard Library
21 | from datetime import date
22 |
23 | project = "Data-Extractor"
24 | year = date.today().year
25 | copyright = f"{year}, 林玮"
26 | author = "林玮"
27 |
28 |
29 | # -- General configuration ---------------------------------------------------
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.githubpages"]
35 | autodoc_inherit_docstrings = True
36 |
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ["_templates"]
39 |
40 | # List of patterns, relative to source directory, that match files and
41 | # directories to ignore when looking for source files.
42 | # This pattern also affects html_static_path and html_extra_path.
43 | exclude_patterns = []
44 |
45 |
46 | # -- Options for HTML output -------------------------------------------------
47 |
48 | # The theme to use for HTML and HTML Help pages. See the documentation for
49 | # a list of builtin themes.
50 | #
51 | html_theme = "alabaster"
52 | html_theme_options = {
53 | "description": "Combine XPath, CSS Selectors and JSONPath for Web data extracting.",
54 | "github_button": True,
55 | "github_type": "star",
56 | "travis_button": True,
57 | "codecov_button": True,
58 | "github_user": "linw1995",
59 | "github_repo": "data_extractor",
60 | "fixed_sidebar": False,
61 | "page_width": "1024px",
62 | "sidebar_width": "230px",
63 | }
64 | # Add any paths that contain custom static files (such as style sheets) here,
65 | # relative to this directory. They are copied after the builtin static files,
66 | # so a file named "default.css" will overwrite the builtin "default.css".
67 | html_static_path = ["_static"]
68 | html_sidebars = {
69 | "**": ["about.html", "navigation.html", "relations.html", "searchbox.html"]
70 | }
71 |
--------------------------------------------------------------------------------
/docs/source/contributing.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Contributing
3 | ============
4 |
5 |
6 | Environment Setup
7 | ~~~~~~~~~~~~~~~~~
8 |
9 | Clone the source codes from Github.
10 |
11 | .. code-block:: shell
12 |
13 | git clone https://github.com/linw1995/data_extractor.git
14 | cd data_extractor
15 |
16 | Setup the development environment.
17 | Please make sure you install the pdm_,
18 | pre-commit_ and nox_ CLIs in your environment.
19 |
20 | .. code-block:: shell
21 |
22 | make init
23 | make PYTHON=3.7 init # for specific python version
24 |
25 | Linting
26 | ~~~~~~~
27 |
28 | Use pre-commit_ for installing linters to ensure a good code style.
29 |
30 | .. code-block:: shell
31 |
32 | make pre-commit
33 |
34 | Run linters. Some linters run via CLI nox_, so make sure you install it.
35 |
36 | .. code-block:: shell
37 |
38 | make check-all
39 |
40 | Testing
41 | ~~~~~~~
42 |
43 | Run quick tests.
44 |
45 | .. code-block:: shell
46 |
47 | make
48 |
49 | Run quick tests with verbose.
50 |
51 | .. code-block:: shell
52 |
53 | make vtest
54 |
55 | Run tests with coverage.
56 | Testing in multiple Python environments is powered by CLI nox_.
57 |
58 | .. code-block:: shell
59 |
60 | make cov
61 |
62 | .. _pdm: https://github.com/pdm-project/pdm
63 | .. _pre-commit: https://pre-commit.com/
64 | .. _nox: https://nox.thea.codes/en/stable/
65 |
--------------------------------------------------------------------------------
/docs/source/history.rst:
--------------------------------------------------------------------------------
1 | =======
2 | History
3 | =======
4 |
5 | v1.0.0
6 | ~~~~~~
7 |
8 | **Feature**
9 |
10 | - Generic extractor with convertor (#83)
11 | - mypy plugin for type annotation of extracting result (#83)
12 |
13 | v0.10.2
14 | ~~~~~~~
15 |
16 | **Build**
17 |
18 | - upgrade jsonpath-extractor to v0.8.0
19 |
20 |
21 | v0.10.1
22 | ~~~~~~~
23 |
24 | **Fix**
25 |
26 | - typo in .utils.Property
27 |
28 | v0.10.0
29 | ~~~~~~~
30 |
31 | **Feature**
32 |
33 | - supports PEP 561 -- Distributing and Packaging Type Information
34 |
35 | **Fix**
36 |
37 | - remove LICENSE file from dist files
38 | - duplicated extracting if class attrs overlap happened #67
39 | - remove super class sub-extractors error #68
40 |
41 | **Refactor**
42 |
43 | - remove duplicated module "data_extractor.abc"
44 | - remove the lazy build mechanism of extractors
45 | - JSON backend invoking mechanism
46 | - make all properties of extractors immutable
47 |
48 | **Document**
49 |
50 | - fix wrong docstring of "data_extractor.utils.Property"
51 |
52 | v0.9.0
53 | ~~~~~~
54 |
55 | **Fix**
56 |
57 | - type annotations #63 #64
58 |
59 | **Refactor**
60 |
61 | - .utils.Property with "Customized names" support #64
62 | - rename .abc to .core and mark elder duplicated #65
63 |
64 | v0.8.0
65 | ~~~~~~
66 |
67 | - 11bfd2c supports latest jsonpath-extractor package
68 |
69 | v0.7.0
70 | ~~~~~~
71 |
72 | - 65d1fce Fix:Create JSONExtractor with wrong subtype
73 | - 407cd78 New:Make lxml and cssselect optional (#61)
74 |
75 | v0.6.1
76 | ~~~~~~
77 |
78 | - d28fff4 Fix:Item created error by ``type`` function. (Issue #56)
79 |
80 | v0.6.0
81 | ~~~~~~
82 |
83 | - f1d21fe New:Make different implementations of JSONExtractor optional
84 | - 0175cde New:Add jsonpath-extractor as opitional json extractor backend
85 | - 3b6da8b Chg:Upgrade dependencies
86 |
87 | v0.6.0-alpha.3
88 | ~~~~~~~~~~~~~~
89 |
90 | - 1982302 Fix:Type annotation error
91 |
92 | v0.6.0.dev2
93 | ~~~~~~~~~~~
94 |
95 | - b7edbae Dev,New:Use nox test in multi-py-versions, Update workflow
96 | - a043838 Fix:Can't import JSONPathExtractor from root module
97 | - a23ece9 Test,Fix:Missing JSONPathExtractor in simple extractor tests
98 | - 5903ff9 Dev,Fix:Nox changes symlink '.venv' of virtualenv of development
99 | - 57d03ad Dev,Fix:Install unneeded development dependencies
100 |
101 | v0.6.0.dev1
102 | ~~~~~~~~~~~
103 |
104 | - 2459f7d Dev,New:Add Github Actions for CI
105 | - a151a91 Dev,New:Add scripts/export_requirements_txt.sh
106 | - f7cdaa3 Dev,Chg:Remove travis-ci
107 | - f1d21fe New:Make different implementations of JSONExtractor optional
108 | - 9f74619 Fix:Use __getattr__ on the module in the wrong way
109 | - 25a8bf8 Dev,Fix:Cannot use pytest.mark.usefixtures() in pytest.param
110 | - 8f51603 Dev,Chg:Upgrade poetry version in Makefile
111 | - 21aa08e Dev,Chg:Test in two ways
112 | - 4cb4678 Chg:Upgrade dependencies
113 | - 4177b98 Dev,Fix:remove the venv before pretest installation
114 | - 0175cde New:Add jsonpath-extractor as opitional json extractor backend
115 |
116 | v0.5.4
117 | ~~~~~~
118 |
119 | - 9552c79 Fix:Simplified item's extract_first method fail to raise ExtractError
120 | - 08167ab Fix:Simplified item's extract_first method
121 | should support param default
122 | - 6e4c269 New:More unittest for testing the simplified items
123 | - a35b85a Chg:Update poetry.lock
124 | - e5ff37b Docs,Chg:Update travis-ci status source in the README.rst
125 |
126 | v0.5.3
127 | ~~~~~~
128 |
129 | - 6a26be5 Chg:Wrap the single return value as a list
130 | - 0b63927 Fix:Item can not extract the data is list type
131 | - 9deeb5f Chg:Update poetry.lock
132 |
133 | v0.5.2
134 | ~~~~~~
135 |
136 | - 0561672 Fix:Wrong parameter name
137 |
138 | v0.5.1
139 | ~~~~~~
140 |
141 | - c9b07f4 Fix:Wrong shield placing
142 | - b198712 Dev,Fix:Build travis-ci config validation
143 |
144 | v0.5.0
145 | ~~~~~~
146 |
147 | - 0056f37 Split AbstractExtractor into AbstractSimpleExtractor and
148 | AbstractComplexExtractor
149 | - c42aeb5 Feature/more friendly development setup (#34)
150 | - 2f9a71c New:Support testing in 3.8
151 | - c8bd593 New:Stash unstaged code before testing
152 | - d2a18a8 New:Best way to raise new exc
153 | - 90fa9c8 New:ExprError ``__str__`` implementation
154 | - d961768 Fix:Update mypy pre-commit config
155 | - e5d59c3 New:Raise SyntaxError when field overwrites method (#38)
156 | - 7720fb9 Feature/avoid field overwriting (#39)
157 | - b722717 Dev,Fix:Black configure not working
158 | - f8f0df8 New:Implement extractors' build method
159 | - 98ada74 Chg:Update docs
160 |
161 | v0.4.1
162 | ~~~~~~
163 |
164 | - d180992 Add pre-commit support and fix pre-commit check error (#32)
165 | - bd680c1 Update pyproject.toml
166 | - 64f30f7 remove unhappened condtional
167 |
168 | v0.4.0
169 | ~~~~~~
170 |
171 | - 74f569b Update docs and lint docs (#31)
172 | - 4188634 Support RTD (#30)
173 | - a5b776f Separate dependencies (#29)
174 | - 69079b4 Generate simple extractor from a complex extractor (#28)
175 | - 58a7570 Support JSONPath ext syntax (#26)
176 | - bb7c602 Replace Pipenv with Poetry (#24)
177 |
178 | v0.3.2
179 | ~~~~~~
180 |
181 | - cd65ad0 Make Parameter extractor Optional
182 |
183 | v0.2.2
184 | ~~~~~~
185 |
186 | - fca801a Merge pull request #22 from linw1995/hotfix
187 |
188 | + 8bf2a62 Fix name overwritten syntax checking
189 | that includes the ``__init__`` first parameter.
190 |
191 | + 10e2ca0 Fix raise wrong execption from python repl,
192 | oneline code or type() creation.
193 |
194 | v0.2.1
195 | ~~~~~~
196 |
197 | - a05b75f Export all from the root module.
198 | - d2900d3 Add Optional Parameter name for special field name. (#19)
199 | - 99a4a7f Raise SyntaxError
200 | when the field name is the same as Item's parameter… (#18)
201 |
202 | v0.2.0
203 | ~~~~~~
204 |
205 | - 9c2e2cd Rename ExtractFirstMixin into SimpleExtractorBase (#12)
206 | - bac925d Raise ValueError
207 | when misplaced the complex extractor in complex extractor. (#13)
208 |
209 | - 88b9227 Wrap expr exception (#14)
210 | - aeb9520 Deploy Docs on GitHub Pages. (#15)
211 |
212 | + Update docstring.
213 | + Deploy Docs on Github Pages.
214 | + Add Quickstarts.rst
215 |
216 | - Bump into beta
217 |
218 | v0.1.5
219 | ~~~~~~
220 |
221 | - cabfac3 Add utils.py
222 | - 9e1c005 Make all extractor class inherit the same ABC.
223 | - 7828a1a Make easy to trace exception thrown
224 | by complex extractor extracting data.
225 |
226 | v0.1.4
227 | ~~~~~~
228 |
229 | - f4267fe Modify docstr
230 | - 6f2f8d1 Add more docstr
231 |
232 | v0.1.3
233 | ~~~~~~
234 |
235 | - 5f4b0e0 Update README.md
236 | - 1b8bfb9 Add UserWarning when extractor can't extract first item from result
237 | - dd2cd25 Remove the useless _extract call
238 | - 655ec9d Add UserWarning when expr is conflict with parameter is_many=True
239 | - bcade2c No alow user to set is_many=True and default!=sentinel at same time
240 | - 761bd30 Add more unit tests
241 |
242 | v0.1.2
243 | ~~~~~~
244 |
245 | - Add exceptions.py and ExprError
246 | - Change travis-ci deploy stage condition
247 | - Add travis-ci deploy github release
248 |
249 | v0.1.1
250 | ~~~~~~
251 |
252 | - Rename ``.html`` to ``.lxml``;
253 | Remove ``fromstring``, ``tostring`` function from ``.lxml``
254 |
255 | + Rename .html to .lxml
256 | + use ``lxml.html.fromstring`` and ``lxml.html.tostring`` to process HTML
257 | + use ``lxml.etree.fromstring`` and ``lxml.etree.tostring`` to process XML
258 |
259 | - Add check_isort, check_black, check,
260 | check_all, fc: format_code into Makefile for development.
261 |
262 | v0.1.0
263 | ~~~~~~
264 |
265 | - initialize project
266 | - add Extractor to extract data from the text which format is HTML or JSON.
267 | - add complex extractor: Field, Item
268 |
--------------------------------------------------------------------------------
/docs/source/howto/index.rst:
--------------------------------------------------------------------------------
1 | =====================
2 | Data-Extractor HOWTOs
3 | =====================
4 |
5 | Learning how to use data-extractor.
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | json
11 | lxml
12 | item
13 |
--------------------------------------------------------------------------------
/docs/source/howto/item.rst:
--------------------------------------------------------------------------------
1 | ==================
2 | Complex Extracting
3 | ==================
4 |
5 | .. include:: lxml.rst
6 | :start-line: 7
7 | :end-before: Using
8 |
9 | Defining :class:`ChannelItem` class, then extracting the data.
10 |
11 | .. code-block:: python3
12 |
13 | from data_extractor import Field, Item, XPathExtractor
14 |
15 |
16 | class ChannelItem(Item):
17 | title = Field(XPathExtractor("./title/text()"), default="")
18 | link = Field(XPathExtractor("./link/text()"), default="")
19 | description = Field(XPathExtractor("./description/text()"))
20 | publish_date = Field(XPathExtractor("./pubDate/text()"))
21 | guid = Field(XPathExtractor("./guid/text()"))
22 |
23 | Extracting all channel items from file.
24 |
25 | .. code-block:: python3
26 |
27 | from data_extractor import XPathExtractor
28 |
29 | extractor = ChannelItem(XPathExtractor("//channel/item"), is_many=True)
30 | assert extractor.extract(root)[:2] == [
31 | {
32 | "title": "Star City",
33 | "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
34 | "description": 'How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia\'s Star City.',
35 | "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT",
36 | "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
37 | },
38 | {
39 | "title": "",
40 | "link": "",
41 | "description": 'Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a partial eclipse of the Sun on Saturday, May 31st.',
42 | "publish_date": "Fri, 30 May 2003 11:06:42 GMT",
43 | "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
44 | },
45 | ]
46 |
47 | Nested Extractors
48 | ~~~~~~~~~~~~~~~~~
49 |
50 | Defining :class:`Channel` class with :class:`ChannelItem`.
51 |
52 | .. code-block:: python3
53 |
54 | class Channel(Item):
55 | title = Field(XPathExtractor("./title/text()"))
56 | link = Field(XPathExtractor("./link/text()"))
57 | description = Field(XPathExtractor("./description/text()"))
58 | language = Field(XPathExtractor("./language/text()"))
59 | publish_date = Field(XPathExtractor("./pubDate/text()"))
60 | last_build_date = Field(XPathExtractor("./lastBuildDate/text()"))
61 | docs = Field(XPathExtractor("./docs/text()"))
62 | generator = Field(XPathExtractor("./generator/text()"))
63 | managing_editor = Field(XPathExtractor("./managingEditor/text()"))
64 | web_master = Field(XPathExtractor("./webMaster/text()"))
65 |
66 | items = ChannelItem(XPathExtractor("./item[position()<3]"), is_many=True)
67 |
68 | Extracting the rss channel data from file.
69 |
70 | .. code-block:: python3
71 |
72 | from data_extractor import XPathExtractor
73 |
74 | extractor = Channel(XPathExtractor("//channel"))
75 | assert extractor.extract(root) == {
76 | "title": "Liftoff News",
77 | "link": "http://liftoff.msfc.nasa.gov/",
78 | "description": "Liftoff to Space Exploration.",
79 | "language": "en-us",
80 | "publish_date": "Tue, 10 Jun 2003 04:00:00 GMT",
81 | "last_build_date": "Tue, 10 Jun 2003 09:41:01 GMT",
82 | "docs": "http://blogs.law.harvard.edu/tech/rss",
83 | "generator": "Weblog Editor 2.0",
84 | "managing_editor": "editor@example.com",
85 | "web_master": "webmaster@example.com",
86 | "items": [
87 | {
88 | "title": "Star City",
89 | "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
90 | "description": 'How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia\'s Star City.',
91 | "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT",
92 | "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
93 | },
94 | {
95 | "title": "",
96 | "link": "",
97 | "description": 'Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a partial eclipse of the Sun on Saturday, May 31st.',
98 | "publish_date": "Fri, 30 May 2003 11:06:42 GMT",
99 | "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
100 | },
101 | ],
102 | }
103 |
104 | Simplifying Complex Extractor
105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
106 |
107 | A complex extractor can be simplified
108 | into a simple extractor
109 | by using :meth:`data_extractor.item.Item.simplify`.
110 |
111 | .. code-block:: python3
112 |
113 | from data_extractor import XPathExtractor
114 |
    complex_extractor = ChannelItem(XPathExtractor("//channel/item"))
116 | simple_extractor = complex_extractor.simplify()
117 |
118 | complex_extractor.is_many = False
119 | assert simple_extractor.extract_first(root) == complex_extractor.extract(root)
120 |
121 | complex_extractor.is_many = True
122 | assert simple_extractor.extract(root) == complex_extractor.extract(root)
123 |
Set Parameter Extractor Be None To Extract Data From Root
125 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
126 |
127 | .. code-block:: python3
128 |
129 | from data_extractor import Item, Field, JSONExtractor
130 |
131 |
132 | class User(Item):
133 | nickname = Field(JSONExtractor("name"))
134 | age = Field(JSONExtractor("age"))
135 | raw = Field()
136 |
137 |
138 | assert User().extract({"name": "john", "age": 17, "gender": "male"}) == {
139 | "nickname": "john",
140 | "age": 17,
141 | "raw": {"name": "john", "age": 17, "gender": "male"},
142 | }
143 |
144 | Avoid Field Overwrites Property Or Method
145 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
146 |
147 | To avoid complex extractor's field overwrites its property or method,
148 | use the parameter **name** of the complex extractor.
149 |
150 | .. code-block:: python3
151 |
152 | from data_extractor import Field, Item, JSONExtractor
153 |
154 |
155 | class User(Item):
156 | name_ = Field(JSONExtractor("name"), name="name")
157 |
158 |
159 | assert User().extract({"name": "john", "age": 17}) == {"name": "john"}
160 |
--------------------------------------------------------------------------------
/docs/source/howto/json.rst:
--------------------------------------------------------------------------------
1 | =================
2 | Extract JSON Data
3 | =================
4 |
5 | The function to extract data from the JSON file
6 | powered by python-jsonpath-rw_ and python-jsonpath-rw-ext_
7 | to support JSONPath_.
8 | Or use a new syntax of JSONPATH for extracting
9 | by installing optional dependency jsonpath-extractor_.
10 |
11 | Run below command to install optional dependency.
12 |
13 | .. code-block:: shell
14 |
15 | pip install "data_extractor[jsonpath-rw]"
16 | pip install "data_extractor[jsonpath-rw-ext]"
17 |
18 | pip install "data_extractor[jsonpath-extractor]"
19 |
20 | Use the :class:`data_extractor.json.JSONExtractor` to extract data.
21 |
22 | .. code-block:: python3
23 |
24 | import json
25 | from data_extractor import JSONExtractor
26 |
27 | text = '{"foo": [{"baz": 1}, {"baz": 2}]}'
28 | data = json.loads(text)
29 | assert JSONExtractor("foo[*].baz").extract(data) == [1, 2]
30 |
31 | .. _python-jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw
32 | .. _python-jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/
33 | .. _JSONPath: https://goessner.net/articles/JsonPath/
34 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath
35 |
36 | By changing :data:`json_extractor_backend`
37 | to use a specific backend of JSON extractor.
38 | See APIs ref of :class:`data_extractor.json.JSONExtractor`
39 | for additional details.
40 |
--------------------------------------------------------------------------------
/docs/source/howto/lxml.rst:
--------------------------------------------------------------------------------
1 | ========================
2 | Extract HTML or XML Data
3 | ========================
4 |
5 | The function to extract data from the html or xml file
6 | powered by lxml_ to support XPath_, by cssselect_ to support CSS-Selectors_.
7 |
8 | Run below command to install optional dependency.
9 |
10 | .. code-block:: shell
11 |
12 | pip install "data_extractor[lxml]" # For using XPath
13 | pip install "data_extractor[cssselect]" # For using CSS-Selectors
14 |
Download the RSS sample file for demonstration.
16 |
17 | .. code-block:: shell
18 |
19 | wget http://www.rssboard.org/files/sample-rss-2.xml
20 |
21 | Parse it into :class:`data_extractor.lxml.Element`.
22 |
23 | .. code-block:: python3
24 |
25 | from pathlib import Path
26 |
27 | from lxml.etree import fromstring
28 |
29 | root = fromstring(Path("sample-rss-2.xml").read_text())
30 |
31 | Using :class:`data_extractor.lxml.XPathExtractor` to extract rss channel title.
32 |
33 | .. code-block:: python3
34 |
35 | from data_extractor import XPathExtractor
36 |
37 | assert XPathExtractor("//channel/title/text()").extract_first(root) == "Liftoff News"
38 |
39 | Using :class:`data_extractor.lxml.TextCSSExtractor`
40 | to extract all rss item links.
41 |
42 | .. code-block:: python3
43 |
44 | from data_extractor import TextCSSExtractor
45 |
46 | assert TextCSSExtractor("item>link").extract(root) == [
47 | "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
48 | "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
49 | "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
50 | ]
51 |
52 | Using :class:`data_extractor.lxml.AttrCSSExtractor` to extract rss version.
53 |
54 | .. code-block:: python3
55 |
56 | from data_extractor import AttrCSSExtractor
57 |
58 | assert AttrCSSExtractor("rss", attr="version").extract_first(root) == "2.0"
59 |
60 | .. _lxml: https://lxml.de
61 | .. _XPath: https://www.w3.org/TR/xpath-10/
62 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/
63 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/
64 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | ==========================================
2 | Welcome to Data-Extractor's documentation!
3 | ==========================================
4 |
5 | .. include:: readme.rst
6 | :start-line: 4
7 |
8 | Contents
9 | ========
10 |
11 | .. toctree::
12 | :maxdepth: 4
13 |
14 | quickstarts
15 | howto/index
16 | contributing
17 | api_reference
18 | changelog
19 |
20 |
21 | Indices and tables
22 | ==================
23 |
24 | * :ref:`genindex`
25 | * :ref:`search`
26 |
--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Installation
3 | ============
4 |
5 | Install the stable version from PYPI.
6 |
7 | .. code-block:: shell
8 |
9 | pip install "data-extractor[jsonpath-extractor]" # for extracting JSON data
10 | pip install "data-extractor[lxml]" # for extracting HTML data
11 |
12 | Or install the latest version from Github.
13 |
14 | .. code-block:: shell
15 |
16 | pip install "data-extractor[jsonpath-extractor] @ git+https://github.com/linw1995/data_extractor.git@master"
17 |
18 | Extract JSON data
19 | ~~~~~~~~~~~~~~~~~
20 |
21 | Currently supports to extract JSON data with below optional dependencies
22 |
23 | - jsonpath-extractor_
24 | - jsonpath-rw_
25 | - jsonpath-rw-ext_
26 |
27 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath
28 | .. _jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw
29 | .. _jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/
30 |
31 | install one dependency of them to extract JSON data.
32 |
33 | Extract HTML(XML) data
34 | ~~~~~~~~~~~~~~~~~~~~~~
35 |
36 | Currently supports to extract HTML(XML) data with below optional dependencies
37 |
38 | - lxml_ for using XPath_
39 | - cssselect_ for using CSS-Selectors_
40 |
41 | .. _lxml: https://lxml.de/
42 | .. _XPath: https://www.w3.org/TR/xpath-10/
43 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/
44 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/
45 |
--------------------------------------------------------------------------------
/docs/source/quickstarts.rst:
--------------------------------------------------------------------------------
1 | ===========
2 | Quickstarts
3 | ===========
4 |
5 | Installation
6 | ~~~~~~~~~~~~
7 |
8 | .. include:: installation.rst
9 | :start-line: 4
10 |
11 | Usage
12 | ~~~~~
13 |
14 | .. code-block:: python3
15 |
16 | from data_extractor import Field, Item, JSONExtractor
17 |
18 |
19 | class Count(Item):
20 | followings = Field(JSONExtractor("countFollowings"))
21 | fans = Field(JSONExtractor("countFans"))
22 |
23 |
24 | class User(Item):
25 | name_ = Field(JSONExtractor("name"), name="name")
26 | age = Field(JSONExtractor("age"), default=17)
27 | count = Count()
28 |
29 |
30 | assert User(JSONExtractor("data.users[*]"), is_many=True).extract(
31 | {
32 | "data": {
33 | "users": [
34 | {
35 | "name": "john",
36 | "age": 19,
37 | "countFollowings": 14,
38 | "countFans": 212,
39 | },
40 | {
41 | "name": "jack",
42 | "description": "",
43 | "countFollowings": 54,
44 | "countFans": 312,
45 | },
46 | ]
47 | }
48 | }
49 | ) == [
50 | {"name": "john", "age": 19, "count": {"followings": 14, "fans": 212}},
51 | {"name": "jack", "age": 17, "count": {"followings": 54, "fans": 312}},
52 | ]
53 |
--------------------------------------------------------------------------------
/docs/source/readme.rst:
--------------------------------------------------------------------------------
1 | ==============
2 | Data Extractor
3 | ==============
4 |
5 | |license| |Pypi Status| |Python version| |Package version| |PyPI - Downloads|
6 | |GitHub last commit| |Code style: black| |Build Status| |codecov|
7 | |Documentation Status| |PDM managed|
8 |
9 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting.
10 |
11 | Quickstarts
12 | <<<<<<<<<<<
13 |
14 | .. include:: quickstarts.rst
15 | :start-line: 4
16 |
17 | Changelog
18 | <<<<<<<<<
19 |
20 | .. include:: changelog.rst
21 | :start-line: 4
22 | :end-before: .. include:: history.rst
23 |
24 | Contributing
25 | <<<<<<<<<<<<
26 |
27 | .. include:: contributing.rst
28 | :start-line: 4
29 |
30 | .. |license| image:: https://img.shields.io/github/license/linw1995/data_extractor.svg
31 | :target: https://github.com/linw1995/data_extractor/blob/master/LICENSE
32 |
33 | .. |Pypi Status| image:: https://img.shields.io/pypi/status/data_extractor.svg
34 | :target: https://pypi.org/project/data_extractor
35 |
36 | .. |Python version| image:: https://img.shields.io/pypi/pyversions/data_extractor.svg
37 | :target: https://pypi.org/project/data_extractor
38 |
39 | .. |Package version| image:: https://img.shields.io/pypi/v/data_extractor.svg
40 | :target: https://pypi.org/project/data_extractor
41 |
42 | .. |PyPI - Downloads| image:: https://img.shields.io/pypi/dm/data-extractor.svg
43 | :target: https://pypi.org/project/data_extractor
44 |
45 | .. |GitHub last commit| image:: https://img.shields.io/github/last-commit/linw1995/data_extractor.svg
46 | :target: https://github.com/linw1995/data_extractor
47 |
48 | .. |Code style: black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
49 | :target: https://github.com/ambv/black
50 |
51 | .. |Build Status| image:: https://github.com/linw1995/data_extractor/workflows/Lint&Test/badge.svg
52 | :target: https://github.com/linw1995/data_extractor/actions?query=workflow%3ALint%26Test
53 |
54 | .. |codecov| image:: https://codecov.io/gh/linw1995/data_extractor/branch/master/graph/badge.svg
55 | :target: https://codecov.io/gh/linw1995/data_extractor
56 |
57 | .. |Documentation Status| image:: https://readthedocs.org/projects/data-extractor/badge/?version=latest
58 | :target: https://data-extractor.readthedocs.io/en/latest/?badge=latest
59 |
60 | .. |PDM managed| image:: https://img.shields.io/badge/pdm-managed-blueviolet
61 | :target: https://pdm.fming.dev
62 |
--------------------------------------------------------------------------------
/flake.lock:
--------------------------------------------------------------------------------
1 | {
2 | "nodes": {
3 | "dream2nix": {
4 | "inputs": {
5 | "nixpkgs": "nixpkgs",
6 | "purescript-overlay": "purescript-overlay",
7 | "pyproject-nix": "pyproject-nix"
8 | },
9 | "locked": {
10 | "lastModified": 1728585693,
11 | "narHash": "sha256-rhx5SYpIkPu7d+rjF9FGGBVxS0BwAEkmYIsJg2a3E20=",
12 | "owner": "nix-community",
13 | "repo": "dream2nix",
14 | "rev": "c6935471f7e1a9e190aaa9ac9823dca34e00d92a",
15 | "type": "github"
16 | },
17 | "original": {
18 | "owner": "nix-community",
19 | "repo": "dream2nix",
20 | "type": "github"
21 | }
22 | },
23 | "flake-compat": {
24 | "flake": false,
25 | "locked": {
26 | "lastModified": 1696426674,
27 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
28 | "owner": "edolstra",
29 | "repo": "flake-compat",
30 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
31 | "type": "github"
32 | },
33 | "original": {
34 | "owner": "edolstra",
35 | "repo": "flake-compat",
36 | "type": "github"
37 | }
38 | },
39 | "nixpkgs": {
40 | "locked": {
41 | "lastModified": 1728538411,
42 | "narHash": "sha256-f0SBJz1eZ2yOuKUr5CA9BHULGXVSn6miBuUWdTyhUhU=",
43 | "owner": "NixOS",
44 | "repo": "nixpkgs",
45 | "rev": "b69de56fac8c2b6f8fd27f2eca01dcda8e0a4221",
46 | "type": "github"
47 | },
48 | "original": {
49 | "owner": "NixOS",
50 | "ref": "nixpkgs-unstable",
51 | "repo": "nixpkgs",
52 | "type": "github"
53 | }
54 | },
55 | "purescript-overlay": {
56 | "inputs": {
57 | "flake-compat": "flake-compat",
58 | "nixpkgs": [
59 | "dream2nix",
60 | "nixpkgs"
61 | ],
62 | "slimlock": "slimlock"
63 | },
64 | "locked": {
65 | "lastModified": 1724504251,
66 | "narHash": "sha256-TIw+sac0NX0FeAneud+sQZT+ql1G/WEb7/Vb436rUXM=",
67 | "owner": "thomashoneyman",
68 | "repo": "purescript-overlay",
69 | "rev": "988b09676c2a0e6a46dfa3589aa6763c90476b8a",
70 | "type": "github"
71 | },
72 | "original": {
73 | "owner": "thomashoneyman",
74 | "repo": "purescript-overlay",
75 | "type": "github"
76 | }
77 | },
78 | "pyproject-nix": {
79 | "flake": false,
80 | "locked": {
81 | "lastModified": 1702448246,
82 | "narHash": "sha256-hFg5s/hoJFv7tDpiGvEvXP0UfFvFEDgTdyHIjDVHu1I=",
83 | "owner": "davhau",
84 | "repo": "pyproject.nix",
85 | "rev": "5a06a2697b228c04dd2f35659b4b659ca74f7aeb",
86 | "type": "github"
87 | },
88 | "original": {
89 | "owner": "davhau",
90 | "ref": "dream2nix",
91 | "repo": "pyproject.nix",
92 | "type": "github"
93 | }
94 | },
95 | "root": {
96 | "inputs": {
97 | "dream2nix": "dream2nix",
98 | "nixpkgs": [
99 | "dream2nix",
100 | "nixpkgs"
101 | ]
102 | }
103 | },
104 | "slimlock": {
105 | "inputs": {
106 | "nixpkgs": [
107 | "dream2nix",
108 | "purescript-overlay",
109 | "nixpkgs"
110 | ]
111 | },
112 | "locked": {
113 | "lastModified": 1688756706,
114 | "narHash": "sha256-xzkkMv3neJJJ89zo3o2ojp7nFeaZc2G0fYwNXNJRFlo=",
115 | "owner": "thomashoneyman",
116 | "repo": "slimlock",
117 | "rev": "cf72723f59e2340d24881fd7bf61cb113b4c407c",
118 | "type": "github"
119 | },
120 | "original": {
121 | "owner": "thomashoneyman",
122 | "repo": "slimlock",
123 | "type": "github"
124 | }
125 | }
126 | },
127 | "root": "root",
128 | "version": 7
129 | }
130 |
--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
{
  # Build with dream2nix; nixpkgs follows dream2nix's pinned revision so
  # both resolve to the same package set.
  inputs = {
    dream2nix.url = "github:nix-community/dream2nix";
    nixpkgs.follows = "dream2nix/nixpkgs";
  };

  outputs = {
    self,
    dream2nix,
    nixpkgs,
  }: let
    # Helper: builds an attrset keyed by each supported system string.
    eachSystem = nixpkgs.lib.genAttrs [
      "aarch64-darwin"
      "aarch64-linux"
      "x86_64-darwin"
      "x86_64-linux"
    ];
  in {
    # Default package: evaluate ./default.nix through dream2nix's module
    # system, rooted at this flake.
    packages = eachSystem (system: {
      default = dream2nix.lib.evalModules {
        packageSets.nixpkgs = import nixpkgs {inherit system;};
        modules = [
          ./default.nix
          {
            paths.projectRoot = ./.;
            paths.projectRootFile = "flake.nix";
            paths.package = ./.;
          }
        ];
      };
    });
    # Dev shell: inherits the dream2nix package's devShell inputs, and adds
    # pre-commit, nox, and each CPython version used by the test matrix.
    devShells = eachSystem (system: let
      pkgs = import nixpkgs {inherit system;};
    in {
      default = pkgs.mkShell {
        inputsFrom = [
          self.packages.${system}.default.devShell
        ];

        packages = with pkgs; [
          pre-commit
          python312Packages.nox

          python39
          python310
          python311
          python312
          python313
        ];
      };
    });
  };
}
54 |
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
1 | # Standard Library
2 | import os
3 |
4 | from pathlib import Path
5 |
6 | # Third Party Library
7 | import nox
8 |
# Abort the whole nox run as soon as one session fails.
nox.options.stop_on_first_error = True


# Python interpreters the test sessions are parametrized over.
pythons = ["3.10", "3.11", "3.12", "3.12", "3.13"][:4] if False else ["3.10", "3.11", "3.12", "3.13"]

# Let nox-managed virtualenvs pick their own interpreter rather than the one
# saved by PDM, and drop any inherited PYTHONPATH so sessions stay isolated.
os.environ.update({"PDM_IGNORE_SAVED_PYTHON": "1"})
os.environ.pop("PYTHONPATH", None)
16 |
17 |
def venv_setup_on_create(session, install):
    # Install dependencies only when the session virtualenv is fresh.
    # The probe runs from a temp dir so the source tree in cwd cannot
    # satisfy `import data_extractor` accidentally.
    # NOTE(review): with silent=True the run's captured output is returned;
    # a failing import produces traceback text (truthy) and triggers
    # `install`, a succeeding import produces "" (falsy) and skips it —
    # confirm this behavior against the pinned nox version.
    cwd = os.getcwd()
    session.cd(session.create_tmp())
    if session.run(
        "python", "-Esc", "import data_extractor", success_codes=(1, 0), silent=True
    ):
        install(session)
    session.cd(cwd)
26 |
27 |
@nox.session(python=pythons, venv_backend="venv")
@nox.parametrize(
    "extractor_backend",
    [
        None,
        "jsonpath-extractor",
        "jsonpath-rw",
        "jsonpath-rw-ext",
        "lxml",
        "cssselect",
    ],
)
def coverage_test(session, extractor_backend):
    """Run the pytest suite with coverage, once per optional backend."""

    def install(sess):
        # Sync the test group, plus the backend's extra group when one
        # is selected for this parametrization.
        groups = ["-G", "test"]
        if extractor_backend:
            groups.extend(["-G", extractor_backend])
        sess.run("pdm", "sync", *groups, external=True)

    venv_setup_on_create(session, install)
    session.run(
        "pytest",
        "-vv",
        "--cov=data_extractor",
        "--cov-append",
        "--ignore",
        "tests/typesafety",
        *session.posargs,
    )
61 |
62 |
@nox.session(python=pythons, venv_backend="venv")
def coverage_report(session):
    """Combine collected coverage data into terminal, XML and HTML reports."""
    venv_setup_on_create(
        session,
        lambda s: s.run("pdm", "sync", "-G", "test", external=True),
    )
    session.run("coverage", "report")
    session.run("coverage", "xml")
    session.run("coverage", "html")
    # A file URI needs "file://" before an absolute path (yielding
    # file:///…); the previous "file:/" + "/abs/path" produced the
    # malformed "file://abs/path", where the first path segment is
    # parsed as a host.
    session.log(
        f">> open file://{(Path() / 'htmlcov/index.html').absolute()} to see coverage"
    )
76 |
@nox.session(python=pythons, venv_backend="venv")
def test_mypy_plugin(session):
    """Run the typesafety suite against the bundled mypy plugin."""
    venv_setup_on_create(
        session,
        lambda s: s.run("pdm", "sync", "-G", "test-mypy-plugin", external=True),
    )

    session.run(
        "pytest",
        "-vv",
        "--cov=data_extractor/contrib/mypy",
        "--cov-append",
        "--mypy-same-process",
        "--mypy-ini-file=./tests/mypy.ini",
        "tests/typesafety",
        # Splatting an empty posargs list is a no-op, so the previous
        # `... if session.posargs else tuple()` wrapper was redundant;
        # this now matches coverage_test's style.
        *session.posargs,
    )
94 |
95 |
@nox.session(python=pythons[-1:], venv_backend="venv")
def build_readme(session):
    """Regenerate README.rst from README.template.rst (latest Python only)."""
    venv_setup_on_create(
        session,
        lambda sess: sess.run("pdm", "sync", "-G", "build_readme", external=True),
    )
    session.run(
        "python",
        "scripts/build_readme.py",
        "README.template.rst",
        "README.rst",
    )
105 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "data_extractor"
3 | authors = [{ name = "林玮 (Jade Lin)", email = "linw1995@icloud.com" }]
4 | description = "Combine XPath, CSS Selectors and JSONPath for Web data extracting."
5 | readme = "README.rst"
6 | classifiers = [
7 | "Intended Audience :: Developers",
8 | "License :: OSI Approved :: MIT License",
9 | "Programming Language :: Python",
10 | "Programming Language :: Python :: 3",
11 | "Programming Language :: Python :: 3.10",
12 | "Programming Language :: Python :: 3.11",
13 | "Programming Language :: Python :: 3.12",
14 | "Programming Language :: Python :: 3.13",
15 | "Development Status :: 5 - Production/Stable",
16 | "Operating System :: POSIX",
17 | "Operating System :: MacOS :: MacOS X",
18 | "Operating System :: Microsoft :: Windows",
19 | ]
20 | keywords = [
21 | "data-extractor",
22 | "data-extraction",
23 | "xpath",
24 | "css-selectors",
25 | "jsonpath",
26 | ]
27 | dependencies = []
28 | requires-python = ">=3.10"
29 | license = { text = "MIT" }
30 | dynamic = ["version"]
31 |
32 | [project.urls]
33 | homepage = "https://github.com/linw1995/data_extractor"
34 | repository = "https://github.com/linw1995/data_extractor"
35 | documentation = "https://data-extractor.readthedocs.io/en/latest/"
36 |
37 | [project.optional-dependencies]
38 | lxml = ["lxml >= 4.3, < 6"]
39 | cssselect = ["lxml >= 4.3, < 6", "cssselect >= 1.0.3, < 2"]
40 | jsonpath-extractor = ["jsonpath-extractor >= 0.5, < 0.9"]
41 | jsonpath-rw = ["jsonpath-rw >= 1.4, < 2"]
42 | jsonpath-rw-ext = ["jsonpath-rw >= 1.4, < 2", "jsonpath-rw-ext >= 1.2, < 2"]
43 |
44 | [build-system]
45 | requires = ["pdm-pep517[setuptools]"]
46 | build-backend = "pdm.pep517.api"
47 |
48 | [tool.commitizen]
49 | name = "cz_conventional_commits"
50 | version = "0.9.0"
51 | tag_format = "v$version"
52 |
53 | [tool.pdm]
54 | includes = [
55 | "data_extractor/*.py",
56 | "data_extractor/py.typed",
57 | "data_extractor/contrib/",
58 | ]
59 | version = { use_scm = true }
60 |
61 | [tool.pdm.dev-dependencies]
62 | docs = [
63 | "lxml >= 4.3, < 6",
64 | "cssselect >= 1.0.3, < 2",
65 | "jsonpath-extractor >= 0.5, < 0.9",
66 | "jsonpath-rw >= 1.4, < 2",
67 | "jsonpath-rw-ext >= 1.2, < 2",
68 | "sphinx ~= 7.4",
69 | ]
70 | build_readme = ["click >= 7.1.2, < 8", "docutils >= 0.16", "pygments ~= 2.8"]
71 | test = ["pytest >= 6, < 8", "pytest-cov >= 2.7.1, < 3"]
72 | test-mypy-plugin = [
73 | "pytest >= 6, < 8",
74 | "pytest-cov >= 2.7.1, < 3",
75 | "pytest-mypy-plugins ~= 1.6",
76 | "mypy~=0.930",
77 | ]
78 |
79 | [[tool.pdm.source]]
80 | name = "pypi"
81 | url = "https://pypi.org/simple"
82 | verify_ssl = true
83 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | # https://github.com/pytest-dev/pytest/issues/3062
2 | # Don't move below settings into setup.cfg
3 | [pytest]
4 | testpaths = ./tests
5 | log_format = %(asctime)s - %(name)s - %(levelname)s - %(message)s
6 | xfail_strict=true
7 |
--------------------------------------------------------------------------------
/scripts/build_readme.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | .. _issues-172: https://github.com/github/markup/issues/172
4 |
5 | Because Github markup do not render :include: directive. (issues-172_)
6 | """
7 |
# Standard Library
from pathlib import Path
from unittest import mock

# Third Party Library
import click
import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
import docutils.parsers.rst.directives.misc
import docutils.statemachine
import docutils.utils
19 |
20 |
@click.command()
@click.argument("source_file")
@click.argument("target_file")
def build_readme(source_file, target_file):
    """Render SOURCE_FILE's ``include`` directives inline into TARGET_FILE.

    Works by monkeypatching docutils while it parses the template:
    ``Include.run`` records the raw directive text, and ``string2lines``
    accumulates all parsed text, substituting the recorded directive text
    with the included file's content when it arrives.
    The target file is only rewritten when its content actually changes.
    """
    old_string2lines = docutils.statemachine.string2lines
    old_run = docutils.parsers.rst.directives.misc.Include.run
    # Accumulated document text as docutils feeds chunks through string2lines.
    text = ""
    # Raw ``.. include::`` block text awaiting replacement by its expansion.
    target_text = None

    def string2lines(*args, **kwargs):
        # Wrapper around docutils' string2lines that captures each chunk.
        nonlocal text, target_text
        if target_text is not None:
            # This chunk is the expansion of the pending include directive:
            # splice it in place of the recorded directive text.
            text = text.replace(target_text, args[0])
            target_text = None
        else:
            text += args[0]

        rv = old_string2lines(*args, **kwargs)
        return rv

    def run(self):
        # Wrapper around Include.run that remembers the directive's raw
        # block text before delegating to the original implementation.
        nonlocal target_text
        target_text = self.block_text
        rv = old_run(self)
        return rv

    with (
        mock.patch.object(docutils.statemachine, "string2lines", string2lines),
        mock.patch.object(docutils.parsers.rst.directives.misc.Include, "run", run),
    ):
        source_file_path: Path = Path.cwd() / source_file
        target_file_path: Path = Path.cwd() / target_file
        parser = docutils.parsers.rst.Parser()
        default_settings = docutils.frontend.OptionParser(
            components=(docutils.parsers.rst.Parser,)
        ).get_default_values()
        document = docutils.utils.new_document(source_file_path.name, default_settings)
        # Parsing for side effects only: the patched hooks build `text`.
        parser.parse(source_file_path.read_text(encoding="utf-8"), document)
        text = text.rstrip() + "\n"
        if (
            target_file_path.exists()
            and target_file_path.read_text(encoding="utf-8") == text
        ):
            # Unchanged output: keep the existing file's mtime intact.
            return

        target_file_path.write_text(text, encoding="utf-8")
67 |
68 |
69 | if __name__ == "__main__":
70 | build_readme()
71 |
--------------------------------------------------------------------------------
/scripts/export_requirements_txt.py:
--------------------------------------------------------------------------------
1 | # Standard Library
2 | import enum
3 | import shlex
4 | import subprocess
5 |
6 | from pathlib import Path
7 |
8 | Format = enum.Enum("Format", "requirements setuppy")
9 | BASE_DIR = Path(__file__).parent / "requirements"
10 |
11 |
def fix_end_of_file(text):
    """Normalize *text* so it ends with exactly one trailing newline."""
    stripped = text.rstrip()
    return f"{stripped}\n"
14 |
15 |
def pdm_export(args, filename, format: Format):
    """Run ``pdm export`` and sync its output to *filename* (write on change only).

    Parameters:
        args: extra arguments passed through to ``pdm export``
            (e.g. ``["--prod"]``).
        filename: destination path (str or ``Path``); parent directories are
            created as needed.
        format: export format; for ``Format.setuppy`` a banner comment is
            prepended to the output.

    Raises:
        RuntimeError: if *filename* did not exist before this call —
            presumably so a hook/CI run fails loudly when a brand-new
            generated file appears and must be committed (TODO confirm
            against the calling hook).
    """
    output = subprocess.check_output(
        shlex.split(f"pdm export -f {format.name} {' '.join(args)}"), encoding="utf-8"
    )
    output = fix_end_of_file(output)
    if format is Format.setuppy:
        output = "\n".join(
            ['# This a dummy setup.py to enable GitHub "Used By" stats', output]
        )
    p = Path(filename)
    # exist_ok avoids the check-then-create race of an exists() guard.
    p.parent.mkdir(parents=True, exist_ok=True)
    is_new = not p.exists()
    # Read/write with an explicit encoding so the comparison matches the
    # UTF-8 text decoded from the subprocess regardless of platform locale.
    if is_new or p.read_text(encoding="utf-8") != output:
        p.write_text(output, encoding="utf-8")
    if is_new:
        raise RuntimeError("Create a new file")
33 |
34 |
# Each entry: (extra ``pdm export`` args, output filename).  All exports use
# the requirements format; order matches the files' generation history.
_EXPORT_SPECS = [
    (["--prod"], "requirements-mini.txt"),
    (["--prod", "-G:all"], "requirements.txt"),
    (["-G:all"], "requirements-dev.txt"),
    (["-G", "docs"], "requirements-docs.txt"),
]

for _extra_args, _out_name in _EXPORT_SPECS:
    pdm_export(
        args=_extra_args,
        filename=BASE_DIR / _out_name,
        format=Format.requirements,
    )
# pdm_export(args=[], filename=BASE_DIR / "setup.py", format=Format.setuppy)
59 |
--------------------------------------------------------------------------------
/scripts/requirements/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | # This file is @generated by PDM.
2 | # Please do not edit it manually.
3 |
4 | alabaster==0.7.16 \
5 | --hash=sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65 \
6 | --hash=sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92
7 | babel==2.16.0 \
8 | --hash=sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b \
9 | --hash=sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316
10 | certifi==2024.12.14 \
11 | --hash=sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56 \
12 | --hash=sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db
13 | charset-normalizer==3.4.0 \
14 | --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \
15 | --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \
16 | --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \
17 | --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \
18 | --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \
19 | --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \
20 | --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \
21 | --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \
22 | --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \
23 | --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \
24 | --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \
25 | --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \
26 | --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \
27 | --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \
28 | --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \
29 | --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \
30 | --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \
31 | --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \
32 | --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \
33 | --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \
34 | --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \
35 | --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \
36 | --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \
37 | --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \
38 | --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \
39 | --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \
40 | --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \
41 | --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \
42 | --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \
43 | --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \
44 | --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \
45 | --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \
46 | --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \
47 | --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \
48 | --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \
49 | --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \
50 | --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \
51 | --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \
52 | --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \
53 | --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \
54 | --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \
55 | --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \
56 | --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \
57 | --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \
58 | --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \
59 | --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \
60 | --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \
61 | --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \
62 | --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \
63 | --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \
64 | --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \
65 | --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \
66 | --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \
67 | --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \
68 | --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \
69 | --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \
70 | --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \
71 | --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \
72 | --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \
73 | --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \
74 | --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \
75 | --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482
76 | colorama==0.4.6; sys_platform == "win32" \
77 | --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
78 | --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
79 | cssselect==1.2.0 \
80 | --hash=sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc \
81 | --hash=sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e
82 | decorator==5.1.1 \
83 | --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \
84 | --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186
85 | docutils==0.21.2 \
86 | --hash=sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f \
87 | --hash=sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2
88 | idna==3.10 \
89 | --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
90 | --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
91 | imagesize==1.4.1 \
92 | --hash=sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b \
93 | --hash=sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a
94 | jinja2==3.1.5 \
95 | --hash=sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb \
96 | --hash=sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb
97 | jsonpath-extractor==0.8.0 \
98 | --hash=sha256:08c53808f981fbd27f3488687940607b6213da38cc8c67e56cb41610acd53783 \
99 | --hash=sha256:e82fcd6ae89123eb5ea09a2afb76d2884346369d0cd0c9509efff65c49fd15b6
100 | jsonpath-rw==1.4.0 \
101 | --hash=sha256:05c471281c45ae113f6103d1268ec7a4831a2e96aa80de45edc89b11fac4fbec
102 | jsonpath-rw-ext==1.2.2 \
103 | --hash=sha256:0947e018c4e6d46f9d04c56487793c702eb225fa252891aa4ed41a9ca26f3d84 \
104 | --hash=sha256:a9e44e803b6d87d135b09d1e5af0db4d4cf97ba62711a80aa51c8c721980a994
105 | lxml==5.3.0 \
106 | --hash=sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3 \
107 | --hash=sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002 \
108 | --hash=sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd \
109 | --hash=sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832 \
110 | --hash=sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e \
111 | --hash=sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30 \
112 | --hash=sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51 \
113 | --hash=sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4 \
114 | --hash=sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4 \
115 | --hash=sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86 \
116 | --hash=sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8 \
117 | --hash=sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f \
118 | --hash=sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03 \
119 | --hash=sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e \
120 | --hash=sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99 \
121 | --hash=sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7 \
122 | --hash=sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d \
123 | --hash=sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22 \
124 | --hash=sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492 \
125 | --hash=sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b \
126 | --hash=sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f \
127 | --hash=sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a \
128 | --hash=sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a \
129 | --hash=sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4 \
130 | --hash=sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442 \
131 | --hash=sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b \
132 | --hash=sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c \
133 | --hash=sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1 \
134 | --hash=sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be \
135 | --hash=sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367 \
136 | --hash=sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e \
137 | --hash=sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16 \
138 | --hash=sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d \
139 | --hash=sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83 \
140 | --hash=sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba \
141 | --hash=sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763 \
142 | --hash=sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff \
143 | --hash=sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b \
144 | --hash=sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c \
145 | --hash=sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8 \
146 | --hash=sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f \
147 | --hash=sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a \
148 | --hash=sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce \
149 | --hash=sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1 \
150 | --hash=sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330 \
151 | --hash=sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18 \
152 | --hash=sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff \
153 | --hash=sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c \
154 | --hash=sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179 \
155 | --hash=sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080 \
156 | --hash=sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d \
157 | --hash=sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32 \
158 | --hash=sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a \
159 | --hash=sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79 \
160 | --hash=sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3 \
161 | --hash=sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5 \
162 | --hash=sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f \
163 | --hash=sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d \
164 | --hash=sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3 \
165 | --hash=sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9 \
166 | --hash=sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957 \
167 | --hash=sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb \
168 | --hash=sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656 \
169 | --hash=sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b \
170 | --hash=sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d \
171 | --hash=sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd \
172 | --hash=sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859 \
173 | --hash=sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a \
174 | --hash=sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005 \
175 | --hash=sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654 \
176 | --hash=sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80 \
177 | --hash=sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec \
178 | --hash=sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7 \
179 | --hash=sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965 \
180 | --hash=sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8
181 | markupsafe==3.0.2 \
182 | --hash=sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4 \
183 | --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \
184 | --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \
185 | --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \
186 | --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \
187 | --hash=sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca \
188 | --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \
189 | --hash=sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832 \
190 | --hash=sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b \
191 | --hash=sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579 \
192 | --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \
193 | --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \
194 | --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \
195 | --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \
196 | --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \
197 | --hash=sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb \
198 | --hash=sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e \
199 | --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \
200 | --hash=sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a \
201 | --hash=sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d \
202 | --hash=sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b \
203 | --hash=sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8 \
204 | --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \
205 | --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \
206 | --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \
207 | --hash=sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d \
208 | --hash=sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93 \
209 | --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \
210 | --hash=sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158 \
211 | --hash=sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84 \
212 | --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \
213 | --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \
214 | --hash=sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171 \
215 | --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \
216 | --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \
217 | --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \
218 | --hash=sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d \
219 | --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \
220 | --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \
221 | --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \
222 | --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \
223 | --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \
224 | --hash=sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798 \
225 | --hash=sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c \
226 | --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \
227 | --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \
228 | --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \
229 | --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \
230 | --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \
231 | --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430 \
232 | --hash=sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50
233 | packaging==24.2 \
234 | --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \
235 | --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f
236 | pbr==6.1.0 \
237 | --hash=sha256:788183e382e3d1d7707db08978239965e8b9e4e5ed42669bf4758186734d5f24 \
238 | --hash=sha256:a776ae228892d8013649c0aeccbb3d5f99ee15e005a4cbb7e61d55a067b28a2a
239 | ply==3.11 \
240 | --hash=sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3 \
241 | --hash=sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce
242 | pygments==2.18.0 \
243 | --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \
244 | --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a
245 | requests==2.32.3 \
246 | --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \
247 | --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6
248 | six==1.17.0 \
249 | --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
250 | --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
251 | snowballstemmer==2.2.0 \
252 | --hash=sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1 \
253 | --hash=sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a
254 | sphinx==7.4.7 \
255 | --hash=sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe \
256 | --hash=sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239
257 | sphinxcontrib-applehelp==2.0.0 \
258 | --hash=sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1 \
259 | --hash=sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5
260 | sphinxcontrib-devhelp==2.0.0 \
261 | --hash=sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad \
262 | --hash=sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2
263 | sphinxcontrib-htmlhelp==2.1.0 \
264 | --hash=sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8 \
265 | --hash=sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9
266 | sphinxcontrib-jsmath==1.0.1 \
267 | --hash=sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178 \
268 | --hash=sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8
269 | sphinxcontrib-qthelp==2.0.0 \
270 | --hash=sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab \
271 | --hash=sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb
272 | sphinxcontrib-serializinghtml==2.0.0 \
273 | --hash=sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331 \
274 | --hash=sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d
275 | tomli==2.2.1; python_version < "3.11" \
276 | --hash=sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6 \
277 | --hash=sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd \
278 | --hash=sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c \
279 | --hash=sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b \
280 | --hash=sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8 \
281 | --hash=sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6 \
282 | --hash=sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77 \
283 | --hash=sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff \
284 | --hash=sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea \
285 | --hash=sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192 \
286 | --hash=sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249 \
287 | --hash=sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee \
288 | --hash=sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4 \
289 | --hash=sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98 \
290 | --hash=sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8 \
291 | --hash=sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4 \
292 | --hash=sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281 \
293 | --hash=sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744 \
294 | --hash=sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69 \
295 | --hash=sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13 \
296 | --hash=sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140 \
297 | --hash=sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e \
298 | --hash=sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e \
299 | --hash=sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc \
300 | --hash=sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff \
301 | --hash=sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec \
302 | --hash=sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2 \
303 | --hash=sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222 \
304 | --hash=sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106 \
305 | --hash=sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272 \
306 | --hash=sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a \
307 | --hash=sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7
308 | typing-extensions==3.10.0.2 \
309 | --hash=sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e \
310 | --hash=sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34
311 | urllib3==2.3.0 \
312 | --hash=sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df \
313 | --hash=sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d
314 | --index-url https://pypi.org/simple
315 |
--------------------------------------------------------------------------------
/scripts/requirements/requirements-mini.txt:
--------------------------------------------------------------------------------
1 | # This file is @generated by PDM.
2 | # Please do not edit it manually.
3 |
4 | --index-url https://pypi.org/simple
5 |
--------------------------------------------------------------------------------
/scripts/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file is @generated by PDM.
2 | # Please do not edit it manually.
3 |
4 | cssselect==1.2.0 \
5 | --hash=sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc \
6 | --hash=sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e
7 | decorator==5.1.1 \
8 | --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \
9 | --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186
10 | jsonpath-extractor==0.8.0 \
11 | --hash=sha256:08c53808f981fbd27f3488687940607b6213da38cc8c67e56cb41610acd53783 \
12 | --hash=sha256:e82fcd6ae89123eb5ea09a2afb76d2884346369d0cd0c9509efff65c49fd15b6
13 | jsonpath-rw==1.4.0 \
14 | --hash=sha256:05c471281c45ae113f6103d1268ec7a4831a2e96aa80de45edc89b11fac4fbec
15 | jsonpath-rw-ext==1.2.2 \
16 | --hash=sha256:0947e018c4e6d46f9d04c56487793c702eb225fa252891aa4ed41a9ca26f3d84 \
17 | --hash=sha256:a9e44e803b6d87d135b09d1e5af0db4d4cf97ba62711a80aa51c8c721980a994
18 | lxml==5.3.0 \
19 | --hash=sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3 \
20 | --hash=sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002 \
21 | --hash=sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd \
22 | --hash=sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832 \
23 | --hash=sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e \
24 | --hash=sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30 \
25 | --hash=sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51 \
26 | --hash=sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4 \
27 | --hash=sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4 \
28 | --hash=sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86 \
29 | --hash=sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8 \
30 | --hash=sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f \
31 | --hash=sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03 \
32 | --hash=sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e \
33 | --hash=sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99 \
34 | --hash=sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7 \
35 | --hash=sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d \
36 | --hash=sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22 \
37 | --hash=sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492 \
38 | --hash=sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b \
39 | --hash=sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f \
40 | --hash=sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a \
41 | --hash=sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a \
42 | --hash=sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4 \
43 | --hash=sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442 \
44 | --hash=sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b \
45 | --hash=sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c \
46 | --hash=sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1 \
47 | --hash=sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be \
48 | --hash=sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367 \
49 | --hash=sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e \
50 | --hash=sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16 \
51 | --hash=sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d \
52 | --hash=sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83 \
53 | --hash=sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba \
54 | --hash=sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763 \
55 | --hash=sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff \
56 | --hash=sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b \
57 | --hash=sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c \
58 | --hash=sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8 \
59 | --hash=sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f \
60 | --hash=sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a \
61 | --hash=sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce \
62 | --hash=sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1 \
63 | --hash=sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330 \
64 | --hash=sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18 \
65 | --hash=sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff \
66 | --hash=sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c \
67 | --hash=sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179 \
68 | --hash=sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080 \
69 | --hash=sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d \
70 | --hash=sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32 \
71 | --hash=sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a \
72 | --hash=sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79 \
73 | --hash=sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3 \
74 | --hash=sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5 \
75 | --hash=sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f \
76 | --hash=sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d \
77 | --hash=sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3 \
78 | --hash=sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9 \
79 | --hash=sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957 \
80 | --hash=sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb \
81 | --hash=sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656 \
82 | --hash=sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b \
83 | --hash=sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d \
84 | --hash=sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd \
85 | --hash=sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859 \
86 | --hash=sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a \
87 | --hash=sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005 \
88 | --hash=sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654 \
89 | --hash=sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80 \
90 | --hash=sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec \
91 | --hash=sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7 \
92 | --hash=sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965 \
93 | --hash=sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8
94 | pbr==6.1.0 \
95 | --hash=sha256:788183e382e3d1d7707db08978239965e8b9e4e5ed42669bf4758186734d5f24 \
96 | --hash=sha256:a776ae228892d8013649c0aeccbb3d5f99ee15e005a4cbb7e61d55a067b28a2a
97 | ply==3.11 \
98 | --hash=sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3 \
99 | --hash=sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce
100 | six==1.17.0 \
101 | --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
102 | --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
103 | typing-extensions==3.10.0.2 \
104 | --hash=sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e \
105 | --hash=sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34
106 | --index-url https://pypi.org/simple
107 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | extend-ignore = E203, W503
4 |
5 | [isort]
6 | profile=black
7 | lines_between_types=1
8 |
9 | import_heading_stdlib=Standard Library
10 | import_heading_thirdparty=Third Party Library
11 | import_heading_firstparty=First Party Library
12 | import_heading_localfolder=Local Folder
13 |
14 | [coverage:run]
15 | branch = true
16 | omit =
17 | site-packages
18 |
19 | [coverage:report]
20 | precision = 2
21 | # Regexes for lines to exclude from consideration
22 | exclude_lines =
23 | # Have to re-enable the standard pragma
24 | pragma: no cover
25 |
26 | # Don't complain about missing debug-only code:
27 | def __repr__
28 | if self\.debug
29 |
30 | # Don't complain if tests don't hit defensive assertion code:
31 | raise AssertionError
32 | raise NotImplementedError
33 |
34 | # Don't complain if non-runnable code isn't run:
35 | if 0:
36 | if __name__ == .__main__.:
37 | if TYPE_CHECKING:
38 |
39 | # type annotations
40 | @overload
41 |
42 |
43 | ignore_errors = True
44 |
45 | [mypy]
46 | follow_imports = silent
47 | warn_redundant_casts = true
48 | check_untyped_defs = true
49 | disallow_any_generics = false
50 | no_implicit_optional = true
51 | #disallow_untyped_defs = true
52 | #warn_unused_ignores = true
53 | plugins = data_extractor.contrib.mypy:plugin
54 |
55 | [mypy-lxml.*]
56 | ignore_missing_imports = true
57 |
58 | [mypy-cssselect.*]
59 | ignore_missing_imports = true
60 |
61 | [mypy-jsonpath.*]
62 | ignore_missing_imports = true
63 |
64 | [mypy-jsonpath_rw.*]
65 | ignore_missing_imports = true
66 |
67 | [mypy-jsonpath_rw_ext.*]
68 | ignore_missing_imports = true
69 |
70 | [mypy-mypy.*]
71 | ignore_missing_imports = true
72 |
73 | [mypy-tests.*]
74 | disallow_untyped_defs = false
75 |
76 | [mypy-pytest.*]
77 | ignore_missing_imports = true
78 |
79 | [mypy-_pytest.*]
80 | ignore_missing_imports = true
81 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Standard Library
2 | import platform
3 | import sys
4 |
5 | from pathlib import Path
6 |
7 | current_python_version = "%s.%s" % platform.python_version_tuple()[:2]
8 |
 9 | # When executing the pytest CLI, sys.path will be changed.
10 | # The jsonpath-extractor package's module `jsonpath` has the same name as
11 | # the file `jsonpath.py` in f'{sys.prefix}/bin'.
12 | # So we need to remove that file to avoid importing the wrong module.
13 | for p in [
14 | Path(f"{sys.prefix}/bin/jsonpath.py"),
15 | Path(f"__pypackages__/{current_python_version}/bin/jsonpath.py"),
16 | ]:
17 | if p.exists():
18 | p.unlink()
19 |
20 | # pdm
21 |
--------------------------------------------------------------------------------
/tests/assets/sample-rss-2.xml:
--------------------------------------------------------------------------------
1 |
2 |