├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── release.yml
    │   └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── Makefile
├── README.rst
├── README.template.rst
├── data_extractor
    ├── __init__.py
    ├── contrib
    │   └── mypy
    │   │   └── __init__.py
    ├── core.py
    ├── exceptions.py
    ├── item.py
    ├── json.py
    ├── lxml.py
    ├── py.typed
    └── utils.py
├── default.nix
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── _static
    │       └── custom.css
    │   ├── api_core.rst
    │   ├── api_exceptions.rst
    │   ├── api_item.rst
    │   ├── api_json.rst
    │   ├── api_lxml.rst
    │   ├── api_reference.rst
    │   ├── api_utils.rst
    │   ├── changelog.rst
    │   ├── conf.py
    │   ├── contributing.rst
    │   ├── history.rst
    │   ├── howto
    │       ├── index.rst
    │       ├── item.rst
    │       ├── json.rst
    │       └── lxml.rst
    │   ├── index.rst
    │   ├── installation.rst
    │   ├── quickstarts.rst
    │   └── readme.rst
├── flake.lock
├── flake.nix
├── noxfile.py
├── pdm.lock
├── pyproject.toml
├── pytest.ini
├── scripts
    ├── build_readme.py
    ├── export_requirements_txt.py
    └── requirements
    │   ├── requirements-dev.txt
    │   ├── requirements-docs.txt
    │   ├── requirements-mini.txt
    │   └── requirements.txt
├── setup.cfg
└── tests
    ├── __init__.py
    ├── assets
        └── sample-rss-2.xml
    ├── conftest.py
    ├── mypy.ini
    ├── test_exceptions.py
    ├── test_generic_item.py
    ├── test_item.py
    ├── test_json.py
    ├── test_lxml.py
    ├── test_utils.py
    ├── typesafety
        ├── conftest.py
        ├── test_extracted_typed_dict.yml
        └── test_generic.yml
    └── utils.py


/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "pip"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "weekly"
7 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - "*"
 7 | 
 8 | jobs:
 9 |   release:
10 |     runs-on: ubuntu-latest
11 |     permissions:
12 |       # Required by the "Publish release to GitHub Release" step.
13 |       contents: write
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |         with:
17 |           # checkout@v1 cloned the full history; keep doing so after the upgrade.
18 |           fetch-depth: 0
19 |       - name: Set up PDM
20 |         uses: pdm-project/setup-pdm@v3
21 |         with:
22 |           # Quoted so the version is never read as a float.
23 |           python-version: "3.9"
24 |           version: 2.19.2
25 |       - name: Build release
26 |         run: |
27 |           pdm build
28 |       - name: Upload release
29 |         uses: actions/upload-artifact@v4
30 |         with:
31 |           name: dist
32 |           path: dist
33 |       - name: Publish release to PYPI
34 |         # Credentials go through the environment variables twine reads, so they
35 |         # are never expanded into the shell command line.
36 |         env:
37 |           TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
38 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
39 |         run: |
40 |           pip install twine
41 |           twine upload --verbose dist/*
42 |       - name: Publish release to GitHub Release
43 |         uses: softprops/action-gh-release@v2
44 |         with:
45 |           files: dist/*
46 |         env:
47 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
48 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Lint&Test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |       - release/*
 8 |   pull_request:
 9 |     branches:
10 |       - "*"
11 | 
12 | jobs:
13 |   lint:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: actions/checkout@v1
17 |       - name: Set up PDM
18 |         uses: pdm-project/setup-pdm@v3
19 |         with:
20 |           python-version: 3.13
21 |           version: 2.19.2
22 |       - name: Cache Nox Virtualenvs
23 |         uses: actions/cache@v1
24 |         with:
25 |           path: .nox
26 |           key: ${{ runner.os }}-nox-${{ hashFiles('**/pdm.lock') }}
27 |           restore-keys: ${{ runner.os }}-nox
28 |       - name: Install nox
29 |         run: |
30 |           pip install nox
31 |           pdm config python.use_venv true
32 |       - uses: pre-commit/action@v2.0.0
33 |         env:
34 |           SKIP: export_requirements_txt
35 |   test:
36 |     needs: lint
37 |     runs-on: ubuntu-latest
38 |     strategy:
39 |       matrix:
40 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
41 |     steps:
42 |       - uses: actions/checkout@v1
43 |       - name: Set up PDM
44 |         uses: pdm-project/setup-pdm@v3
45 |         with:
46 |           python-version: ${{ matrix.python-version }}
47 |           version: 2.19.2
48 |       - name: Cache Nox Virtualenvs
49 |         uses: actions/cache@v1
50 |         with:
51 |           path: .nox
52 |           key: ${{ runner.os }}-${{ matrix.python-version }}-nox-${{ hashFiles('**/pdm.lock') }}
53 |           restore-keys: ${{ runner.os }}-${{ matrix.python-version }}-nox
54 |       - name: Install nox
55 |         run: |
56 |           pip install nox
57 |           pdm config python.use_venv true
58 |       - name: Test with coverage
59 |         run: |
60 |           make PYTHON=${{ matrix.python-version }} cov
61 |       - name: Upload coverage to Codecov
62 |         uses: codecov/codecov-action@v2
63 |         with:
64 |           token: ${{ secrets.CODECOV_TOKEN }}
65 |           flags: main,unittest,${{ matrix.python-version }}
66 |           fail_ci_if_error: true
67 |   test-mypy-plugin:
68 |     needs: lint
69 |     runs-on: ubuntu-latest
70 |     strategy:
71 |       matrix:
72 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
73 |     steps:
74 |       - uses: actions/checkout@v1
75 |       - name: Set up PDM
76 |         uses: pdm-project/setup-pdm@v3
77 |         with:
78 |           python-version: ${{ matrix.python-version }}
79 |       - name: Cache Nox Virtualenvs
80 |         uses: actions/cache@v1
81 |         with:
82 |           path: .nox
83 |           key: ${{ runner.os }}-${{ matrix.python-version }}-nox-${{ hashFiles('**/pdm.lock') }}
84 |           restore-keys: ${{ runner.os }}-${{ matrix.python-version }}-nox
85 |       - name: Install nox
86 |         run: |
87 |           pip install nox
88 |           pdm config python.use_venv true
89 |       - name: Test
90 |         run: |
91 |           make PYTHON=${{ matrix.python-version }} test-mypy-plugin
92 |       - name: Upload coverage to Codecov
93 |         uses: codecov/codecov-action@v2
94 |         with:
95 |           token: ${{ secrets.CODECOV_TOKEN }}
96 |           flags: plugin-mypy,unittest,${{ matrix.python-version }}
97 |           fail_ci_if_error: true
98 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.toptal.com/developers/gitignore/api/python,emacs
  3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,emacs
  4 | 
  5 | ### Emacs ###
  6 | # -*- mode: gitignore; -*-
  7 | *~
  8 | \#*\#
  9 | /.emacs.desktop
 10 | /.emacs.desktop.lock
 11 | *.elc
 12 | auto-save-list
 13 | tramp
 14 | .\#*
 15 | 
 16 | # Org-mode
 17 | .org-id-locations
 18 | *_archive
 19 | ltximg/**
 20 | 
 21 | # flymake-mode
 22 | *_flymake.*
 23 | 
 24 | # eshell files
 25 | /eshell/history
 26 | /eshell/lastdir
 27 | 
 28 | # elpa packages
 29 | /elpa/
 30 | 
 31 | # reftex files
 32 | *.rel
 33 | 
 34 | # AUCTeX auto folder
 35 | /auto/
 36 | 
 37 | # cask packages
 38 | .cask/
 39 | dist/
 40 | 
 41 | # Flycheck
 42 | flycheck_*.el
 43 | 
 44 | # server auth directory
 45 | /server/
 46 | 
 47 | # projectiles files
 48 | .projectile
 49 | 
 50 | # directory configuration
 51 | .dir-locals.el
 52 | 
 53 | # network security
 54 | /network-security.data
 55 | 
 56 | 
 57 | ### Python ###
 58 | # Byte-compiled / optimized / DLL files
 59 | __pycache__/
 60 | *.py[cod]
 61 | *$py.class
 62 | 
 63 | # C extensions
 64 | *.so
 65 | 
 66 | # Distribution / packaging
 67 | .Python
 68 | build/
 69 | develop-eggs/
 70 | downloads/
 71 | eggs/
 72 | .eggs/
 73 | lib/
 74 | lib64/
 75 | parts/
 76 | sdist/
 77 | var/
 78 | wheels/
 79 | pip-wheel-metadata/
 80 | share/python-wheels/
 81 | *.egg-info/
 82 | .installed.cfg
 83 | *.egg
 84 | MANIFEST
 85 | 
 86 | # PyInstaller
 87 | #  Usually these files are written by a python script from a template
 88 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 89 | *.manifest
 90 | *.spec
 91 | 
 92 | # Installer logs
 93 | pip-log.txt
 94 | pip-delete-this-directory.txt
 95 | 
 96 | # Unit test / coverage reports
 97 | htmlcov/
 98 | .tox/
 99 | .nox/
100 | .coverage
101 | .coverage.*
102 | .cache
103 | nosetests.xml
104 | coverage.xml
105 | *.cover
106 | *.py,cover
107 | .hypothesis/
108 | .pytest_cache/
109 | pytestdebug.log
110 | 
111 | # Translations
112 | *.mo
113 | *.pot
114 | 
115 | # Django stuff:
116 | *.log
117 | local_settings.py
118 | db.sqlite3
119 | db.sqlite3-journal
120 | 
121 | # Flask stuff:
122 | instance/
123 | .webassets-cache
124 | 
125 | # Scrapy stuff:
126 | .scrapy
127 | 
128 | # Sphinx documentation
129 | docs/_build/
130 | doc/_build/
131 | 
132 | # PyBuilder
133 | target/
134 | 
135 | # Jupyter Notebook
136 | .ipynb_checkpoints
137 | 
138 | # IPython
139 | profile_default/
140 | ipython_config.py
141 | 
142 | # pyenv
143 | .python-version
144 | 
145 | # pipenv
146 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
147 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
148 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
149 | #   install all needed dependencies.
150 | #Pipfile.lock
151 | 
152 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
153 | __pypackages__/
154 | 
155 | # Celery stuff
156 | celerybeat-schedule
157 | celerybeat.pid
158 | 
159 | # SageMath parsed files
160 | *.sage.py
161 | 
162 | # Environments
163 | .env
164 | .venv
165 | env/
166 | venv/
167 | ENV/
168 | env.bak/
169 | venv.bak/
170 | pythonenv*
171 | 
172 | # Spyder project settings
173 | .spyderproject
174 | .spyproject
175 | 
176 | # Rope project settings
177 | .ropeproject
178 | 
179 | # mkdocs documentation
180 | /site
181 | 
182 | # mypy
183 | .mypy_cache/
184 | .dmypy.json
185 | dmypy.json
186 | 
187 | # Pyre type checker
188 | .pyre/
189 | 
190 | # pytype static type analyzer
191 | .pytype/
192 | 
193 | # profiling data
194 | .prof
195 | 
196 | # End of https://www.toptal.com/developers/gitignore/api/python,emacs
197 | 
198 | ### Custom ###
199 | ## IDEA
200 | .vscode
201 | ## Emacs
202 | .persp-confs
203 | ## Makefile
204 | ## pdm
205 | .pdm.toml
206 | ## MacOS
207 | .DS_Store
208 | ## misc
209 | .dream2nix/
210 | .pdm-python
211 | pdm.toml
212 | result
213 | .envrc
214 | .direnv
215 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/commitizen-tools/commitizen
 3 |     rev: v3.27.0
 4 |     hooks:
 5 |       - id: commitizen
 6 |         stages:
 7 |           - commit-msg
 8 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 9 |     rev: v4.6.0
10 |     hooks:
11 |       - id: check-symlinks
12 |       - id: check-toml
13 |       - id: check-yaml
14 |         args: [--unsafe]
15 |       - id: detect-private-key
16 |       - id: end-of-file-fixer
17 |       - id: trailing-whitespace
18 |       - id: check-added-large-files
19 |       - id: mixed-line-ending
20 |         args: [--fix=lf]
21 |   - repo: https://github.com/pre-commit/pygrep-hooks
22 |     rev: v1.10.0
23 |     hooks:
24 |       - id: python-check-blanket-noqa
25 |       - id: python-check-mock-methods
26 |       - id: python-no-eval
27 |       - id: python-no-log-warn
28 |       - id: python-use-type-annotations
29 |       - id: rst-backticks
30 |   - repo: https://github.com/psf/black
31 |     rev: 24.4.2
32 |     hooks:
33 |       - id: black
34 |   - repo: https://github.com/asottile/blacken-docs
35 |     rev: 1.16.0
36 |     hooks:
37 |       - id: blacken-docs
38 |         additional_dependencies: [black==23.3.*]
39 |   - repo: https://github.com/PyCQA/flake8
40 |     rev: 7.0.0
41 |     hooks:
42 |       - id: flake8
43 |         additional_dependencies: ["flake8-bugbear==23.5.*"]
44 |   - repo: https://github.com/pre-commit/mirrors-mypy
45 |     rev: v1.10.0
46 |     hooks:
47 |       - id: mypy
48 |         files: data_extractor/.+\.py$
49 |         pass_filenames: false
50 |         entry: bash -c 'env PYTHONPATH=.:$PYTHONPATH mypy data_extractor --show-traceback'
51 |   - repo: https://github.com/pre-commit/mirrors-isort
52 |     rev: v5.10.1
53 |     hooks:
54 |       - id: isort
55 |   - repo: https://github.com/PyCQA/doc8
56 |     rev: v1.1.1
57 |     hooks:
58 |       - id: doc8
59 |   - repo: local
60 |     hooks:
61 |       - id: build_readme
62 |         name: build_readme
63 |         description: Build README.rst
64 |         entry: nox -s build_readme
65 |         language: system
66 |         pass_filenames: false
67 |         types: [rst]
68 |       - id: export_requirements_txt
69 |         name: export_requirements_txt
70 |         description: create requirement file for python
71 |         entry: python3 scripts/export_requirements_txt.py
72 |         language: system
73 |         files: pdm.lock
74 |         pass_filenames: false
75 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: ubuntu-22.04
 5 |   tools:
 6 |     python: "3.12"
 7 | 
 8 | python:
 9 |   install:
10 |     - requirements: ./scripts/requirements/requirements-docs.txt
11 |     - path: .
12 | 
13 | sphinx:
14 |   builder: html
15 |   configuration: docs/source/conf.py
16 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |  advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |  address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |  professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at linw1995@icloud.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 林玮
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | help:
 2 | 	@echo "PYTHON=X.Y init		setup development environment with specific Python version"
 3 | 	@echo "init			setup development environment with default Python version 3.13"
 4 | 	@echo "update-dev		update development dependencies via pdm and via pre-commit"
 5 | 	@echo "update			update all dependencies via pdm and via pre-commit"
 6 | 	@echo "pre-commit		setup git hooks"
 7 | 	@echo "check-all		run code quality checkers"
 8 | 	@echo "test			run quick tests"
 9 | 	@echo "vtest			run quick tests with verbose"
10 | 	@echo "PYTHON=X.Y cov		run tests with coverage and with specific Python version"
11 | 	@echo "cov			run tests with coverage and with default Python version 3.13"
12 | 	@echo "test-mypy-plugin	run mypy plugin tests"
13 | 	@echo "type-check		run static type checking"
14 | 
15 | EMPTY :=
16 | SPACE := $(EMPTY) $(EMPTY)
17 | 
18 | PYTHON = 3.13
19 | EXTRAS = lxml cssselect jsonpath-extractor jsonpath-rw jsonpath-rw-ext
20 | DEV_EXTRAS = test test-mypy-plugin docs
21 | EXTRAS_ARGS = $(if $(EXTRAS),-G,) $(subst $(SPACE),$(SPACE)-G$(SPACE),$(EXTRAS))
22 | DEV_EXTRAS_ARGS = $(if $(DEV_EXTRAS),-G,) $(subst $(SPACE),$(SPACE)-G$(SPACE),$(DEV_EXTRAS))
23 | 
24 | # Environment setup
25 | init:
26 | 	@echo ">> installing $(if $(EXTRAS),\"$(EXTRAS)\" ,)$(if $(DEV_EXTRAS),\"$(DEV_EXTRAS)\" ,)dependencies by pdm"
27 | 	$(if $(PYTHON),pdm use -f $(PYTHON),)
28 | 	pdm info && pdm info --env
29 | 	pdm install $(EXTRAS_ARGS) $(DEV_EXTRAS_ARGS)
30 | 	pdm config -l python.use_venv true
31 | 
32 | deinit:
33 | 	rm -rf .nox
34 | 	rm -rf __pypackages__
35 | 	rm -rf .mypy_cache
36 | 	rm -rf htmlcov
37 | 	rm -rf .pytest_cache
38 | 	rm -rf *.egg-info
39 | 
40 | update-dev:
41 | 	pdm update $(DEV_EXTRAS_ARGS) $(EXTRAS_ARGS)
42 | 	pre-commit autoupdate
43 | 
44 | update:
45 | 	pdm update
46 | 	pre-commit autoupdate
47 | 
48 | # Environment setup end
49 | 
50 | pre-commit:
51 | 	pre-commit install --hook-type commit-msg --hook-type pre-commit --overwrite
52 | 
53 | check-all:
54 | 	pre-commit run --all-files
55 | 
56 | type-check:
57 | 	pre-commit run mypy
58 | 
59 | test:
60 | 	pdm run pytest -q -x --ff --nf --ignore tests/typesafety
61 | 
62 | vtest:
63 | 	pdm run pytest -vv -x --ff --nf --ignore tests/typesafety
64 | 
65 | test-mypy-plugin:
66 | 	rm -rf .coverage
67 | 	nox -p $(PYTHON) -s test_mypy_plugin coverage_report -- $(TARGET)
68 | 
69 | test-mypy-plugin-full:
70 | 	rm -rf .coverage
71 | 	nox -s test_mypy_plugin -- $(TARGET)
72 | 	nox -p 3.10 -s coverage_report
73 | 
74 | cov:
75 | 	rm -rf .coverage
76 | 	nox -p $(PYTHON) -s coverage_test coverage_report -- $(TARGET)
77 | 
78 | cov-full:
79 | 	rm -rf .coverage
80 | 	nox -s coverage_test -- $(TARGET)
81 | 	nox -p 3.10 -s coverage_report
82 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | ==============
  2 | Data Extractor
  3 | ==============
  4 | 
  5 | |license| |Pypi Status| |Python version| |Package version| |PyPI - Downloads|
  6 | |GitHub last commit| |Code style: black| |Build Status| |codecov|
  7 | |Documentation Status| |PDM managed|
  8 | 
  9 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting.
 10 | 
 11 | Quickstarts
 12 | <<<<<<<<<<<
 13 | 
 14 | Installation
 15 | ~~~~~~~~~~~~
 16 | 
 17 | Install the stable version from PYPI.
 18 | 
 19 | .. code-block:: shell
 20 | 
 21 |     pip install "data-extractor[jsonpath-extractor]"  # for extracting JSON data
 22 |     pip install "data-extractor[lxml]"  # for extracting HTML data
 23 | 
 24 | Or install the latest version from GitHub.
 25 | 
 26 | .. code-block:: shell
 27 | 
 28 |     pip install "data-extractor[jsonpath-extractor] @ git+https://github.com/linw1995/data_extractor.git@master"
 29 | 
 30 | Extract JSON data
 31 | ~~~~~~~~~~~~~~~~~
 32 | 
 33 | Extracting JSON data is currently supported through the optional dependencies below:
 34 | 
 35 | - jsonpath-extractor_
 36 | - jsonpath-rw_
 37 | - jsonpath-rw-ext_
 38 | 
 39 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath
 40 | .. _jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw
 41 | .. _jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/
 42 | 
 43 | Install any one of them to extract JSON data.
 44 | 
 45 | Extract HTML(XML) data
 46 | ~~~~~~~~~~~~~~~~~~~~~~
 47 | 
 48 | Extracting HTML (XML) data is currently supported through the optional dependencies below:
 49 | 
 50 | - lxml_ for using XPath_
 51 | - cssselect_ for using CSS-Selectors_
 52 | 
 53 | .. _lxml: https://lxml.de/
 54 | .. _XPath: https://www.w3.org/TR/xpath-10/
 55 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/
 56 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/
 57 | 
 58 | Usage
 59 | ~~~~~
 60 | 
 61 | .. code-block:: python3
 62 | 
 63 |     from data_extractor import Field, Item, JSONExtractor
 64 | 
 65 | 
 66 |     class Count(Item):
 67 |         followings = Field(JSONExtractor("countFollowings"))
 68 |         fans = Field(JSONExtractor("countFans"))
 69 | 
 70 | 
 71 |     class User(Item):
 72 |         name_ = Field(JSONExtractor("name"), name="name")
 73 |         age = Field(JSONExtractor("age"), default=17)
 74 |         count = Count()
 75 | 
 76 | 
 77 |     assert User(JSONExtractor("data.users[*]"), is_many=True).extract(
 78 |         {
 79 |             "data": {
 80 |                 "users": [
 81 |                     {
 82 |                         "name": "john",
 83 |                         "age": 19,
 84 |                         "countFollowings": 14,
 85 |                         "countFans": 212,
 86 |                     },
 87 |                     {
 88 |                         "name": "jack",
 89 |                         "description": "",
 90 |                         "countFollowings": 54,
 91 |                         "countFans": 312,
 92 |                     },
 93 |                 ]
 94 |             }
 95 |         }
 96 |     ) == [
 97 |         {"name": "john", "age": 19, "count": {"followings": 14, "fans": 212}},
 98 |         {"name": "jack", "age": 17, "count": {"followings": 54, "fans": 312}},
 99 |     ]
100 | 
101 | Changelog
102 | <<<<<<<<<
103 | 
104 | v1.0.1
105 | ~~~~~~
106 | 
107 | **Build**
108 | 
109 | - Supports Python 3.13
110 | 
111 | 
112 | 
113 | Contributing
114 | <<<<<<<<<<<<
115 | 
116 | 
117 | Environment Setup
118 | ~~~~~~~~~~~~~~~~~
119 | 
120 | Clone the source code from GitHub.
121 | 
122 | .. code-block:: shell
123 | 
124 |     git clone https://github.com/linw1995/data_extractor.git
125 |     cd data_extractor
126 | 
127 | Setup the development environment.
128 | Please make sure you install the pdm_,
129 | pre-commit_ and nox_ CLIs in your environment.
130 | 
131 | .. code-block:: shell
132 | 
133 |     make init
134 |     make PYTHON=3.10 init  # for specific python version
135 | 
136 | Linting
137 | ~~~~~~~
138 | 
139 | Use pre-commit_ for installing linters to ensure a good code style.
140 | 
141 | .. code-block:: shell
142 | 
143 |     make pre-commit
144 | 
145 | Run linters. Some linters run via CLI nox_, so make sure you install it.
146 | 
147 | .. code-block:: shell
148 | 
149 |     make check-all
150 | 
151 | Testing
152 | ~~~~~~~
153 | 
154 | Run quick tests.
155 | 
156 | .. code-block:: shell
157 | 
158 |     make test
159 | 
160 | Run quick tests with verbose output.
161 | 
162 | .. code-block:: shell
163 | 
164 |     make vtest
165 | 
166 | Run tests with coverage.
167 | Testing in multiple Python environments is powered by CLI nox_.
168 | 
169 | .. code-block:: shell
170 | 
171 |     make cov
172 | 
173 | .. _pdm: https://github.com/pdm-project/pdm
174 | .. _pre-commit: https://pre-commit.com/
175 | .. _nox: https://nox.thea.codes/en/stable/
176 | 
177 | .. |license| image:: https://img.shields.io/github/license/linw1995/data_extractor.svg
178 |     :target: https://github.com/linw1995/data_extractor/blob/master/LICENSE
179 | 
180 | .. |Pypi Status| image:: https://img.shields.io/pypi/status/data_extractor.svg
181 |     :target: https://pypi.org/project/data_extractor
182 | 
183 | .. |Python version| image:: https://img.shields.io/pypi/pyversions/data_extractor.svg
184 |     :target: https://pypi.org/project/data_extractor
185 | 
186 | .. |Package version| image:: https://img.shields.io/pypi/v/data_extractor.svg
187 |     :target: https://pypi.org/project/data_extractor
188 | 
189 | .. |PyPI - Downloads| image:: https://img.shields.io/pypi/dm/data-extractor.svg
190 |     :target: https://pypi.org/project/data_extractor
191 | 
192 | .. |GitHub last commit| image:: https://img.shields.io/github/last-commit/linw1995/data_extractor.svg
193 |     :target: https://github.com/linw1995/data_extractor
194 | 
195 | .. |Code style: black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
196 |     :target: https://github.com/ambv/black
197 | 
198 | .. |Build Status| image:: https://github.com/linw1995/data_extractor/workflows/Lint&Test/badge.svg
199 |     :target: https://github.com/linw1995/data_extractor/actions?query=workflow%3ALint%26Test
200 | 
201 | .. |codecov| image:: https://codecov.io/gh/linw1995/data_extractor/branch/master/graph/badge.svg
202 |     :target: https://codecov.io/gh/linw1995/data_extractor
203 | 
204 | .. |Documentation Status| image:: https://readthedocs.org/projects/data-extractor/badge/?version=latest
205 |     :target: https://data-extractor.readthedocs.io/en/latest/?badge=latest
206 | 
207 | .. |PDM managed| image:: https://img.shields.io/badge/pdm-managed-blueviolet
208 |     :target: https://pdm.fming.dev
209 | 


--------------------------------------------------------------------------------
/README.template.rst:
--------------------------------------------------------------------------------
1 | .. include:: docs/source/readme.rst
2 | 


--------------------------------------------------------------------------------
/data_extractor/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | :mod:`data_extractor`
 3 | =====================
 4 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting.
 5 | """
 6 | 
 7 | # Local Folder
 8 | from .core import (
 9 |     AbstractComplexExtractor,
10 |     AbstractExtractors,
11 |     AbstractSimpleExtractor,
12 |     ComplexExtractorMeta,
13 | )
14 | from .exceptions import ExprError, ExtractError
15 | from .item import RV, Convertor, Field, Item
16 | from .json import (
17 |     JSONExtractor,
18 |     JSONPathExtractor,
19 |     JSONPathRWExtExtractor,
20 |     JSONPathRWExtractor,
21 | )
22 | from .lxml import (
23 |     AttrCSSExtractor,
24 |     CSSExtractor,
25 |     Element,
26 |     TextCSSExtractor,
27 |     XPathExtractor,
28 | )
29 | from .utils import (
30 |     LazyStr,
31 |     is_complex_extractor,
32 |     is_extractor,
33 |     is_simple_extractor,
34 |     sentinel,
35 | )
36 | 
37 | __all__ = (
38 |     "AbstractComplexExtractor",
39 |     "AbstractExtractors",
40 |     "AbstractSimpleExtractor",
41 |     "AttrCSSExtractor",
42 |     "CSSExtractor",
43 |     "ComplexExtractorMeta",
44 |     "Convertor",
45 |     "Element",
46 |     "ExprError",
47 |     "ExtractError",
48 |     "Field",
49 |     "Item",
50 |     "JSONExtractor",
51 |     "JSONPathExtractor",
52 |     "JSONPathRWExtExtractor",
53 |     "JSONPathRWExtractor",
54 |     "LazyStr",
55 |     "RV",
56 |     "TextCSSExtractor",
57 |     "XPathExtractor",
58 |     "is_complex_extractor",
59 |     "is_extractor",
60 |     "is_simple_extractor",
61 |     "sentinel",
62 | )
63 | 


--------------------------------------------------------------------------------
/data_extractor/contrib/mypy/__init__.py:
--------------------------------------------------------------------------------
  1 | # Standard Library
  2 | import logging
  3 | 
  4 | from functools import partial
  5 | from typing import Callable, Dict, List, Optional, Type, Union
  6 | 
  7 | # Third Party Library
  8 | from mypy.checker import TypeChecker, is_true_literal
  9 | from mypy.nodes import (
 10 |     AssignmentStmt,
 11 |     CallExpr,
 12 |     ClassDef,
 13 |     Expression,
 14 |     IndexExpr,
 15 |     MemberExpr,
 16 |     MypyFile,
 17 |     NameExpr,
 18 |     RefExpr,
 19 |     StrExpr,
 20 |     SymbolNode,
 21 |     TypeAlias,
 22 |     TypeInfo,
 23 |     Var,
 24 | )
 25 | from mypy.options import Options
 26 | from mypy.plugin import (
 27 |     DynamicClassDefContext,
 28 |     FunctionContext,
 29 |     MethodSigContext,
 30 |     Plugin,
 31 | )
 32 | from mypy.semanal import SemanticAnalyzerInterface
 33 | from mypy.semanal_typeddict import TypedDictAnalyzer
 34 | from mypy.traverser import TraverserVisitor
 35 | from mypy.types import AnyType, CallableType, FunctionLike, Instance
 36 | from mypy.types import Type as MypyType
 37 | from mypy.types import TypedDictType, TypeOfAny, TypeType, UninhabitedType, UnionType
 38 | 
 39 | logger = logging.getLogger(__name__)
 40 | 
 41 | 
 42 | class RelationshipVisitor(TraverserVisitor):
 43 |     """
 44 |     Collect which names an extractor-constructing call is assigned to.
 45 | 
 46 |     Every assignment accepted by :meth:`is_making_extractor_assignment_stmt`
 47 |     is recorded in ``relationships``. Locations are the string form of a
 48 |     ``(line, column)`` tuple.
 49 |     """
 50 | 
 51 |     # rvalue (call) location -> locations of the lvalues bound to that call
 52 |     relationships: Dict[str, List[str]]
 53 | 
 54 |     def __init__(self) -> None:
 55 |         self.relationships = {}
 56 | 
 57 |     def is_data_extractor_cls(self, obj: Optional[SymbolNode]) -> bool:
 58 |         """Return True when ``obj`` is the ``Field`` or ``Item`` class itself."""
 59 |         return obj is not None and obj.fullname in (
 60 |             "data_extractor.item.Field",
 61 |             "data_extractor.item.Item",
 62 |         )
 63 | 
 64 |     def is_making_extractor_assignment_stmt(self, stmt: AssignmentStmt) -> bool:
 65 |         """
 66 |         Tell whether the right-hand side of ``stmt`` is a call creating an extractor.
 67 | 
 68 |         The callee is unwrapped step by step (subscript base, ``Type[...]``
 69 |         variable, type alias) before being checked against ``Field``/``Item``.
 70 | 
 71 |         :param stmt: The assignment statement to inspect.
 72 |         :returns: Whether the callee resolves to an extractor class.
 73 |         """
 74 |         rvalue = stmt.rvalue
 75 |         if not isinstance(rvalue, CallExpr):
 76 |             return False
 77 | 
 78 |         node: Union[Expression, SymbolNode, MypyType] = rvalue.callee
 79 |         if isinstance(node, IndexExpr):
 80 |             # subscripted callee, e.g. ``Field[str](...)``: continue with its base
 81 |             logger.debug("node=%s", node)
 82 |             base = node.base
 83 |             assert base is not None
 84 |             node = base
 85 | 
 86 |         assert isinstance(node, RefExpr)
 87 |         logger.debug("node=%s", node)
 88 |         node_ = node.node
 89 |         if node_ is None:
 90 |             # the reference is not bound to any symbol
 91 |             return False
 92 |         node = node_
 93 | 
 94 |         logger.debug("node=%r", node)
 95 |         if isinstance(node, Var):
 96 |             # a variable only counts when its type is ``Type[...]``
 97 |             tt = node.type
 98 |             logger.debug("tt=%s", tt)
 99 |             if not isinstance(tt, TypeType):
100 |                 return False
101 |             node = tt.item
102 | 
103 |         logger.debug("node=%r", node)
104 |         if isinstance(node, TypeAlias):
105 |             node = node.target
106 | 
107 |         logger.debug("node=%r", node)
108 |         if isinstance(node, Instance):
109 |             # instances are checked with ``has_base`` against ``Field``
110 |             return node.type.has_base("data_extractor.item.Field")
111 | 
112 |         logger.debug("node=%r", node)
113 |         if isinstance(node, TypeInfo):
114 |             # plain class references match only ``Field``/``Item`` themselves
115 |             return self.is_data_extractor_cls(node)
116 | 
117 |         return False
118 | 
119 |     def locate_field_in_classdef(self, defn: ClassDef, name: str) -> str:
120 |         """
121 |         Find the class-body assignment to ``name`` and return its location key.
122 | 
123 |         The key is built from the line and column of the assignment's ``type``
124 |         attribute, which is asserted to be set.
125 | 
126 |         :param defn: The class definition to search.
127 |         :param name: The assigned attribute name.
128 |         :returns: ``str((line, column))`` of the matching assignment's type.
129 |         :raises ValueError: No assignment to ``name`` exists in ``defn``.
130 |         """
131 |         for block in defn.defs.body:
132 |             if not isinstance(block, AssignmentStmt):
133 |                 continue
134 | 
135 |             for lvalue in block.lvalues:
136 |                 assert isinstance(lvalue, NameExpr)
137 |                 if lvalue.name == name:
138 |                     assert block.type is not None
139 |                     return str((block.type.line, block.type.column))
140 |         else:  # pragma: no cover
141 |             raise ValueError(f"Field name = {name!r} not exists in defn = {defn!s}")
142 | 
143 |     def anal_assignment_stmt(self, stmt: AssignmentStmt) -> None:
144 |         """
145 |         Record the lvalue locations of ``stmt`` under its rvalue location.
146 | 
147 |         Nothing is recorded unless the rvalue is an extractor-creating call.
148 | 
149 |         :param stmt: The assignment statement to analyse.
150 |         """
151 |         logger.debug("stmt=%s", stmt)
152 |         if self.is_making_extractor_assignment_stmt(stmt):
153 |             rvalue_loc = str((stmt.rvalue.line, stmt.rvalue.column))
154 |             logger.debug("stmt=%s, rloc=%r", stmt, rvalue_loc)
155 |             for lvalue in stmt.lvalues:
156 |                 lvalue_loc = ""
157 |                 logger.debug(f"lvalue = {lvalue!s}")
158 |                 assert isinstance(lvalue, RefExpr)
159 |                 if isinstance(lvalue, MemberExpr):
160 |                     # ``SomeClass.attr = ...``: use the location found in the class body
161 |                     expr = lvalue.expr
162 |                     assert isinstance(expr, NameExpr)
163 |                     node = expr.node
164 |                     if node is None:
165 |                         # NOTE(review): this leaves the whole method, so any
166 |                         # remaining lvalues of ``stmt`` are not recorded.
167 |                         return
168 |                     assert isinstance(node, TypeInfo)
169 |                     lvalue_loc = self.locate_field_in_classdef(node.defn, lvalue.name)
170 |                 elif isinstance(lvalue, NameExpr):
171 |                     # plain name: use the location of the symbol it refers to
172 |                     node = lvalue.node
173 |                     assert isinstance(node, SymbolNode)
174 |                     lvalue_loc = str((node.line, node.column))
175 | 
176 |                 if not lvalue_loc:  # pragma: no cover
177 |                     logger.debug(f"n = {node!s}, stmt = {stmt!s}")
178 |                     continue
179 | 
180 |                 self.relationships.setdefault(rvalue_loc, []).append(lvalue_loc)
181 | 
182 |     def visit_assignment_stmt(self, o: AssignmentStmt) -> None:
183 |         """Analyse ``o`` for extractor assignments, then continue the traversal."""
184 |         self.anal_assignment_stmt(o)
185 |         super().visit_assignment_stmt(o)
139 | 
140 | 
141 | class DataExtractorPlugin(Plugin):
142 |     cache: Dict[str, Dict[str, List[str]]]
143 |     item_typeddict_mapping: Dict[str, TypedDictType]
144 | 
145 |     def __init__(self, options: Options) -> None:
146 |         super().__init__(options)
147 |         self.cache = {}
148 |         self.item_typeddict_mapping = {}
149 | 
150 |     def get_current_code(self, ctx: FunctionContext) -> MypyFile:
151 |         api = ctx.api
152 |         assert isinstance(api, TypeChecker)
153 |         module_name = api.tscope.module
154 |         assert module_name is not None
155 |         return api.modules[module_name]
156 | 
157 |     def anal_code(self, code: MypyFile) -> Dict[str, List[str]]:
158 |         logger.debug(f"code.fullname = {code.fullname!r}, self.cache = {self.cache!r}")
159 |         if code.fullname not in self.cache:
160 |             try:
161 |                 visitor = RelationshipVisitor()
162 |             except TypeError:  # pragma: no cover
163 |                 # Only supports versions that are bigger than 0.820
164 |                 return {}
165 | 
166 |             code.accept(visitor)
167 |             self.cache[code.fullname] = visitor.relationships
168 | 
169 |         return self.cache[code.fullname]
170 | 
171 |     def check_field_generic_type(self, ctx: FunctionContext) -> MypyType:
172 |         rv_type = ctx.default_return_type
173 |         if self.options.disallow_any_generics:
174 |             return rv_type
175 | 
176 |         self.anal_code(self.get_current_code(ctx))
177 | 
178 |         assert isinstance(rv_type, Instance)
179 |         if rv_type.args and not isinstance(rv_type.args[0], UninhabitedType):
180 |             return rv_type
181 | 
182 |         return self.apply_any_generic(type=rv_type)
183 | 
184 |     def apply_any_generic(self, type: Instance) -> Instance:
185 |         any_type = AnyType(TypeOfAny.special_form)
186 |         args = [any_type]
187 |         return type.copy_modified(args=args)
188 | 
189 |     def check_is_many(self, ctx: FunctionContext) -> bool:
190 |         is_many_idx = ctx.callee_arg_names.index("is_many")
191 |         is_many_exprs = ctx.args[is_many_idx]
192 |         if is_many_exprs:
193 |             return is_true_literal(is_many_exprs[0])
194 | 
195 |         return False
196 | 
197 |     def prepare_type_annotations(self, ctx: FunctionContext, fullname: str) -> MypyType:
198 |         logger.debug("fullname=%r", fullname)
199 | 
200 |         # check parameter "is_many"
201 |         expr = ctx.context
202 |         assert isinstance(expr, CallExpr)
203 | 
204 |         callee = expr.callee
205 |         if isinstance(callee, IndexExpr):
206 |             callee = callee.base
207 |         assert isinstance(callee, NameExpr)
208 | 
209 |         sym_field_class: Union[MypyType, SymbolNode, None] = callee.node
210 |         if isinstance(sym_field_class, TypeAlias):
211 |             sym_field_class = sym_field_class.target
212 |         elif isinstance(sym_field_class, Var):
213 |             typetype = sym_field_class.type
214 |             assert isinstance(typetype, TypeType)
215 |             sym_field_class = typetype.item
216 | 
217 |         if isinstance(sym_field_class, Instance):
218 |             sym_field_class = sym_field_class.type
219 | 
220 |         assert isinstance(sym_field_class, TypeInfo)
221 |         relationship = self.anal_code(self.get_current_code(ctx))
222 |         lvalue_key = str((expr.line, expr.column))
223 |         keys = [lvalue_key]
224 |         if lvalue_key in relationship:
225 |             keys.extend(relationship[lvalue_key])
226 | 
227 |         for key in keys:
228 |             logger.debug(
229 |                 f"lvalue_key = {lvalue_key!r}, "
230 |                 f"key = {key!r}, relationship = {relationship!r}"
231 |             )
232 | 
233 |             if self.check_is_many(ctx):
234 |                 sym_field_class.metadata[key] = {"is_many": True}
235 |             else:
236 |                 sym_field_class.metadata[key] = {"is_many": False}
237 | 
238 |         rv_type = self.check_field_generic_type(ctx)
239 |         return rv_type
240 | 
241 |     def is_extractor_cls(self, fullname: str, is_item_subcls=False) -> bool:
242 |         node = self.lookup_fully_qualified(fullname)
243 |         if node is not None:
244 |             typenode = node.node
245 |             if isinstance(typenode, TypeInfo):
246 |                 if is_item_subcls:
247 |                     return typenode.has_base("data_extractor.item.Item")
248 |                 else:
249 |                     return typenode.has_base("data_extractor.item.Field")
250 | 
251 |         return False
252 | 
253 |     def get_function_hook(
254 |         self, fullname: str
255 |     ) -> Optional[Callable[[FunctionContext], MypyType]]:
256 |         logger.debug("fullname=%r", fullname)
257 |         if self.is_extractor_cls(fullname):
258 |             return partial(self.prepare_type_annotations, fullname=fullname)
259 | 
260 |         return super().get_function_hook(fullname)
261 | 
262 |     def apply_is_many_on_extract_method(
263 |         self, ctx: MethodSigContext, fullname: str
264 |     ) -> CallableType:
265 |         origin: CallableType = ctx.default_signature
266 |         origin_ret_type = origin.ret_type
267 |         assert isinstance(origin_ret_type, UnionType)
268 | 
269 |         self_class = ctx.type
270 |         assert isinstance(self_class, Instance)
271 |         metadata = self_class.type.metadata
272 | 
273 |         # in case of stmt `Field().extract(...)`
274 |         key = str((ctx.type.line, ctx.type.column))
275 |         if key not in metadata:
276 |             expr = ctx.context
277 |             assert isinstance(expr, CallExpr)
278 |             callee = expr.callee
279 |             assert isinstance(callee, MemberExpr)
280 |             callee_expr = callee.expr
281 |             assert isinstance(callee_expr, NameExpr)
282 |             obj = callee_expr.node
283 |             assert isinstance(obj, Var)
284 |             key = str((obj.line, obj.column))
285 | 
286 |         logger.debug("fullname=%r, key=%r, metadata=%r", fullname, key, metadata)
287 |         if key in metadata:
288 |             is_many = metadata[key]["is_many"]
289 |             ret_type = origin_ret_type.items[int(is_many)]
290 |             return origin.copy_modified(ret_type=ret_type)
291 |         else:
292 |             api = ctx.api
293 |             assert isinstance(api, TypeChecker)
294 |             api.fail("Cant determine extract method return type", context=ctx.context)
295 |             return origin
296 | 
297 |     def is_extract_method(self, fullname: str) -> bool:
298 |         suffix = ".extract"
299 |         if fullname.endswith(suffix):
300 |             return self.is_extractor_cls(fullname[: -len(suffix)])
301 |         return False
302 | 
303 |     def apply_extract_method(
304 |         self, ctx: MethodSigContext, fullname: str
305 |     ) -> CallableType:
306 |         rv = self.apply_is_many_on_extract_method(ctx, fullname)
307 | 
308 |         # apply item typeddict
309 |         item_classname = fullname[: -len(".extract")]
310 |         if item_classname in self.item_typeddict_mapping:
311 |             logger.debug("fullname=%r, ret_type=%r", fullname, rv.ret_type)
312 |             original = rv.ret_type
313 |             typeddict = self.item_typeddict_mapping[item_classname]
314 |             ret_type: Optional[MypyType]
315 |             if isinstance(original, AnyType):  # is_many=False
316 |                 rv = rv.copy_modified(ret_type=typeddict)
317 |             else:
318 |                 assert isinstance(original, Instance)
319 |                 if original.type.name == "list":  # is_many=True
320 |                     ret_type = original
321 |                     ret_type.args = (typeddict,)
322 |                     rv = rv.copy_modified(ret_type=ret_type)
323 |                 else:  # pragma: no cover
324 |                     api = ctx.api
325 |                     assert isinstance(api, TypeChecker)
326 |                     api.fail(
327 |                         "Cant determine extract method return type", context=ctx.context
328 |                     )
329 |                     ret_type = None
330 | 
331 |         logger.debug(
332 |             "fullname=%r, rv=%r, item_typeddict_mapping=%r",
333 |             fullname,
334 |             rv,
335 |             self.item_typeddict_mapping,
336 |         )
337 |         return rv
338 | 
339 |     def get_method_signature_hook(
340 |         self, fullname: str
341 |     ) -> Optional[Callable[[MethodSigContext], FunctionLike]]:
342 |         if self.is_extract_method(fullname):
343 |             return partial(self.apply_extract_method, fullname=fullname)
344 |         return super().get_method_signature_hook(fullname)
345 | 
346 |     def get_name_arg(self, call: CallExpr) -> str:
347 |         name = ""
348 |         try:
349 |             idx = call.arg_names.index("name")
350 |             arg = call.args[idx]
351 |             assert isinstance(arg, StrExpr)
352 |             name = arg.value
353 |         except ValueError:
354 |             pass
355 |         return name
356 | 
357 |     def prepare_typeddict(self, ctx: DynamicClassDefContext, fullname: str) -> None:
358 |         logger.debug("fullname=%r", fullname)
359 |         if fullname in self.item_typeddict_mapping:
360 |             return
361 | 
362 |         api = ctx.api
363 |         assert isinstance(api, SemanticAnalyzerInterface)
364 |         analyzer = TypedDictAnalyzer(api.options, api, api.msg)  # type: ignore
365 | 
366 |         items: List[str] = []
367 |         types: List[MypyType] = []
368 |         callee = ctx.call.callee
369 |         assert isinstance(callee, NameExpr)
370 |         node = callee.node
371 |         assert isinstance(node, TypeInfo)
372 |         for block in node.defn.defs.body:
373 |             if not isinstance(block, AssignmentStmt):
374 |                 continue
375 | 
376 |             rvalue = block.rvalue
377 |             if not isinstance(rvalue, CallExpr):
378 |                 continue
379 | 
380 |             param_name = self.get_name_arg(rvalue)
381 |             logger.debug("param_name = %r from rvalue = %s", param_name, rvalue)
382 | 
383 |             rvalue_type: MypyType
384 |             callee = rvalue.callee
385 |             if isinstance(callee, IndexExpr):
386 |                 index = callee.index
387 |                 assert isinstance(index, NameExpr)
388 |                 name = index.fullname
389 |                 assert name is not None
390 |                 named_type = api.named_type_or_none(name, [])
391 |                 assert named_type is not None
392 |                 rvalue_type = named_type
393 |             else:
394 |                 rvalue_type = AnyType(TypeOfAny.special_form)
395 | 
396 |             if param_name:
397 |                 items.append(param_name)
398 |                 types.append(rvalue_type)
399 |             else:
400 |                 for lvalue in block.lvalues:
401 |                     assert isinstance(lvalue, NameExpr)
402 |                     items.append(lvalue.name)
403 |                     types.append(rvalue_type)
404 | 
405 |         callee = ctx.call.callee
406 |         assert isinstance(callee, NameExpr)
407 |         typeinfo = analyzer.build_typeddict_typeinfo(
408 |             callee.name,
409 |             items,
410 |             types,
411 |             set(items),
412 |             -1,
413 |             None,
414 |         )
415 |         assert typeinfo.typeddict_type is not None
416 |         self.item_typeddict_mapping[fullname] = typeinfo.typeddict_type
417 |         logger.debug(
418 |             "fullname=%r, item_typeddict_mapping=%r",
419 |             fullname,
420 |             self.item_typeddict_mapping,
421 |         )
422 | 
423 |     def get_dynamic_class_hook(
424 |         self, fullname: str
425 |     ) -> Optional[Callable[[DynamicClassDefContext], None]]:
426 |         logger.debug("fullname=%r", fullname)
427 |         if self.is_extractor_cls(fullname, is_item_subcls=True):
428 |             return partial(self.prepare_typeddict, fullname=fullname)
429 | 
430 |         return super().get_dynamic_class_hook(fullname)
431 | 
432 | 
433 | def plugin(version: str) -> Type[Plugin]:
434 |     """
435 |     Return the plugin class of this module.
436 | 
437 |     :param version: Accepted but not used; the same class is always returned.
438 |     :returns: :class:`DataExtractorPlugin`.
439 |     """
440 |     return DataExtractorPlugin
435 | 


--------------------------------------------------------------------------------
/data_extractor/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | =====================================
  3 | :mod:`core` -- Abstract Base Classes.
  4 | =====================================
  5 | """
  6 | 
  7 | # Standard Library
  8 | import ast
  9 | import inspect
 10 | 
 11 | from abc import abstractmethod
 12 | from collections import namedtuple
 13 | from types import FrameType, FunctionType, MethodType
 14 | from typing import Any, Dict, Optional, Tuple, Union
 15 | 
 16 | # Local Folder
 17 | from .utils import Property, getframe, sentinel
 18 | 
 19 | _LineInfo = namedtuple("_LineInfo", ["file", "lineno", "offset", "line"])
 20 | 
 21 | 
 22 | def _find_line_info_of_attr_in_source(
 23 |     frame: Optional[FrameType], key: str, attr: "AbstractComplexExtractor"
 24 | ) -> _LineInfo:
 25 |     if frame is None:
 26 |         return _LineInfo(None, None, None, f"{key}={attr!r}")
 27 | 
 28 |     file = frame.f_code.co_filename
 29 |     firstlineno = frame.f_lineno
 30 |     firstline_idx = firstlineno - 1
 31 |     try:
 32 |         lines, _ = inspect.findsource(frame)
 33 |     except OSError:
 34 |         # can't get the source code from python repl
 35 |         return _LineInfo(None, None, None, f"{key}={attr!r}")
 36 | 
 37 |     start_index = inspect.indentsize(lines[firstline_idx])
 38 |     for lineno, line in enumerate(lines[firstline_idx + 1 :], start=firstlineno + 1):
 39 |         # iterate line in the code block body
 40 |         cur_index = inspect.indentsize(line)
 41 |         if cur_index <= start_index:
 42 |             # reach end of the code block,
 43 |             # use code block firstlineno as SyntaxError.lineno
 44 |             line = lines[firstline_idx]
 45 |             lineno = firstlineno
 46 |             break
 47 | 
 48 |         if line.lstrip().startswith(key):
 49 |             # find the line as SyntaxError.text
 50 |             break
 51 | 
 52 |     else:
 53 |         # reach EOF,
 54 |         # use code block firstlineno as SyntaxError.lineno
 55 |         line = lines[firstline_idx]
 56 |         lineno = firstlineno
 57 | 
 58 |     offset = inspect.indentsize(line)
 59 |     line = line.strip()
 60 |     return _LineInfo(file, lineno, offset, line)
 61 | 
 62 | 
 63 | def _check_field_overwrites_bases_property(
 64 |     cls: object,
 65 |     name: str,
 66 |     bases: Tuple[object],
 67 |     key: str,
 68 |     attr: "AbstractComplexExtractor",
 69 | ) -> None:
 70 |     """
 71 |     Reject a field whose name collides with a property of the last base class.
 72 | 
 73 |     Only ``bases[-1]`` is consulted. The key ``"_field_names"`` is always
 74 |     rejected.
 75 | 
 76 |     :param cls: The class being created (not used here).
 77 |     :param name: Name of the class being created, used in the message.
 78 |     :param bases: Base classes of the class being created.
 79 |     :param key: Attribute name of the field.
 80 |     :param attr: The field assigned to ``key``.
 81 | 
 82 |     :raises SyntaxError: The field would overwrite the property.
 83 |     """
 84 |     attr_from_bases = getattr(bases[-1], key, None)
 85 |     if isinstance(attr_from_bases, Property) or key == "_field_names":
 86 |         # Item's attribute overwrites its property.
 87 |         # NOTE(review): presumably the frame defining the class; the depth is
 88 |         # relative to this function, so the call must stay here -- confirm.
 89 |         frame = getframe(2)
 90 |         exc_args = _find_line_info_of_attr_in_source(frame, key, attr)
 91 |         *_, line = exc_args
 92 |         err_msg = (
 93 |             f"{line!r} overwriten "
 94 |             f"the property {key!r} of {name}. "
 95 |             f"Please using the optional parameter name={key!r} "
 96 |             f"in {attr!r} to avoid overwriting property."
 97 |         )
 98 |         raise SyntaxError(err_msg, exc_args)
 83 | 
 84 | 
 85 | def _check_field_overwrites_bases_method(
 86 |     cls: object,
 87 |     name: str,
 88 |     bases: Tuple[object],
 89 |     key: str,
 90 |     attr: "AbstractComplexExtractor",
 91 | ) -> None:
 92 |     """
 93 |     Reject a field whose name collides with a method of the last base class.
 94 | 
 95 |     Only ``bases[-1]`` is consulted, and only plain functions and bound
 96 |     methods count as methods.
 97 | 
 98 |     :param cls: The class being created (not used here).
 99 |     :param name: Name of the class being created, used in the message.
100 |     :param bases: Base classes of the class being created.
101 |     :param key: Attribute name of the field.
102 |     :param attr: The field assigned to ``key``.
103 | 
104 |     :raises SyntaxError: The field would overwrite the method.
105 |     """
106 |     attr_from_bases = getattr(bases[-1], key, None)
107 |     if isinstance(attr_from_bases, (FunctionType, MethodType)):
108 |         # Item's attribute overwrites its class bases' method.
109 |         # NOTE(review): presumably the frame defining the class; the depth is
110 |         # relative to this function, so the call must stay here -- confirm.
111 |         frame = getframe(2)
112 |         exc_args = _find_line_info_of_attr_in_source(frame, key, attr)
113 |         *_, line = exc_args
114 |         err_msg = (
115 |             f"{line!r} overwriten "
116 |             f"the method {key!r} of {name!r}. "
117 |             f"Please using the optional parameter name={key!r} "
118 |             f"in {attr!r} to avoid overwriting method."
119 |         )
120 |         raise SyntaxError(err_msg, exc_args)
105 | 
106 | 
107 | def _check_field_overwrites_method(cls: object) -> None:
108 |     frame = getframe(2)
109 |     if frame is None:
110 |         return
111 | 
112 |     filename = frame.f_code.co_filename
113 |     firstlineno = frame.f_lineno
114 |     try:
115 |         lines, _ = inspect.findsource(frame)
116 |     except OSError:
117 |         # can't get the source code from python repl
118 |         return
119 | 
120 |     source = "".join(lines)
121 |     mod = ast.parse(source)
122 |     for node in ast.walk(mod):
123 |         if isinstance(node, (ast.ClassDef, ast.Call)) and node.lineno == firstlineno:
124 |             item_node = node
125 |             break
126 |     else:  # pragma: no cover
127 |         assert 0, f"Can't find the source of {cls}."
128 | 
129 |     if isinstance(item_node, ast.Call):
130 |         # There is no point to check if field overwrites method,
131 |         # due to item is created by `type` function.
132 |         return
133 | 
134 |     assigns: Dict[str, ast.Assign] = {}
135 |     methods: Dict[str, ast.FunctionDef] = {}
136 |     for node in item_node.body:
137 |         if isinstance(node, ast.Assign):
138 |             for target_ in node.targets:
139 |                 if not isinstance(target_, ast.Name):
140 |                     continue
141 | 
142 |                 assigns[target_.id] = node
143 |         elif isinstance(node, ast.FunctionDef):
144 |             methods[node.name] = node
145 | 
146 |     unions = assigns.keys() & methods.keys()
147 |     if not unions:
148 |         return
149 | 
150 |     key = next(iter(unions))
151 |     assign = assigns[key]
152 |     method = methods[key]
153 |     if assign.lineno > method.lineno:
154 |         lineno = assign.lineno
155 |         offset = assign.col_offset
156 |         line = lines[lineno - 1].strip()
157 | 
158 |         msg = (
159 |             f"method {lines[method.lineno - 1].strip()!r} "
160 |             f"on lineno={method.lineno} "
161 |             f"overwrited by assign {line!r}. "
162 |             f"Please using the optional parameter name={key!r} "
163 |             f"in {line!r} to avoid overwriting."
164 |         )
165 |     else:
166 |         lineno = method.lineno
167 |         offset = method.col_offset
168 |         line = lines[lineno - 1].strip()
169 |         msg = (
170 |             f"assign {lines[assign.lineno - 1].strip()!r} "
171 |             f"on lineno={assign.lineno} "
172 |             f"overwrited by method {line!r}. "
173 |             f"Please using the optional parameter name={key!r} "
174 |             f"in {lines[assign.lineno - 1].strip()!r} to avoid overwriting."
175 |         )
176 | 
177 |     raise SyntaxError(msg, (filename, lineno, offset, line))
178 | 
179 | 
180 | class SimpleExtractorMeta(type):
181 |     """
182 |     Simple Extractor Meta Class.
183 | 
184 |     It adds nothing to :class:`type` itself.
185 |     """
184 | 
185 | 
186 | class ComplexExtractorMeta(SimpleExtractorMeta):
187 |     """
188 |     Complex Extractor Meta Class.
189 | 
190 |     On class creation it validates the fields declared in the class body and
191 |     stores the names of the fields as the tuple ``_field_names`` on the class.
192 |     """
193 | 
194 |     def __init__(
195 |         cls,  # noqa: B902
196 |         name: str,
197 |         bases: Tuple[type],
198 |         attr_dict: Dict[str, Any],
199 |     ):
200 |         """
201 |         Validate the declared fields and compute ``cls._field_names``.
202 | 
203 |         :param name: Name of the class being created.
204 |         :param bases: Base classes of the class being created.
205 |         :param attr_dict: Namespace of the class body.
206 | 
207 |         :raises SyntaxError: A field overwrites a method or a property.
208 |         """
209 |         super().__init__(name, bases, attr_dict)
210 | 
211 |         field_names = set()
212 |         for key, attr in attr_dict.items():
213 |             if isinstance(type(attr), ComplexExtractorMeta):
214 |                 # can't using data_extractor.utils.is_complex_extractor here,
215 |                 # because AbstractComplexExtractor which being used in it
216 |                 # bases on ComplexExtractorMeta.
217 |                 _check_field_overwrites_bases_method(cls, name, bases, key, attr)
218 |                 _check_field_overwrites_bases_property(cls, name, bases, key, attr)
219 | 
220 |                 field_names.add(key)
221 | 
222 |         # check field overwrites method
223 |         _check_field_overwrites_method(cls)
224 | 
225 |         # merge in the field names already reachable through the class
226 |         field_names |= set(getattr(cls, "_field_names", []))
227 |         for key in field_names.copy():
228 |             # drop names that do not resolve to a truthy complex extractor
229 |             attr = getattr(cls, key, None)
230 |             if not attr or not isinstance(type(attr), ComplexExtractorMeta):
231 |                 field_names.remove(key)
232 | 
233 |         cls._field_names: Tuple[str, ...] = tuple(field_names)
220 | 
221 | 
222 | class AbstractSimpleExtractor(metaclass=SimpleExtractorMeta):
223 |     """
224 |     Abstract Simple Extractor Class.
225 | 
226 |     Its metaclass is :class:`data_extractor.core.SimpleExtractorMeta`
227 | 
228 |     :param expr: Extractor selector expression.
229 |     :type expr: str
230 |     """
231 | 
232 |     expr = Property[str]()
233 | 
234 |     def __init__(self, expr: str):
235 |         self.expr = expr
236 | 
237 |     def __repr__(self) -> str:
238 |         return f"{self.__class__.__name__}({self.expr!r})"
239 | 
240 |     @abstractmethod
241 |     def extract(self, element: Any) -> Any:
242 |         """
243 |         Extract data or subelement from element.
244 | 
245 |         :param element: The target data node element.
246 |         :type element: Any
247 | 
248 |         :returns: Data or subelement.
249 |         :rtype: Any
250 | 
251 |         :raises ~data_extractor.exceptions.ExprError: Extractor Expression Error.
252 |         """
253 |         raise NotImplementedError
254 | 
255 |     def extract_first(self, element: Any, default: Any = sentinel) -> Any:
256 |         """
257 |         Extract the first data or subelement from `extract` method call result.
258 | 
259 |         :param element: The target data node element.
260 |         :type element: Any
261 |         :param default: Default value when not found. \
262 |             Default: :data:`data_extractor.utils.sentinel`.
263 |         :type default: Any, optional
264 | 
265 |         :returns: Data or subelement.
266 |         :rtype: Any
267 | 
268 |         :raises ~data_extractor.exceptions.ExtractError: \
269 |             Thrown by extractor extracting wrong data.
270 |         """
271 |         rv = self.extract(element)
272 |         if not rv:
273 |             if default is sentinel:
274 |                 # Local Folder
275 |                 from .exceptions import ExtractError
276 | 
277 |                 raise ExtractError(self, element)
278 | 
279 |             return default
280 | 
281 |         return rv[0]
282 | 
283 | 
284 | class AbstractComplexExtractor(metaclass=ComplexExtractorMeta):
285 |     """
286 |     Abstract Complex Extractor Class.
287 | 
288 |     Its metaclass is :class:`data_extractor.core.ComplexExtractorMeta`
289 |     """
290 | 
291 |     @abstractmethod
292 |     def extract(self, element: Any) -> Any:
293 |         """
294 |         Extract the wanted data.
295 | 
296 |         :param element: The target data node element.
297 |         :type element: Any
298 | 
299 |         :returns: Data or subelement.
300 |         :rtype: Any
301 | 
302 |         :raises ~data_extractor.exceptions.ExtractError: \
303 |             Thrown by extractor extracting wrong data.
304 |         """
305 |         raise NotImplementedError
306 | 
307 | 
308 | AbstractExtractors = Union[AbstractSimpleExtractor, AbstractComplexExtractor]
309 | 
310 | __all__ = (
311 |     "AbstractComplexExtractor",
312 |     "AbstractExtractors",
313 |     "AbstractSimpleExtractor",
314 |     "ComplexExtractorMeta",
315 |     "SimpleExtractorMeta",
316 | )
317 | 


--------------------------------------------------------------------------------
/data_extractor/exceptions.py:
--------------------------------------------------------------------------------
 1 | """
 2 | ===========================================
 3 | :mod:`exceptions` -- Extracting Exceptions.
 4 | ===========================================
 5 | """
 6 | 
 7 | # Standard Library
 8 | import reprlib
 9 | 
10 | from typing import Any
11 | 
12 | # Local Folder
13 | from .core import AbstractExtractors, AbstractSimpleExtractor
14 | from .utils import LazyStr
15 | 
16 | 
17 | class ExprError(Exception):
18 |     """
19 |     Invalid Expr.
20 | 
21 |     :param extractor: The object for data extracting.
22 |     :type extractor: :class:`data_extractor.core.AbstractSimpleExtractor`
23 |     :param exc: The actual exception is thrown when extracting.
24 |     :type exc: Exception
25 |     """
26 | 
27 |     def __init__(self, extractor: AbstractSimpleExtractor, exc: Exception):
28 |         self.extractor = extractor
29 |         self.exc = exc
30 | 
31 |     def __str__(self) -> str:
32 |         return f"ExprError with {self.exc!r} raised by {self.extractor!r} extracting"
33 | 
34 |     def __repr__(self) -> str:
35 |         return f"{self.__class__.__name__}({self.extractor!r}, exc={self.exc!r})"
36 | 
37 | 
38 | class ExtractError(Exception):
39 |     """
40 |     Thrown by extractor extracting wrong data.
41 | 
42 |     :param extractor: The object for data extracting.
43 |     :type extractor: :class:`data_extractor.core.AbstractSimpleExtractor`, \
44 |         :class:`data_extractor.core.AbstractComplexExtractor`
45 |     :param element: The target data node element.
46 |     :type element: Any
47 |     """
48 | 
49 |     def __init__(self, extractor: AbstractExtractors, element: Any):
50 |         super().__init__(LazyStr(func=lambda: self._trace_repr))
51 |         self.element = element
52 |         self.extractors = [extractor]
53 | 
54 |     def __repr__(self) -> str:
55 |         return (
56 |             f"{self.__class__.__name__}"
57 |             f"({self.extractors[0]!r}, element={reprlib.repr(self.element)})"
58 |         )
59 | 
60 |     def _append(self, extractor: AbstractExtractors) -> None:
61 |         self.extractors.append(extractor)
62 | 
63 |     @property
64 |     def _trace_repr(self) -> str:
65 |         return f"{self.__repr__()}\n" + "\n".join(
66 |             "  " * idx + "|-" + repr(extractor)
67 |             for idx, extractor in enumerate([*self.extractors[::-1], self.element])
68 |         )
69 | 
70 | 
71 | __all__ = ("ExprError", "ExtractError")
72 | 


--------------------------------------------------------------------------------
/data_extractor/item.py:
--------------------------------------------------------------------------------
  1 | """
  2 | =====================================================
  3 | :mod:`item` -- Complex Extractor for data extracting.
  4 | =====================================================
  5 | """
  6 | 
  7 | # Standard Library
  8 | import copy
  9 | 
 10 | from typing import (
 11 |     Any,
 12 |     Callable,
 13 |     Dict,
 14 |     Generic,
 15 |     Iterator,
 16 |     List,
 17 |     Optional,
 18 |     Type,
 19 |     TypeVar,
 20 |     Union,
 21 | )
 22 | 
 23 | # Local Folder
 24 | from .core import AbstractComplexExtractor, AbstractSimpleExtractor
 25 | from .exceptions import ExtractError
 26 | from .utils import Property, is_simple_extractor, sentinel
 27 | 
 28 | RV = TypeVar("RV")
 29 | Convertor = Callable[[Any], RV]
 30 | 
 31 | 
 32 | class Field(Generic[RV], AbstractComplexExtractor):
 33 |     """
 34 |     Extract data by cooperating with extractor.
 35 | 
 36 |     :param extractor: The object for data extracting
 37 |     :type extractor: :class:`data_extractor.core.AbstractSimpleExtractor`
 38 |     :param name: Optional parameter for special field name.
 39 |     :type name: str, optional
 40 |     :param default: Default value when not found. \
 41 |         Default: :data:`data_extractor.utils.sentinel`.
 42 |     :type default: Any
 43 |     :param is_many: Indicate whether the extractor extracts more than one value.
 44 |     :type is_many: bool
 45 | 
 46 |     :raises ValueError: Invalid SimpleExtractor.
 47 |     :raises ValueError: Can't both set default and is_many=True.
 48 |     """
 49 | 
 50 |     extractor = Property[Optional[AbstractSimpleExtractor]]()
 51 |     name = Property[Optional[str]]()
 52 |     default = Property[Any]()
 53 |     is_many = Property[bool]()
 54 | 
 55 |     type = Property[Optional[Type[RV]]]()
 56 |     convertor = Property[Optional[Convertor[RV]]]()
 57 | 
 58 |     def __init__(
 59 |         self,
 60 |         extractor: Optional[AbstractSimpleExtractor] = None,
 61 |         name: Optional[str] = None,
 62 |         default: Any = sentinel,
 63 |         is_many: bool = False,
 64 |         type: Optional[Type[RV]] = None,
 65 |         convertor: Optional[Convertor[RV]] = None,
 66 |     ):
 67 |         super().__init__()
 68 | 
 69 |         if extractor is not None and not is_simple_extractor(extractor):
 70 |             raise ValueError(f"Invalid SimpleExtractor: {extractor!r}")
 71 | 
 72 |         if default is not sentinel and is_many:
 73 |             raise ValueError(f"Can't both set default={default} and is_many=True")
 74 | 
 75 |         self.extractor = extractor
 76 |         self.name = name
 77 |         self.default = default
 78 |         self.is_many = is_many
 79 |         self.type = type
 80 |         self.convertor = convertor
 81 | 
 82 |     def __class_getitem__(cls, rv_type: Type[RV]):
 83 |         def new_init(
 84 |             self,
 85 |             extractor: Optional[AbstractSimpleExtractor] = None,
 86 |             name: Optional[str] = None,
 87 |             default: Any = sentinel,
 88 |             is_many: bool = False,
 89 |             type: Optional[Type[RV]] = None,
 90 |             convertor: Optional[Convertor[RV]] = None,
 91 |         ):
 92 |             cls.__init__(
 93 |                 self,
 94 |                 extractor=extractor,
 95 |                 name=name,
 96 |                 default=default,
 97 |                 is_many=is_many,
 98 |                 type=type or rv_type,
 99 |                 convertor=convertor,
100 |             )
101 | 
102 |         if rv_type is RV:  # type: ignore
103 |             # it is a type-unbound container class
104 |             return cls
105 |         else:
106 |             return type(cls.__name__, (cls,), {"__init__": new_init})
107 | 
108 |     def __repr__(self) -> str:
109 |         args = [f"{self.extractor!r}"]
110 |         if self.name is not None:
111 |             args.append(f"name={self.name!r}")
112 | 
113 |         if self.default is not sentinel:
114 |             args.append(f"default={self.default!r}")
115 | 
116 |         if self.is_many:
117 |             args.append(f"is_many={self.is_many!r}")
118 | 
119 |         return f"{self.__class__.__name__}({', '.join(args)})"
120 | 
121 |     def extract(self, element: Any) -> Union[RV, List[RV]]:
122 |         if self.extractor is None:
123 |             if isinstance(element, list):
124 |                 rv = element
125 |             else:
126 |                 rv = [element]
127 |         else:
128 |             rv = self.extractor.extract(element)
129 | 
130 |         if self.is_many:
131 |             return [self._extract(r) for r in rv]
132 | 
133 |         if not rv:
134 |             if self.default is sentinel:
135 |                 raise ExtractError(self, element)
136 | 
137 |             return self.default
138 | 
139 |         return self._extract(rv[0])
140 | 
141 |     def _extract(self, element: Any) -> RV:
142 |         if self.convertor is not None:
143 |             return self.convertor(element)
144 |         else:
145 |             cls = self.type
146 |             if cls is not None and callable(cls):
147 |                 # TODO: inspect function signature for supporting better conversion
148 |                 return cls(element)  # type: ignore
149 |             else:
150 |                 return element
151 | 
152 |     def __deepcopy__(self, memo: Dict[int, Any]) -> AbstractComplexExtractor:
153 |         deepcopy_method = self.__deepcopy__
154 |         self.__deepcopy__ = None  # type: ignore
155 |         cp = copy.deepcopy(self, memo)
156 |         self.__deepcopy__ = deepcopy_method  # type: ignore
157 | 
158 |         # avoid duplicating the sentinel object.
159 |         if self.default is sentinel:
160 |             Property.change_internal_value(cp, "default", sentinel)
161 | 
162 |         return cp
163 | 
164 | 
165 | class Item(Field[RV]):
166 |     """
167 |     Extract data by cooperating with extractors, fields and items.
168 |     """
169 | 
170 |     def __init__(
171 |         self,
172 |         extractor=None,
173 |         name=None,
174 |         default=sentinel,
175 |         is_many=False,
176 |         type=None,
177 |         convertor=None,
178 |     ):
179 |         super().__init__(
180 |             extractor=extractor,
181 |             name=name,
182 |             default=default,
183 |             is_many=is_many,
184 |             type=type,
185 |             convertor=convertor or self.default_convertor,
186 |         )
187 | 
188 |     def default_convertor(self, rv: Dict[str, Any]) -> RV:
189 |         cls = self.type
190 |         if cls is not None and callable(cls):
191 |             # TODO: inspect function signature for supporting better conversion
192 |             return cls(**rv)  # type: ignore
193 | 
194 |         return rv  # type: ignore
195 | 
196 |     def _extract(self, element: Any) -> RV:
197 |         rv = {}
198 |         for field in self.field_names():
199 |             try:
200 |                 extractor = getattr(self, field)
201 |                 if extractor.name is not None:
202 |                     field = extractor.name
203 | 
204 |                 rv[field] = extractor.extract(element)
205 |             except ExtractError as exc:
206 |                 exc._append(extractor=self)
207 |                 raise exc
208 | 
209 |         return super()._extract(rv)
210 | 
211 |     @classmethod
212 |     def field_names(cls) -> Iterator[str]:
213 |         """
214 |         Iterate all `Item` or `Field` type attributes' name.
215 |         """
216 |         yield from cls._field_names
217 | 
218 |     def simplify(self) -> AbstractSimpleExtractor:
219 |         """
220 |         Create an extractor that has compatible API like SimpleExtractor's.
221 | 
222 |         :returns: A simple extractor.
223 |         :rtype: :class:`data_extractor.core.AbstractSimpleExtractor`
224 |         """
225 |         # duplicating seems useless since the properties of Item are unchangeable,
226 |         # but the is_many property of the copy needs to be changed.
227 |         duplicated = copy.deepcopy(self)
228 |         # set to match the SimpleExtractor.extract method signature
229 |         Property.change_internal_value(duplicated, "is_many", True)
230 | 
231 |         def extract(self: AbstractSimpleExtractor, element: Any) -> List[RV]:
232 |             return duplicated.extract(element)  # type: ignore
233 | 
234 |         def getter(self: AbstractSimpleExtractor, name: str) -> Any:
235 |             if (
236 |                 name not in ("extract", "extract_first")
237 |                 and not name.startswith("__")
238 |                 and hasattr(duplicated.extractor, name)
239 |             ):
240 |                 return getattr(duplicated.extractor, name)
241 |             return super(type(self), self).__getattribute__(name)
242 | 
243 |         classname = f"{type(duplicated).__name__}Simplified"
244 |         base = AbstractSimpleExtractor
245 |         if duplicated.extractor is not None:
246 |             base = type(duplicated.extractor)
247 | 
248 |         new_cls = type(
249 |             classname,
250 |             (base,),
251 |             {
252 |                 "extract": extract,
253 |                 "__getattribute__": getter,
254 |             },
255 |         )
256 |         # the wrapper class needs no initialization
257 |         obj: AbstractSimpleExtractor = base.__new__(new_cls)
258 |         if not hasattr(obj, "expr"):
259 |             # handle case of Item with extractor=None.
260 |             # and its expr property will raise AttributeError,
261 |             # so hasattr return False
262 |             obj.expr = ""  # set to avoid class.__repr__ raising AttributeError
263 | 
264 |         return obj
265 | 
266 | 
267 | __all__ = ("Field", "Item", "RV", "Convertor")
268 | 


--------------------------------------------------------------------------------
/data_extractor/json.py:
--------------------------------------------------------------------------------
  1 | """
  2 | ===================================================
  3 | :mod:`json` -- Extractors for JSON data extracting.
  4 | ===================================================
  5 | """
  6 | 
  7 | # Standard Library
  8 | from typing import TYPE_CHECKING, Any, Optional, Type
  9 | 
 10 | # Local Folder
 11 | from .core import AbstractSimpleExtractor
 12 | from .exceptions import ExprError
 13 | from .utils import Property, _missing_dependency
 14 | 
 15 | 
 16 | class JSONExtractor(AbstractSimpleExtractor):
 17 |     """
 18 |     Use JSONPath expression implemented by **jsonpath-extractor**,
 19 |     **jsonpath-rw** or **jsonpath-rw-ext** packages for JSON data extracting.
 20 |     Change **json_extractor_backend** value to indicate which package to use.
 21 | 
 22 |     >>> import data_extractor.json
 23 |     >>> from data_extractor.json import JSONPathExtractor
 24 |     >>> data_extractor.json.json_extractor_backend = JSONPathExtractor
 25 | 
 26 |     Before extracting, should parse the JSON text into Python object.
 27 | 
 28 |     :param expr: JSONPath Expression.
 29 |     :type expr: str
 30 |     """
 31 | 
 32 |     def __new__(
 33 |         cls: Type["JSONExtractor"], *args: Any, **kwargs: Any
 34 |     ) -> "JSONExtractor":
 35 |         if json_extractor_backend is None:
 36 |             raise RuntimeError(
 37 |                 "'jsonpath-extractor', 'jsonpath-rw' or 'jsonpath-rw-ext' "
 38 |                 "package is needed, run pip to install it. "
 39 |             )
 40 | 
 41 |         obj: JSONExtractor
 42 |         if cls is JSONExtractor:
 43 |             # invoke the json extractor backend for object creation
 44 |             # TODO: cache renamed type
 45 |             obj = super(AbstractSimpleExtractor, cls).__new__(
 46 |                 type(
 47 |                     "JSONExtractor", (json_extractor_backend,), {}
 48 |                 )  # rename into JSONExtractor
 49 |             )
 50 |         else:
 51 |             # invoke subclasses directly
 52 |             obj = super(AbstractSimpleExtractor, cls).__new__(cls)
 53 | 
 54 |         return obj
 55 | 
 56 |     def extract(self, element: Any) -> Any:
 57 |         raise NotImplementedError
 58 | 
 59 | 
 60 | try:
 61 |     # Third Party Library
 62 |     import jsonpath_rw
 63 | 
 64 |     _missing_jsonpath_rw = False
 65 | except ImportError:
 66 |     _missing_jsonpath_rw = True
 67 | 
 68 | 
 69 | class JSONPathRWExtractor(JSONExtractor):
 70 |     """
 71 |     Use JSONPath expression implemented by **jsonpath-rw** package
 72 |     for JSON data extracting.
 73 | 
 74 |     Before extracting, should parse the JSON text into Python object.
 75 | 
 76 |     :param expr: JSONPath Expression.
 77 |     :type expr: str
 78 |     """
 79 | 
 80 |     if TYPE_CHECKING:
 81 |         # Third Party Library
 82 |         from jsonpath_rw import JSONPath
 83 |     _jsonpath = Property["JSONPath"]()
 84 | 
 85 |     def __init__(self, expr: str) -> None:
 86 |         super(JSONExtractor, self).__init__(expr)
 87 |         if _missing_jsonpath_rw:
 88 |             _missing_dependency("jsonpath-rw")
 89 | 
 90 |         # Third Party Library
 91 |         from jsonpath_rw.lexer import JsonPathLexerError
 92 | 
 93 |         try:
 94 |             self._jsonpath = jsonpath_rw.parse(self.expr)
 95 |         except (JsonPathLexerError, Exception) as exc:
 96 |             # jsonpath_rw.parser.JsonPathParser.p_error raises exc of Exception type
 97 |             raise ExprError(extractor=self, exc=exc) from exc
 98 | 
 99 |     def extract(self, element: Any) -> Any:
100 |         """
101 |         Extract data from JSON data.
102 | 
103 |         :param element: Python object parsed from JSON text.
104 |         :type element: Any
105 | 
106 |         :returns: Data.
107 |         :rtype: Any
108 |         """
109 |         return [m.value for m in self._jsonpath.find(element)]
110 | 
111 | 
112 | try:
113 |     # Third Party Library
114 |     import jsonpath_rw_ext
115 | 
116 |     _missing_jsonpath_rw_ext = False
117 | except ImportError:
118 |     _missing_jsonpath_rw_ext = True
119 | 
120 | 
121 | class JSONPathRWExtExtractor(JSONPathRWExtractor):
122 |     """
123 |     Use JSONPath expression implemented by **jsonpath-rw-ext** package
124 |     for JSON data extracting.
125 | 
126 |     Before extracting, should parse the JSON text into Python object.
127 | 
128 |     :param expr: JSONPath Expression.
129 |     :type expr: str
130 |     """
131 | 
132 |     if TYPE_CHECKING:
133 |         # Third Party Library
134 |         from jsonpath_rw_ext import JSONPath as JSONPathExt
135 |     _jsonpath = Property["JSONPathExt"]()
136 | 
137 |     def __init__(self, expr: str) -> None:
138 |         super(JSONExtractor, self).__init__(expr)
139 |         if _missing_jsonpath_rw_ext:
140 |             _missing_dependency("jsonpath-rw-ext")
141 | 
142 |         # Third Party Library
143 |         from jsonpath_rw.lexer import JsonPathLexerError
144 | 
145 |         try:
146 |             self._jsonpath = jsonpath_rw_ext.parse(self.expr)
147 |         except (JsonPathLexerError, Exception) as exc:
148 |             # jsonpath_rw.parser.JsonPathParser.p_error raises exc of Exception type
149 |             raise ExprError(extractor=self, exc=exc) from exc
150 | 
151 | 
152 | try:
153 |     # Third Party Library
154 |     import jsonpath
155 | 
156 |     _missing_jsonpath = False
157 | except ImportError:
158 |     _missing_jsonpath = True
159 | 
160 | 
161 | class JSONPathExtractor(JSONExtractor):
162 |     """
163 |     Use JSONPath expression implemented by **jsonpath-extractor** package
164 |     for JSON data extracting.
165 | 
166 |     Before extracting, should parse the JSON text into Python object.
167 | 
168 |     :param expr: JSONPath Expression.
169 |     :type expr: str
170 |     """
171 | 
172 |     if TYPE_CHECKING:
173 |         # Third Party Library
174 |         from jsonpath import Expr
175 | 
176 |     _jsonpath = Property["Expr"]()
177 | 
178 |     def __init__(self, expr: str) -> None:
179 |         super(JSONExtractor, self).__init__(expr)
180 | 
181 |         if _missing_jsonpath:
182 |             _missing_dependency("jsonpath-extractor")
183 | 
184 |         try:
185 |             self._jsonpath = jsonpath.parse(self.expr)
186 |         except SyntaxError as exc:
187 |             raise ExprError(extractor=self, exc=exc) from exc
188 | 
189 |     def extract(self, element: Any) -> Any:
190 |         """
191 |         Extract data from JSON data.
192 | 
193 |         :param element: Python object parsed from JSON text.
194 |         :type element: Any
195 | 
196 |         :returns: Data.
197 |         :rtype: Any
198 |         """
199 |         return self._jsonpath.find(element)
200 | 
201 | 
202 | json_extractor_backend: Optional[Type[JSONExtractor]] = None
203 | if not _missing_jsonpath:
204 |     json_extractor_backend = JSONPathExtractor
205 | elif not _missing_jsonpath_rw_ext:
206 |     json_extractor_backend = JSONPathRWExtExtractor
207 | elif not _missing_jsonpath_rw:
208 |     json_extractor_backend = JSONPathRWExtractor
209 | 
210 | 
211 | __all__ = (
212 |     "JSONExtractor",
213 |     "JSONPathExtractor",
214 |     "JSONPathRWExtExtractor",
215 |     "JSONPathRWExtractor",
216 |     "json_extractor_backend",
217 | )
218 | 


--------------------------------------------------------------------------------
/data_extractor/lxml.py:
--------------------------------------------------------------------------------
  1 | """
  2 | :mod:`lxml` -- Extractors for XML or HTML data extracting.
  3 | ==========================================================
  4 | """
  5 | 
  6 | # Standard Library
  7 | from typing import List, Union
  8 | 
  9 | # Local Folder
 10 | from .core import AbstractSimpleExtractor
 11 | from .exceptions import ExprError
 12 | from .utils import Property, _missing_dependency
 13 | 
 14 | try:
 15 |     # Third Party Library
 16 |     from lxml.etree import XPath, XPathSyntaxError
 17 |     from lxml.etree import _Element as Element
 18 | 
 19 |     _missing_lxml = False
 20 | except ImportError:
 21 |     _missing_lxml = True
 22 | 
 23 |     Element = None  # TODO: Find a way to get rid of this. See PEP 562
 24 | 
 25 | 
 26 | class XPathExtractor(AbstractSimpleExtractor):
 27 |     """
 28 |     Use XPath for XML or HTML data extracting.
 29 | 
 30 |     Before extracting, should parse the XML or HTML text \
 31 |         into :class:`data_extractor.lxml.Element` object.
 32 | 
 33 |     :param expr: XPath Expression.
 34 |     :type expr: str
 35 |     """
 36 | 
 37 |     _find = Property["XPath"]()
 38 | 
 39 |     def __init__(self, expr: str):
 40 |         super().__init__(expr)
 41 | 
 42 |         if _missing_lxml:
 43 |             _missing_dependency("lxml")
 44 | 
 45 |         try:
 46 |             self._find = XPath(self.expr)
 47 |         except XPathSyntaxError as exc:
 48 |             raise ExprError(extractor=self, exc=exc) from exc
 49 | 
 50 |     def extract(self, element: Element) -> Union[List[Element], List[str]]:
 51 |         """
 52 |         Extract subelements or data from XML or HTML data.
 53 | 
 54 |         :param element: Target.
 55 |         :type element: :class:`data_extractor.lxml.Element`
 56 | 
 57 |         :returns: List of :class:`data_extractor.lxml.Element` objects, \
 58 |             List of str, or str.
 59 |         :rtype: list
 60 | 
 61 |         :raises data_extractor.exceptions.ExprError: XPath Expression Error.
 62 |         """
 63 |         # Third Party Library
 64 |         from lxml.etree import XPathEvalError
 65 | 
 66 |         try:
 67 |             rv = self._find(element)
 68 |             if not isinstance(rv, list):
 69 |                 return [rv]
 70 |             else:
 71 |                 return rv
 72 |         except XPathEvalError as exc:
 73 |             raise ExprError(extractor=self, exc=exc) from exc
 74 | 
 75 | 
 76 | try:
 77 |     # Third Party Library
 78 |     import cssselect
 79 | 
 80 |     del cssselect
 81 |     _missing_cssselect = False
 82 | except ImportError:
 83 |     _missing_cssselect = True
 84 | 
 85 | 
 86 | class CSSExtractor(AbstractSimpleExtractor):
 87 |     """
 88 |     Use CSS Selector for XML or HTML data subelements extracting.
 89 | 
 90 |     Before extracting, should parse the XML or HTML text \
 91 |         into :class:`data_extractor.lxml.Element` object.
 92 | 
 93 |     :param expr: CSS Selector Expression.
 94 |     :type expr: str
 95 |     """
 96 | 
 97 |     _extractor = Property[XPathExtractor]()
 98 | 
 99 |     def __init__(self, expr: str):
100 |         super().__init__(expr)
101 | 
102 |         if _missing_cssselect:
103 |             _missing_dependency("cssselect")
104 | 
105 |         # Third Party Library
106 |         from cssselect import GenericTranslator
107 |         from cssselect.parser import SelectorError
108 | 
109 |         try:
110 |             xpath_expr = GenericTranslator().css_to_xpath(self.expr)
111 |         except SelectorError as exc:
112 |             raise ExprError(extractor=self, exc=exc) from exc
113 | 
114 |         self._extractor = XPathExtractor(xpath_expr)
115 | 
116 |     def extract(self, element: Element) -> List[Element]:
117 |         """
118 |         Extract subelements from XML or HTML data.
119 | 
120 |         :param element: Target.
121 |         :type element: :class:`data_extractor.lxml.Element`
122 | 
123 |         :returns: List of :class:`data_extractor.lxml.Element` objects, \
124 |             extracted result.
125 |         :rtype: list
126 |         """
127 |         return self._extractor.extract(element)
128 | 
129 | 
130 | class TextCSSExtractor(CSSExtractor):
131 |     """
132 |     Use CSS Selector for XML or HTML data subelements' text extracting.
133 | 
134 |     Before extracting, should parse the XML or HTML text \
135 |         into :class:`data_extractor.lxml.Element` object.
136 | 
137 |     :param expr: CSS Selector Expression.
138 |     :type expr: str
139 |     """
140 | 
141 |     def extract(self, element: Element) -> List[str]:
142 |         """
143 |         Extract subelements' text from XML or HTML data.
144 | 
145 |         :param element: Target.
146 |         :type element: :class:`data_extractor.lxml.Element`
147 | 
148 |         :returns: List of str, extracted result.
149 |         :rtype: list
150 | 
151 |         :raises ~data_extractor.exceptions.ExprError: CSS Selector Expression Error.
152 |         """
153 |         return [ele.text for ele in super().extract(element)]
154 | 
155 | 
156 | class AttrCSSExtractor(CSSExtractor):
157 |     """
158 |     Use CSS Selector for XML or HTML data subelements' attribute value extracting.
159 | 
160 |     Before extracting, should parse the XML or HTML text \
161 |         into :class:`data_extractor.lxml.Element` object.
162 | 
163 |     :param expr: CSS Selector Expression.
164 |     :type expr: str
165 |     :param attr: Target attribute name.
166 |     :type attr: str
167 |     """
168 | 
169 |     attr = Property[str]()
170 | 
171 |     def __init__(self, expr: str, attr: str):
172 |         super().__init__(expr)
173 |         self.attr = attr
174 | 
175 |     def __repr__(self) -> str:
176 |         return f"{self.__class__.__name__}(expr={self.expr!r}, attr={self.attr!r})"
177 | 
178 |     def extract(self, element: Element) -> List[str]:
179 |         """
180 |         Extract subelements' attribute value from XML or HTML data.
181 | 
182 |         :param element: Target.
183 |         :type element: :class:`data_extractor.lxml.Element`
184 | 
185 |         :returns: List of str, extracted result.
186 |         :rtype: list
187 | 
188 |         :raises ~data_extractor.exceptions.ExprError: CSS Selector Expression Error.
189 |         """
190 |         return [
191 |             ele.get(self.attr)
192 |             for ele in super().extract(element)
193 |             if self.attr in ele.keys()
194 |         ]
195 | 
196 | 
197 | __all__ = (
198 |     "AttrCSSExtractor",
199 |     "CSSExtractor",
200 |     "Element",
201 |     "TextCSSExtractor",
202 |     "XPathExtractor",
203 | )
204 | 


--------------------------------------------------------------------------------
/data_extractor/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linw1995/data_extractor/ca1a4c4dacec7852590ad7bbf1bee421a3ab1e4a/data_extractor/py.typed


--------------------------------------------------------------------------------
/data_extractor/utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | =================================
  3 | :mod:`utils` -- Extracting Utils.
  4 | =================================
  5 | """
  6 | 
  7 | # Standard Library
  8 | import inspect
  9 | 
 10 | from types import FrameType
 11 | from typing import (
 12 |     TYPE_CHECKING,
 13 |     Any,
 14 |     Callable,
 15 |     Generic,
 16 |     Optional,
 17 |     Type,
 18 |     TypeVar,
 19 |     Union,
 20 |     overload,
 21 | )
 22 | 
 23 | 
 24 | class __Sentinel:
 25 |     """Singleton."""
 26 | 
 27 |     def __repr__(self) -> str:
 28 |         return "sentinel"
 29 | 
 30 | 
 31 | sentinel = __Sentinel()
 32 | 
 33 | 
 34 | class LazyStr:
 35 |     """
 36 |     Lazy String.
 37 | 
 38 |     :param func: Lazy __str__ function.
 39 |     """
 40 | 
 41 |     def __init__(self, func: Callable[[], str]):
 42 |         self.func = func
 43 | 
 44 |     def __str__(self) -> str:
 45 |         return self.func()
 46 | 
 47 | 
 48 | def is_extractor(obj: Any) -> bool:
 49 |     """
 50 |     Determine the object if it is an extractor, return :obj:`True` if it is.
 51 |     """
 52 |     # Local Folder
 53 |     from .core import AbstractComplexExtractor, AbstractSimpleExtractor
 54 | 
 55 |     return isinstance(obj, (AbstractComplexExtractor, AbstractSimpleExtractor))
 56 | 
 57 | 
 58 | def is_simple_extractor(obj: Any) -> bool:
 59 |     """
 60 |     Determine the object if it is a simple extractor, return :obj:`True` if it is.
 61 |     """
 62 |     # Local Folder
 63 |     from .core import AbstractSimpleExtractor
 64 | 
 65 |     return isinstance(obj, AbstractSimpleExtractor)
 66 | 
 67 | 
 68 | def is_complex_extractor(obj: Any) -> bool:
 69 |     """
 70 |     Determine the object if it is a complex extractor, return :obj:`True` if it is.
 71 |     """
 72 |     # Local Folder
 73 |     from .core import AbstractComplexExtractor
 74 | 
 75 |     return isinstance(obj, AbstractComplexExtractor)
 76 | 
 77 | 
 78 | def getframe(depth: int = 0) -> Optional[FrameType]:
 79 |     cur = frame = inspect.currentframe()
 80 |     if frame is None:
 81 |         # Running in an implementation without Python stack frame support.
 82 |         return None
 83 | 
 84 |     while depth > -1:
 85 |         if cur is None:
 86 |             raise ValueError(f"Invalid depth = {depth!r} for frame = {frame!r}")
 87 | 
 88 |         cur = cur.f_back
 89 |         depth -= 1
 90 | 
 91 |     return cur
 92 | 
 93 | 
 94 | T = TypeVar("T")
 95 | 
 96 | if TYPE_CHECKING:
 97 |     # Local Folder
 98 |     from .core import AbstractExtractors
 99 | 
100 | 
101 | class Property(Generic[T]):
102 |     """
103 |     Extractor property.
104 |     """
105 | 
106 |     def __set_name__(self, owner: Any, name: str) -> None:
107 |         """
108 |         Customized names -- Descriptor HowTo Guide
109 |         https://docs.python.org/3/howto/descriptor.html#customized-names
110 |         """
111 |         self.public_name = name
112 |         self.private_name = f"__property_{name}"
113 | 
114 |     @overload
115 |     def __get__(self, obj: None, cls: Type["AbstractExtractors"]) -> "Property[T]":
116 |         pass
117 | 
118 |     @overload
119 |     def __get__(self, obj: Any, cls: Type["AbstractExtractors"]) -> T:
120 |         pass
121 | 
122 |     def __get__(
123 |         self, obj: Any, cls: Type["AbstractExtractors"]
124 |     ) -> Union["Property[T]", T]:
125 |         if obj is None:
126 |             return self
127 | 
128 |         try:
129 |             return getattr(obj, self.private_name)
130 |         except AttributeError as exc:
131 |             # raise right AttributeError
132 |             msg: str = exc.args[0]
133 |             raise AttributeError(msg.replace(self.private_name, self.public_name))
134 | 
135 |     def __set__(self, obj: Any, value: T) -> T:
136 |         if hasattr(obj, self.private_name):
137 |             raise AttributeError("can't set attribute")
138 |         else:
139 |             setattr(obj, self.private_name, value)
140 |             return value
141 | 
142 |     @staticmethod
143 |     def change_internal_value(
144 |         obj: "AbstractExtractors", property_name: str, value: T
145 |     ) -> None:
146 |         attr = getattr(type(obj), property_name)
147 |         if not isinstance(attr, Property):
148 |             raise AttributeError(f"Type of attribute {property_name!r} is not Property")
149 | 
150 |         setattr(obj, attr.private_name, value)
151 | 
152 | 
153 | def _missing_dependency(dependency: str) -> None:
154 |     """
155 |     Raise :class:`RuntimeError` for an extractor class missing an optional dependency.
156 |     """
157 |     raise RuntimeError(f"{dependency!r} package is needed, run pip to install it. ")
158 | 
159 | 
160 | __all__ = (
161 |     "LazyStr",
162 |     "Property",
163 |     "getframe",
164 |     "is_complex_extractor",
165 |     "is_extractor",
166 |     "is_simple_extractor",
167 |     "sentinel",
168 | )
169 | 


--------------------------------------------------------------------------------
/default.nix:
--------------------------------------------------------------------------------
 1 | {
 2 |   lib,
 3 |   dream2nix,
 4 |   ...
 5 | }: {
 6 |   imports = [
 7 |     dream2nix.modules.dream2nix.WIP-python-pdm
 8 |   ];
 9 | 
10 |   mkDerivation = {
11 |     src = lib.cleanSourceWith {
12 |       src = lib.cleanSource ./.;
13 |       filter = name: type:
14 |         !(builtins.any (x: x) [
15 |           (lib.hasSuffix ".nix" name)
16 |           (lib.hasPrefix "." (builtins.baseNameOf name))
17 |           (lib.hasSuffix "flake.lock" name)
18 |         ]);
19 |     };
20 |   };
21 | 
22 |   pdm.lockfile = ./pdm.lock;
23 |   pdm.pyproject = ./pyproject.toml;
24 | 
25 |   buildPythonPackage = {
26 |     pythonImportsCheck = [
27 |       "data_extractor"
28 |     ];
29 |   };
30 | 
31 |   pdm.editables = lib.mkForce {};
32 | }
33 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SOURCEDIR     = source
 8 | BUILDDIR      = build
 9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | REM Run from the directory that contains this script.
 4 | pushd %~dp0
 5 | 
 6 | REM Command file for Sphinx documentation
 7 | 
 8 | REM Fall back to "sphinx-build" when SPHINXBUILD is not set.
 9 | if "%SPHINXBUILD%" == "" (
10 | 	set SPHINXBUILD=sphinx-build
11 | )
12 | set SOURCEDIR=source
13 | set BUILDDIR=build
14 | 
15 | REM With no target argument, show the Sphinx help.
16 | if "%1" == "" goto help
17 | 
18 | REM Probe the builder; errorlevel 9009 or above is treated as "not found".
19 | %SPHINXBUILD% >NUL 2>NUL
20 | if errorlevel 9009 (
21 | 	echo.
22 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
23 | 	echo.installed, then set the SPHINXBUILD environment variable to point
24 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
25 | 	echo.may add the Sphinx directory to PATH.
26 | 	echo.
27 | 	echo.If you don't have Sphinx installed, grab it from
28 | 	echo.http://sphinx-doc.org/
29 | 	REM Undo the pushd above so the caller's directory is restored.
30 | 	popd
31 | 	exit /b 1
32 | )
33 | 
34 | REM Forward the requested target to Sphinx "make mode".
35 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
36 | goto end
37 | 
38 | :help
39 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
40 | 
41 | :end
42 | popd
43 | 


--------------------------------------------------------------------------------
/docs/source/_static/custom.css:
--------------------------------------------------------------------------------
 1 | /* Sidebar width. */
 2 | div.sphinxsidebar {
 3 |     width: 250px;
 4 | }
 5 | 
 6 | /* Space below class entries. */
 7 | dl.class {
 8 |     margin-bottom: 1rem;
 9 | }
10 | 
11 | /* Smaller space below method entries and field lists. */
12 | dl.method,
13 | dl.field-list {
14 |     margin-bottom: 0.5rem;
15 | }
16 | 
17 | /* Padding for preformatted blocks. */
18 | pre {
19 |     padding: 7px 15px;
20 | }
21 | 


--------------------------------------------------------------------------------
/docs/source/api_core.rst:
--------------------------------------------------------------------------------
 1 | .. automodule:: data_extractor.core
 2 | 
 3 | .. autoclass:: data_extractor.core.SimpleExtractorMeta
 4 | 
 5 | .. autoclass:: data_extractor.core.ComplexExtractorMeta
 6 | 
 7 | .. autoclass:: data_extractor.core.AbstractSimpleExtractor
 8 |     :members:
 9 | 
10 | .. autoclass:: data_extractor.core.AbstractComplexExtractor
11 |     :members:
12 | 


--------------------------------------------------------------------------------
/docs/source/api_exceptions.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.exceptions
2 | 
3 | .. autoexception:: data_extractor.exceptions.ExprError
4 | 
5 | .. autoexception:: data_extractor.exceptions.ExtractError
6 | 


--------------------------------------------------------------------------------
/docs/source/api_item.rst:
--------------------------------------------------------------------------------
 1 | .. automodule:: data_extractor.item
 2 | 
 3 | .. autoclass:: data_extractor.item.Field
 4 |     :show-inheritance:
 5 |     :inherited-members:
 6 |     :members:
 7 | 
 8 | .. autoclass:: data_extractor.item.Item
 9 |     :show-inheritance:
10 |     :inherited-members:
11 |     :members: extract, field_names, simplify
12 | 


--------------------------------------------------------------------------------
/docs/source/api_json.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.json
2 |     :members:
3 |     :inherited-members:
4 |     :show-inheritance:
5 | 


--------------------------------------------------------------------------------
/docs/source/api_lxml.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: data_extractor.lxml
2 |     :members:
3 |     :inherited-members:
4 |     :show-inheritance:
5 | 


--------------------------------------------------------------------------------
/docs/source/api_reference.rst:
--------------------------------------------------------------------------------
 1 | =============
 2 | API Reference
 3 | =============
 4 | 
 5 | .. automodule:: data_extractor
 6 | 
 7 | .. toctree::
 8 |    :name: API Reference
 9 |    :maxdepth: 2
10 | 
11 |    api_core
12 |    api_exceptions
13 |    api_utils
14 |    api_lxml
15 |    api_json
16 |    api_item
17 | 


--------------------------------------------------------------------------------
/docs/source/api_utils.rst:
--------------------------------------------------------------------------------
 1 | .. automodule:: data_extractor.utils
 2 | 
 3 | .. autoclass:: data_extractor.utils.LazyStr
 4 | 
 5 | .. autodata:: data_extractor.utils.sentinel
 6 | 
 7 | .. autofunction:: data_extractor.utils.is_extractor
 8 | 
 9 | .. autofunction:: data_extractor.utils.is_simple_extractor
10 | 
11 | .. autofunction:: data_extractor.utils.is_complex_extractor
12 | 
13 | .. autoclass:: data_extractor.utils.Property
14 | 


--------------------------------------------------------------------------------
/docs/source/changelog.rst:
--------------------------------------------------------------------------------
 1 | =========
 2 | Changelog
 3 | =========
 4 | 
 5 | v1.0.1
 6 | ~~~~~~
 7 | 
 8 | **Build**
 9 | 
10 | - Supports Python 3.13
11 | 
12 | 
13 | .. include:: history.rst
14 |     :start-line: 4
15 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # http://www.sphinx-doc.org/en/master/config
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | 
17 | 
18 | # -- Project information -----------------------------------------------------
19 | 
20 | # Standard Library
21 | from datetime import date
22 | 
23 | project = "Data-Extractor"
24 | year = date.today().year
25 | copyright = f"{year}, 林玮"
26 | author = "林玮"
27 | 
28 | 
29 | # -- General configuration ---------------------------------------------------
30 | 
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.githubpages"]
35 | autodoc_inherit_docstrings = True
36 | 
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ["_templates"]
39 | 
40 | # List of patterns, relative to source directory, that match files and
41 | # directories to ignore when looking for source files.
42 | # This pattern also affects html_static_path and html_extra_path.
43 | exclude_patterns = []
44 | 
45 | 
46 | # -- Options for HTML output -------------------------------------------------
47 | 
48 | # The theme to use for HTML and HTML Help pages.  See the documentation for
49 | # a list of builtin themes.
50 | #
51 | html_theme = "alabaster"
52 | html_theme_options = {
53 |     "description": "Combine XPath, CSS Selectors and JSONPath for Web data extracting.",
54 |     "github_button": True,
55 |     "github_type": "star",
56 |     "travis_button": True,
57 |     "codecov_button": True,
58 |     "github_user": "linw1995",
59 |     "github_repo": "data_extractor",
60 |     "fixed_sidebar": False,
61 |     "page_width": "1024px",
62 |     "sidebar_width": "230px",
63 | }
64 | # Add any paths that contain custom static files (such as style sheets) here,
65 | # relative to this directory. They are copied after the builtin static files,
66 | # so a file named "default.css" will overwrite the builtin "default.css".
67 | html_static_path = ["_static"]
68 | html_sidebars = {
69 |     "**": ["about.html", "navigation.html", "relations.html", "searchbox.html"]
70 | }
71 | 


--------------------------------------------------------------------------------
/docs/source/contributing.rst:
--------------------------------------------------------------------------------
 1 | ============
 2 | Contributing
 3 | ============
 4 | 
 5 | 
 6 | Environment Setup
 7 | ~~~~~~~~~~~~~~~~~
 8 | 
 9 | Clone the source code from GitHub.
10 | 
11 | .. code-block:: shell
12 | 
13 |     git clone https://github.com/linw1995/data_extractor.git
14 |     cd data_extractor
15 | 
16 | Setup the development environment.
17 | Please make sure you install the pdm_,
18 | pre-commit_ and nox_ CLIs in your environment.
19 | 
20 | .. code-block:: shell
21 | 
22 |     make init
23 |     make PYTHON=3.7 init  # for specific python version
24 | 
25 | Linting
26 | ~~~~~~~
27 | 
28 | Use pre-commit_ for installing linters to ensure a good code style.
29 | 
30 | .. code-block:: shell
31 | 
32 |     make pre-commit
33 | 
34 | Run linters. Some linters run via CLI nox_, so make sure you install it.
35 | 
36 | .. code-block:: shell
37 | 
38 |     make check-all
39 | 
40 | Testing
41 | ~~~~~~~
42 | 
43 | Run quick tests.
44 | 
45 | .. code-block:: shell
46 | 
47 |     make
48 | 
49 | Run quick tests with verbose output.
50 | 
51 | .. code-block:: shell
52 | 
53 |     make vtest
54 | 
55 | Run tests with coverage.
56 | Testing in multiple Python environments is powered by CLI nox_.
57 | 
58 | .. code-block:: shell
59 | 
60 |     make cov
61 | 
62 | .. _pdm: https://github.com/pdm-project/pdm
63 | .. _pre-commit: https://pre-commit.com/
64 | .. _nox: https://nox.thea.codes/en/stable/
65 | 


--------------------------------------------------------------------------------
/docs/source/history.rst:
--------------------------------------------------------------------------------
  1 | =======
  2 | History
  3 | =======
  4 | 
  5 | v1.0.0
  6 | ~~~~~~
  7 | 
  8 | **Feature**
  9 | 
 10 | - Generic extractor with convertor (#83)
 11 | - mypy plugin for type annotation of extracting result (#83)
 12 | 
 13 | v0.10.2
 14 | ~~~~~~~
 15 | 
 16 | **Build**
 17 | 
 18 | - upgrade jsonpath-extractor to v0.8.0
 19 | 
 20 | 
 21 | v0.10.1
 22 | ~~~~~~~
 23 | 
 24 | **Fix**
 25 | 
 26 | - typo in .utils.Property
 27 | 
 28 | v0.10.0
 29 | ~~~~~~~
 30 | 
 31 | **Feature**
 32 | 
 33 | - supports PEP 561 -- Distributing and Packaging Type Information
 34 | 
 35 | **Fix**
 36 | 
 37 | - remove LICENSE file from dist files
 38 | - duplicated extracting if class attrs overlap happened #67
 39 | - remove super class sub-extractors error #68
 40 | 
 41 | **Refactor**
 42 | 
 43 | - remove duplicated module "data_extractor.abc"
 44 | - remove the lazy build mechanism of extractors
 45 | - JSON backend invoking mechanism
 46 | - make all properties of extractors immutable
 47 | 
 48 | **Document**
 49 | 
 50 | - fix wrong docstring of "data_extractor.utils.Property"
 51 | 
 52 | v0.9.0
 53 | ~~~~~~
 54 | 
 55 | **Fix**
 56 | 
 57 | - type annotations #63 #64
 58 | 
 59 | **Refactor**
 60 | 
 61 | - .utils.Property with "Customized names" support #64
 62 | - rename .abc to .core and mark elder duplicated #65
 63 | 
 64 | v0.8.0
 65 | ~~~~~~
 66 | 
 67 | - 11bfd2c supports latest jsonpath-extractor package
 68 | 
 69 | v0.7.0
 70 | ~~~~~~
 71 | 
 72 | - 65d1fce Fix:Create JSONExtractor with wrong subtype
 73 | - 407cd78 New:Make lxml and cssselect optional (#61)
 74 | 
 75 | v0.6.1
 76 | ~~~~~~
 77 | 
 78 | - d28fff4 Fix:Item created error by ``type`` function. (Issue #56)
 79 | 
 80 | v0.6.0
 81 | ~~~~~~
 82 | 
 83 | - f1d21fe New:Make different implementations of JSONExtractor optional
 84 | - 0175cde New:Add jsonpath-extractor as optional json extractor backend
 85 | - 3b6da8b Chg:Upgrade dependencies
 86 | 
 87 | v0.6.0-alpha.3
 88 | ~~~~~~~~~~~~~~
 89 | 
 90 | - 1982302 Fix:Type annotation error
 91 | 
 92 | v0.6.0.dev2
 93 | ~~~~~~~~~~~
 94 | 
 95 | - b7edbae Dev,New:Use nox test in multi-py-versions, Update workflow
 96 | - a043838 Fix:Can't import JSONPathExtractor from root module
 97 | - a23ece9 Test,Fix:Missing JSONPathExtractor in simple extractor tests
 98 | - 5903ff9 Dev,Fix:Nox changes symlink '.venv' of virtualenv of development
 99 | - 57d03ad Dev,Fix:Install unneeded development dependencies
100 | 
101 | v0.6.0.dev1
102 | ~~~~~~~~~~~
103 | 
104 | - 2459f7d Dev,New:Add Github Actions for CI
105 | - a151a91 Dev,New:Add scripts/export_requirements_txt.sh
106 | - f7cdaa3 Dev,Chg:Remove travis-ci
107 | - f1d21fe New:Make different implementations of JSONExtractor optional
108 | - 9f74619 Fix:Use __getattr__ on the module in the wrong way
109 | - 25a8bf8 Dev,Fix:Cannot use pytest.mark.usefixtures() in pytest.param
110 | - 8f51603 Dev,Chg:Upgrade poetry version in Makefile
111 | - 21aa08e Dev,Chg:Test in two ways
112 | - 4cb4678 Chg:Upgrade dependencies
113 | - 4177b98 Dev,Fix:remove the venv before pretest installation
114 | - 0175cde New:Add jsonpath-extractor as optional json extractor backend
115 | 
116 | v0.5.4
117 | ~~~~~~
118 | 
119 | - 9552c79 Fix:Simplified item's extract_first method fail to raise ExtractError
120 | - 08167ab Fix:Simplified item's extract_first method
121 |   should support param default
122 | - 6e4c269 New:More unittest for testing the simplified items
123 | - a35b85a Chg:Update poetry.lock
124 | - e5ff37b Docs,Chg:Update travis-ci status source in the README.rst
125 | 
126 | v0.5.3
127 | ~~~~~~
128 | 
129 | - 6a26be5 Chg:Wrap the single return value as a list
130 | - 0b63927 Fix:Item can not extract the data is list type
131 | - 9deeb5f Chg:Update poetry.lock
132 | 
133 | v0.5.2
134 | ~~~~~~
135 | 
136 | - 0561672 Fix:Wrong parameter name
137 | 
138 | v0.5.1
139 | ~~~~~~
140 | 
141 | - c9b07f4 Fix:Wrong shield placing
142 | - b198712 Dev,Fix:Build travis-ci config validation
143 | 
144 | v0.5.0
145 | ~~~~~~
146 | 
147 | - 0056f37 Split AbstractExtractor into AbstractSimpleExtractor and
148 |   AbstractComplexExtractor
149 | - c42aeb5 Feature/more friendly development setup (#34)
150 | - 2f9a71c New:Support testing in 3.8
151 | - c8bd593 New:Stash unstaged code before testing
152 | - d2a18a8 New:Best way to raise new exc
153 | - 90fa9c8 New:ExprError ``__str__`` implementation
154 | - d961768 Fix:Update mypy pre-commit config
155 | - e5d59c3 New:Raise SyntaxError when field overwrites method (#38)
156 | - 7720fb9 Feature/avoid field overwriting (#39)
157 | - b722717 Dev,Fix:Black configure not working
158 | - f8f0df8 New:Implement extractors' build method
159 | - 98ada74 Chg:Update docs
160 | 
161 | v0.4.1
162 | ~~~~~~
163 | 
164 | - d180992 Add pre-commit support and fix pre-commit check error (#32)
165 | - bd680c1 Update pyproject.toml
166 | - 64f30f7 remove unhappened conditional
167 | 
168 | v0.4.0
169 | ~~~~~~
170 | 
171 | - 74f569b Update docs and lint docs (#31)
172 | - 4188634 Support RTD (#30)
173 | - a5b776f Separate dependencies (#29)
174 | - 69079b4 Generate simple extractor from a complex extractor (#28)
175 | - 58a7570 Support JSONPath ext syntax (#26)
176 | - bb7c602 Replace Pipenv with Poetry (#24)
177 | 
178 | v0.3.2
179 | ~~~~~~
180 | 
181 | - cd65ad0 Make Parameter extractor Optional
182 | 
183 | v0.2.2
184 | ~~~~~~
185 | 
186 | - fca801a Merge pull request #22 from linw1995/hotfix
187 | 
188 |   + 8bf2a62 Fix name overwritten syntax checking
189 |     that includes the ``__init__`` first parameter.
190 | 
191 |   + 10e2ca0 Fix raise wrong exception from python repl,
192 |     oneline code or type() creation.
193 | 
194 | v0.2.1
195 | ~~~~~~
196 | 
197 | - a05b75f Export all from the root module.
198 | - d2900d3 Add Optional Parameter name for special field name. (#19)
199 | - 99a4a7f Raise SyntaxError
200 |   when the field name is the same as Item's parameter… (#18)
201 | 
202 | v0.2.0
203 | ~~~~~~
204 | 
205 | - 9c2e2cd Rename ExtractFirstMixin into SimpleExtractorBase (#12)
206 | - bac925d Raise ValueError
207 |   when misplaced the complex extractor in complex extractor. (#13)
208 | 
209 | - 88b9227 Wrap expr exception (#14)
210 | - aeb9520 Deploy Docs on GitHub Pages. (#15)
211 | 
212 |   + Update docstring.
213 |   + Deploy Docs on Github Pages.
214 |   + Add Quickstarts.rst
215 | 
216 | - Bump into beta
217 | 
218 | v0.1.5
219 | ~~~~~~
220 | 
221 | - cabfac3 Add utils.py
222 | - 9e1c005 Make all extractor class inherit the same ABC.
223 | - 7828a1a Make easy to trace exception thrown
224 |   by complex extractor extracting data.
225 | 
226 | v0.1.4
227 | ~~~~~~
228 | 
229 | - f4267fe Modify docstr
230 | - 6f2f8d1 Add more docstr
231 | 
232 | v0.1.3
233 | ~~~~~~
234 | 
235 | - 5f4b0e0 Update README.md
236 | - 1b8bfb9 Add UserWarning when extractor can't extract first item from result
237 | - dd2cd25 Remove the useless _extract call
238 | - 655ec9d Add UserWarning when expr is conflict with parameter is_many=True
239 | - bcade2c Do not allow user to set is_many=True and default!=sentinel at the same time
240 | - 761bd30 Add more unit tests
241 | 
242 | v0.1.2
243 | ~~~~~~
244 | 
245 | - Add exceptions.py and ExprError
246 | - Change travis-ci deploy stage condition
247 | - Add travis-ci deploy github release
248 | 
249 | v0.1.1
250 | ~~~~~~
251 | 
252 | - Rename ``.html`` to ``.lxml``;
253 |   Remove ``fromstring``, ``tostring`` function from ``.lxml``
254 | 
255 |   + Rename .html to .lxml
256 |   + use ``lxml.html.fromstring`` and ``lxml.html.tostring`` to process HTML
257 |   + use ``lxml.etree.fromstring`` and ``lxml.etree.tostring`` to process XML
258 | 
259 | - Add check_isort, check_black, check,
260 |   check_all, fc: format_code into Makefile for development.
261 | 
262 | v0.1.0
263 | ~~~~~~
264 | 
265 | - initialize project
266 | - add Extractor to extract data from the text which format is HTML or JSON.
267 | - add complex extractor: Field, Item
268 | 


--------------------------------------------------------------------------------
/docs/source/howto/index.rst:
--------------------------------------------------------------------------------
 1 | =====================
 2 | Data-Extractor HOWTOs
 3 | =====================
 4 | 
 5 | Learning how to use data-extractor.
 6 | 
 7 | .. toctree::
 8 |     :maxdepth: 2
 9 | 
10 |     json
11 |     lxml
12 |     item
13 | 


--------------------------------------------------------------------------------
/docs/source/howto/item.rst:
--------------------------------------------------------------------------------
  1 | ==================
  2 | Complex Extracting
  3 | ==================
  4 | 
  5 | .. include:: lxml.rst
  6 |     :start-line: 7
  7 |     :end-before: Using
  8 | 
  9 | Defining :class:`ChannelItem` class, then extracting the data.
 10 | 
 11 | .. code-block:: python3
 12 | 
 13 |     from data_extractor import Field, Item, XPathExtractor
 14 | 
 15 | 
 16 |     class ChannelItem(Item):
 17 |         title = Field(XPathExtractor("./title/text()"), default="")
 18 |         link = Field(XPathExtractor("./link/text()"), default="")
 19 |         description = Field(XPathExtractor("./description/text()"))
 20 |         publish_date = Field(XPathExtractor("./pubDate/text()"))
 21 |         guid = Field(XPathExtractor("./guid/text()"))
 22 | 
 23 | Extracting all channel items from file.
 24 | 
 25 | .. code-block:: python3
 26 | 
 27 |     from data_extractor import XPathExtractor
 28 | 
 29 |     extractor = ChannelItem(XPathExtractor("//channel/item"), is_many=True)
 30 |     assert extractor.extract(root)[:2] == [
 31 |         {
 32 |             "title": "Star City",
 33 |             "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
 34 |             "description": 'How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia\'s <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.',
 35 |             "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT",
 36 |             "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
 37 |         },
 38 |         {
 39 |             "title": "",
 40 |             "link": "",
 41 |             "description": 'Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st.',
 42 |             "publish_date": "Fri, 30 May 2003 11:06:42 GMT",
 43 |             "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
 44 |         },
 45 |     ]
 46 | 
 47 | Nested Extractors
 48 | ~~~~~~~~~~~~~~~~~
 49 | 
 50 | Defining :class:`Channel` class with :class:`ChannelItem`.
 51 | 
 52 | .. code-block:: python3
 53 | 
 54 |     class Channel(Item):
 55 |         title = Field(XPathExtractor("./title/text()"))
 56 |         link = Field(XPathExtractor("./link/text()"))
 57 |         description = Field(XPathExtractor("./description/text()"))
 58 |         language = Field(XPathExtractor("./language/text()"))
 59 |         publish_date = Field(XPathExtractor("./pubDate/text()"))
 60 |         last_build_date = Field(XPathExtractor("./lastBuildDate/text()"))
 61 |         docs = Field(XPathExtractor("./docs/text()"))
 62 |         generator = Field(XPathExtractor("./generator/text()"))
 63 |         managing_editor = Field(XPathExtractor("./managingEditor/text()"))
 64 |         web_master = Field(XPathExtractor("./webMaster/text()"))
 65 | 
 66 |         items = ChannelItem(XPathExtractor("./item[position()<3]"), is_many=True)
 67 | 
 68 | Extracting the rss channel data from file.
 69 | 
 70 | .. code-block:: python3
 71 | 
 72 |     from data_extractor import XPathExtractor
 73 | 
 74 |     extractor = Channel(XPathExtractor("//channel"))
 75 |     assert extractor.extract(root) == {
 76 |         "title": "Liftoff News",
 77 |         "link": "http://liftoff.msfc.nasa.gov/",
 78 |         "description": "Liftoff to Space Exploration.",
 79 |         "language": "en-us",
 80 |         "publish_date": "Tue, 10 Jun 2003 04:00:00 GMT",
 81 |         "last_build_date": "Tue, 10 Jun 2003 09:41:01 GMT",
 82 |         "docs": "http://blogs.law.harvard.edu/tech/rss",
 83 |         "generator": "Weblog Editor 2.0",
 84 |         "managing_editor": "editor@example.com",
 85 |         "web_master": "webmaster@example.com",
 86 |         "items": [
 87 |             {
 88 |                 "title": "Star City",
 89 |                 "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
 90 |                 "description": 'How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia\'s <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.',
 91 |                 "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT",
 92 |                 "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
 93 |             },
 94 |             {
 95 |                 "title": "",
 96 |                 "link": "",
 97 |                 "description": 'Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st.',
 98 |                 "publish_date": "Fri, 30 May 2003 11:06:42 GMT",
 99 |                 "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
100 |             },
101 |         ],
102 |     }
103 | 
104 | Simplifying Complex Extractor
105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
106 | 
107 | A complex extractor can be simplified
108 | into a simple extractor
109 | by using :meth:`data_extractor.item.Item.simplify`.
110 | 
111 | .. code-block:: python3
112 | 
113 |     from data_extractor import XPathExtractor
114 | 
115 |     complex_extractor = ChannelItem(XPathExtractor("//channel/item"))
116 |     simple_extractor = complex_extractor.simplify()
117 | 
118 |     complex_extractor.is_many = False
119 |     assert simple_extractor.extract_first(root) == complex_extractor.extract(root)
120 | 
121 |     complex_extractor.is_many = True
122 |     assert simple_extractor.extract(root) == complex_extractor.extract(root)
123 | 
124 | Set Parameter Extractor Be None To Extract Data From Root
125 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
126 | 
127 | .. code-block:: python3
128 | 
129 |     from data_extractor import Item, Field, JSONExtractor
130 | 
131 | 
132 |     class User(Item):
133 |         nickname = Field(JSONExtractor("name"))
134 |         age = Field(JSONExtractor("age"))
135 |         raw = Field()
136 | 
137 | 
138 |     assert User().extract({"name": "john", "age": 17, "gender": "male"}) == {
139 |         "nickname": "john",
140 |         "age": 17,
141 |         "raw": {"name": "john", "age": 17, "gender": "male"},
142 |     }
143 | 
144 | Avoid Field Overwrites Property Or Method
145 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
146 | 
147 | To avoid a complex extractor's field overwriting its property or method,
148 | use the parameter **name** of the complex extractor.
149 | 
150 | .. code-block:: python3
151 | 
152 |     from data_extractor import Field, Item, JSONExtractor
153 | 
154 | 
155 |     class User(Item):
156 |         name_ = Field(JSONExtractor("name"), name="name")
157 | 
158 | 
159 |     assert User().extract({"name": "john", "age": 17}) == {"name": "john"}
160 | 


--------------------------------------------------------------------------------
/docs/source/howto/json.rst:
--------------------------------------------------------------------------------
 1 | =================
 2 | Extract JSON Data
 3 | =================
 4 | 
 5 | Extracting data from JSON is powered by
 6 | python-jsonpath-rw_ and python-jsonpath-rw-ext_,
 7 | which support JSONPath_.
 8 | Alternatively, use a new JSONPath syntax for extracting
 9 | by installing the optional dependency jsonpath-extractor_.
10 | 
11 | Run the commands below to install the optional dependencies.
12 | 
13 | .. code-block:: shell
14 | 
15 |     pip install "data_extractor[jsonpath-rw]"
16 |     pip install "data_extractor[jsonpath-rw-ext]"
17 | 
18 |     pip install "data_extractor[jsonpath-extractor]"
19 | 
20 | Use the :class:`data_extractor.json.JSONExtractor` to extract data.
21 | 
22 | .. code-block:: python3
23 | 
24 |     import json
25 |     from data_extractor import JSONExtractor
26 | 
27 |     text = '{"foo": [{"baz": 1}, {"baz": 2}]}'
28 |     data = json.loads(text)
29 |     assert JSONExtractor("foo[*].baz").extract(data) == [1, 2]
30 | 
31 | .. _python-jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw
32 | .. _python-jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/
33 | .. _JSONPath: https://goessner.net/articles/JsonPath/
34 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath
35 | 
36 | Change :data:`json_extractor_backend`
37 | to use a specific JSON extractor backend.
38 | See APIs ref of :class:`data_extractor.json.JSONExtractor`
39 | for additional details.
40 | 


--------------------------------------------------------------------------------
/docs/source/howto/lxml.rst:
--------------------------------------------------------------------------------
 1 | ========================
 2 | Extract HTML or XML Data
 3 | ========================
 4 | 
 5 | Extracting data from HTML or XML is
 6 | powered by lxml_ to support XPath_, and by cssselect_ to support CSS-Selectors_.
 7 | 
 8 | Run the commands below to install the optional dependencies.
 9 | 
10 | .. code-block:: shell
11 | 
12 |     pip install "data_extractor[lxml]"  # For using XPath
13 |     pip install "data_extractor[cssselect]"  # For using CSS-Selectors
14 | 
15 | Download the RSS sample file for demonstration.
16 | 
17 | .. code-block:: shell
18 | 
19 |     wget http://www.rssboard.org/files/sample-rss-2.xml
20 | 
21 | Parse it into :class:`data_extractor.lxml.Element`.
22 | 
23 | .. code-block:: python3
24 | 
25 |     from pathlib import Path
26 | 
27 |     from lxml.etree import fromstring
28 | 
29 |     root = fromstring(Path("sample-rss-2.xml").read_text())
30 | 
31 | Using :class:`data_extractor.lxml.XPathExtractor` to extract rss channel title.
32 | 
33 | .. code-block:: python3
34 | 
35 |     from data_extractor import XPathExtractor
36 | 
37 |     assert XPathExtractor("//channel/title/text()").extract_first(root) == "Liftoff News"
38 | 
39 | Using :class:`data_extractor.lxml.TextCSSExtractor`
40 | to extract all rss item links.
41 | 
42 | .. code-block:: python3
43 | 
44 |     from data_extractor import TextCSSExtractor
45 | 
46 |     assert TextCSSExtractor("item>link").extract(root) == [
47 |         "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
48 |         "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
49 |         "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
50 |     ]
51 | 
52 | Using :class:`data_extractor.lxml.AttrCSSExtractor` to extract rss version.
53 | 
54 | .. code-block:: python3
55 | 
56 |     from data_extractor import AttrCSSExtractor
57 | 
58 |     assert AttrCSSExtractor("rss", attr="version").extract_first(root) == "2.0"
59 | 
60 | .. _lxml: https://lxml.de
61 | .. _XPath: https://www.w3.org/TR/xpath-10/
62 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/
63 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/
64 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | ==========================================
 2 | Welcome to Data-Extractor's documentation!
 3 | ==========================================
 4 | 
 5 | .. include:: readme.rst
 6 |     :start-line: 4
 7 | 
 8 | Contents
 9 | ========
10 | 
11 | .. toctree::
12 |     :maxdepth: 4
13 | 
14 |     quickstarts
15 |     howto/index
16 |     contributing
17 |     api_reference
18 |     changelog
19 | 
20 | 
21 | Indices and tables
22 | ==================
23 | 
24 | * :ref:`genindex`
25 | * :ref:`search`
26 | 


--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
 1 | ============
 2 | Installation
 3 | ============
 4 | 
 5 | Install the stable version from PyPI.
 6 | 
 7 | .. code-block:: shell
 8 | 
 9 |     pip install "data-extractor[jsonpath-extractor]"  # for extracting JSON data
10 |     pip install "data-extractor[lxml]"  # for extracting HTML data
11 | 
12 | Or install the latest version from GitHub.
13 | 
14 | .. code-block:: shell
15 | 
16 |     pip install "data-extractor[jsonpath-extractor] @ git+https://github.com/linw1995/data_extractor.git@master"
17 | 
18 | Extract JSON data
19 | ~~~~~~~~~~~~~~~~~
20 | 
21 | Extracting JSON data is currently supported with the optional dependencies below.
22 | 
23 | - jsonpath-extractor_
24 | - jsonpath-rw_
25 | - jsonpath-rw-ext_
26 | 
27 | .. _jsonpath-extractor: https://github.com/linw1995/jsonpath
28 | .. _jsonpath-rw: https://github.com/kennknowles/python-jsonpath-rw
29 | .. _jsonpath-rw-ext: https://python-jsonpath-rw-ext.readthedocs.org/en/latest/
30 | 
31 | Install one of them to extract JSON data.
32 | 
33 | Extract HTML(XML) data
34 | ~~~~~~~~~~~~~~~~~~~~~~
35 | 
36 | Extracting HTML (XML) data is currently supported with the optional dependencies below.
37 | 
38 | - lxml_ for using XPath_
39 | - cssselect_ for using CSS-Selectors_
40 | 
41 | .. _lxml: https://lxml.de/
42 | .. _XPath: https://www.w3.org/TR/xpath-10/
43 | .. _cssselect: https://cssselect.readthedocs.io/en/latest/
44 | .. _CSS-Selectors: https://www.w3.org/TR/selectors-3/
45 | 


--------------------------------------------------------------------------------
/docs/source/quickstarts.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Quickstarts
 3 | ===========
 4 | 
 5 | Installation
 6 | ~~~~~~~~~~~~
 7 | 
 8 | .. include:: installation.rst
 9 |     :start-line: 4
10 | 
11 | Usage
12 | ~~~~~
13 | 
14 | .. code-block:: python3
15 | 
16 |     from data_extractor import Field, Item, JSONExtractor
17 | 
18 | 
19 |     class Count(Item):
20 |         followings = Field(JSONExtractor("countFollowings"))
21 |         fans = Field(JSONExtractor("countFans"))
22 | 
23 | 
24 |     class User(Item):
25 |         name_ = Field(JSONExtractor("name"), name="name")
26 |         age = Field(JSONExtractor("age"), default=17)
27 |         count = Count()
28 | 
29 | 
30 |     assert User(JSONExtractor("data.users[*]"), is_many=True).extract(
31 |         {
32 |             "data": {
33 |                 "users": [
34 |                     {
35 |                         "name": "john",
36 |                         "age": 19,
37 |                         "countFollowings": 14,
38 |                         "countFans": 212,
39 |                     },
40 |                     {
41 |                         "name": "jack",
42 |                         "description": "",
43 |                         "countFollowings": 54,
44 |                         "countFans": 312,
45 |                     },
46 |                 ]
47 |             }
48 |         }
49 |     ) == [
50 |         {"name": "john", "age": 19, "count": {"followings": 14, "fans": 212}},
51 |         {"name": "jack", "age": 17, "count": {"followings": 54, "fans": 312}},
52 |     ]
53 | 


--------------------------------------------------------------------------------
/docs/source/readme.rst:
--------------------------------------------------------------------------------
 1 | ==============
 2 | Data Extractor
 3 | ==============
 4 | 
 5 | |license| |Pypi Status| |Python version| |Package version| |PyPI - Downloads|
 6 | |GitHub last commit| |Code style: black| |Build Status| |codecov|
 7 | |Documentation Status| |PDM managed|
 8 | 
 9 | Combine **XPath**, **CSS Selectors** and **JSONPath** for Web data extracting.
10 | 
11 | Quickstarts
12 | <<<<<<<<<<<
13 | 
14 | .. include:: quickstarts.rst
15 |     :start-line: 4
16 | 
17 | Changelog
18 | <<<<<<<<<
19 | 
20 | .. include:: changelog.rst
21 |     :start-line: 4
22 |     :end-before: .. include:: history.rst
23 | 
24 | Contributing
25 | <<<<<<<<<<<<
26 | 
27 | .. include:: contributing.rst
28 |     :start-line: 4
29 | 
30 | .. |license| image:: https://img.shields.io/github/license/linw1995/data_extractor.svg
31 |     :target: https://github.com/linw1995/data_extractor/blob/master/LICENSE
32 | 
33 | .. |Pypi Status| image:: https://img.shields.io/pypi/status/data_extractor.svg
34 |     :target: https://pypi.org/project/data_extractor
35 | 
36 | .. |Python version| image:: https://img.shields.io/pypi/pyversions/data_extractor.svg
37 |     :target: https://pypi.org/project/data_extractor
38 | 
39 | .. |Package version| image:: https://img.shields.io/pypi/v/data_extractor.svg
40 |     :target: https://pypi.org/project/data_extractor
41 | 
42 | .. |PyPI - Downloads| image:: https://img.shields.io/pypi/dm/data-extractor.svg
43 |     :target: https://pypi.org/project/data_extractor
44 | 
45 | .. |GitHub last commit| image:: https://img.shields.io/github/last-commit/linw1995/data_extractor.svg
46 |     :target: https://github.com/linw1995/data_extractor
47 | 
48 | .. |Code style: black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
49 |     :target: https://github.com/ambv/black
50 | 
51 | .. |Build Status| image:: https://github.com/linw1995/data_extractor/workflows/Lint&Test/badge.svg
52 |     :target: https://github.com/linw1995/data_extractor/actions?query=workflow%3ALint%26Test
53 | 
54 | .. |codecov| image:: https://codecov.io/gh/linw1995/data_extractor/branch/master/graph/badge.svg
55 |     :target: https://codecov.io/gh/linw1995/data_extractor
56 | 
57 | .. |Documentation Status| image:: https://readthedocs.org/projects/data-extractor/badge/?version=latest
58 |     :target: https://data-extractor.readthedocs.io/en/latest/?badge=latest
59 | 
60 | .. |PDM managed| image:: https://img.shields.io/badge/pdm-managed-blueviolet
61 |     :target: https://pdm.fming.dev
62 | 


--------------------------------------------------------------------------------
/flake.lock:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nodes": {
  3 |     "dream2nix": {
  4 |       "inputs": {
  5 |         "nixpkgs": "nixpkgs",
  6 |         "purescript-overlay": "purescript-overlay",
  7 |         "pyproject-nix": "pyproject-nix"
  8 |       },
  9 |       "locked": {
 10 |         "lastModified": 1728585693,
 11 |         "narHash": "sha256-rhx5SYpIkPu7d+rjF9FGGBVxS0BwAEkmYIsJg2a3E20=",
 12 |         "owner": "nix-community",
 13 |         "repo": "dream2nix",
 14 |         "rev": "c6935471f7e1a9e190aaa9ac9823dca34e00d92a",
 15 |         "type": "github"
 16 |       },
 17 |       "original": {
 18 |         "owner": "nix-community",
 19 |         "repo": "dream2nix",
 20 |         "type": "github"
 21 |       }
 22 |     },
 23 |     "flake-compat": {
 24 |       "flake": false,
 25 |       "locked": {
 26 |         "lastModified": 1696426674,
 27 |         "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
 28 |         "owner": "edolstra",
 29 |         "repo": "flake-compat",
 30 |         "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
 31 |         "type": "github"
 32 |       },
 33 |       "original": {
 34 |         "owner": "edolstra",
 35 |         "repo": "flake-compat",
 36 |         "type": "github"
 37 |       }
 38 |     },
 39 |     "nixpkgs": {
 40 |       "locked": {
 41 |         "lastModified": 1728538411,
 42 |         "narHash": "sha256-f0SBJz1eZ2yOuKUr5CA9BHULGXVSn6miBuUWdTyhUhU=",
 43 |         "owner": "NixOS",
 44 |         "repo": "nixpkgs",
 45 |         "rev": "b69de56fac8c2b6f8fd27f2eca01dcda8e0a4221",
 46 |         "type": "github"
 47 |       },
 48 |       "original": {
 49 |         "owner": "NixOS",
 50 |         "ref": "nixpkgs-unstable",
 51 |         "repo": "nixpkgs",
 52 |         "type": "github"
 53 |       }
 54 |     },
 55 |     "purescript-overlay": {
 56 |       "inputs": {
 57 |         "flake-compat": "flake-compat",
 58 |         "nixpkgs": [
 59 |           "dream2nix",
 60 |           "nixpkgs"
 61 |         ],
 62 |         "slimlock": "slimlock"
 63 |       },
 64 |       "locked": {
 65 |         "lastModified": 1724504251,
 66 |         "narHash": "sha256-TIw+sac0NX0FeAneud+sQZT+ql1G/WEb7/Vb436rUXM=",
 67 |         "owner": "thomashoneyman",
 68 |         "repo": "purescript-overlay",
 69 |         "rev": "988b09676c2a0e6a46dfa3589aa6763c90476b8a",
 70 |         "type": "github"
 71 |       },
 72 |       "original": {
 73 |         "owner": "thomashoneyman",
 74 |         "repo": "purescript-overlay",
 75 |         "type": "github"
 76 |       }
 77 |     },
 78 |     "pyproject-nix": {
 79 |       "flake": false,
 80 |       "locked": {
 81 |         "lastModified": 1702448246,
 82 |         "narHash": "sha256-hFg5s/hoJFv7tDpiGvEvXP0UfFvFEDgTdyHIjDVHu1I=",
 83 |         "owner": "davhau",
 84 |         "repo": "pyproject.nix",
 85 |         "rev": "5a06a2697b228c04dd2f35659b4b659ca74f7aeb",
 86 |         "type": "github"
 87 |       },
 88 |       "original": {
 89 |         "owner": "davhau",
 90 |         "ref": "dream2nix",
 91 |         "repo": "pyproject.nix",
 92 |         "type": "github"
 93 |       }
 94 |     },
 95 |     "root": {
 96 |       "inputs": {
 97 |         "dream2nix": "dream2nix",
 98 |         "nixpkgs": [
 99 |           "dream2nix",
100 |           "nixpkgs"
101 |         ]
102 |       }
103 |     },
104 |     "slimlock": {
105 |       "inputs": {
106 |         "nixpkgs": [
107 |           "dream2nix",
108 |           "purescript-overlay",
109 |           "nixpkgs"
110 |         ]
111 |       },
112 |       "locked": {
113 |         "lastModified": 1688756706,
114 |         "narHash": "sha256-xzkkMv3neJJJ89zo3o2ojp7nFeaZc2G0fYwNXNJRFlo=",
115 |         "owner": "thomashoneyman",
116 |         "repo": "slimlock",
117 |         "rev": "cf72723f59e2340d24881fd7bf61cb113b4c407c",
118 |         "type": "github"
119 |       },
120 |       "original": {
121 |         "owner": "thomashoneyman",
122 |         "repo": "slimlock",
123 |         "type": "github"
124 |       }
125 |     }
126 |   },
127 |   "root": "root",
128 |   "version": 7
129 | }
130 | 


--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
 1 | {
 2 |   inputs = {
 3 |     dream2nix.url = "github:nix-community/dream2nix";
 4 |     nixpkgs.follows = "dream2nix/nixpkgs";
 5 |   };
 6 | 
 7 |   outputs = {
 8 |     self,
 9 |     dream2nix,
10 |     nixpkgs,
11 |   }: let
12 |     eachSystem = nixpkgs.lib.genAttrs [
13 |       "aarch64-darwin"
14 |       "aarch64-linux"
15 |       "x86_64-darwin"
16 |       "x86_64-linux"
17 |     ];
18 |   in {
19 |     packages = eachSystem (system: {
20 |       default = dream2nix.lib.evalModules {
21 |         packageSets.nixpkgs = import nixpkgs {inherit system;};
22 |         modules = [
23 |           ./default.nix
24 |           {
25 |             paths.projectRoot = ./.;
26 |             paths.projectRootFile = "flake.nix";
27 |             paths.package = ./.;
28 |           }
29 |         ];
30 |       };
31 |     });
32 |     devShells = eachSystem (system: let
33 |       pkgs = import nixpkgs {inherit system;};
34 |     in {
35 |       default = pkgs.mkShell {
36 |         inputsFrom = [
37 |           self.packages.${system}.default.devShell
38 |         ];
39 | 
40 |         packages = with pkgs; [
41 |           pre-commit
42 |           python312Packages.nox
43 | 
44 |           python39
45 |           python310
46 |           python311
47 |           python312
48 |           python313
49 |         ];
50 |       };
51 |     });
52 |   };
53 | }
54 | 


--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
  1 | # Standard Library
  2 | import os
  3 | 
  4 | from pathlib import Path
  5 | 
  6 | # Third Party Library
  7 | import nox
  8 | 
  9 | nox.options.stop_on_first_error = True
 10 | 
 11 | 
 12 | pythons = ["3.10", "3.11", "3.12", "3.13"]
 13 | 
 14 | os.environ.update({"PDM_IGNORE_SAVED_PYTHON": "1"})
 15 | os.environ.pop("PYTHONPATH", None)
 16 | 
 17 | 
 18 | def venv_setup_on_create(session, install):
 19 |     cwd = os.getcwd()
 20 |     session.cd(session.create_tmp())
 21 |     if session.run(
 22 |         "python", "-Esc", "import data_extractor", success_codes=(1, 0), silent=True
 23 |     ):
 24 |         install(session)
 25 |     session.cd(cwd)
 26 | 
 27 | 
 28 | @nox.session(python=pythons, venv_backend="venv")
 29 | @nox.parametrize(
 30 |     "extractor_backend",
 31 |     [
 32 |         None,
 33 |         "jsonpath-extractor",
 34 |         "jsonpath-rw",
 35 |         "jsonpath-rw-ext",
 36 |         "lxml",
 37 |         "cssselect",
 38 |     ],
 39 | )
 40 | def coverage_test(session, extractor_backend):
 41 |     venv_setup_on_create(
 42 |         session,
 43 |         lambda s: s.run(
 44 |             "pdm",
 45 |             "sync",
 46 |             "-G",
 47 |             "test",
 48 |             *(("-G", extractor_backend) if extractor_backend else tuple()),
 49 |             external=True,
 50 |         ),
 51 |     )
 52 |     session.run(
 53 |         "pytest",
 54 |         "-vv",
 55 |         "--cov=data_extractor",
 56 |         "--cov-append",
 57 |         "--ignore",
 58 |         "tests/typesafety",
 59 |         *session.posargs,
 60 |     )
 61 | 
 62 | 
 63 | @nox.session(python=pythons, venv_backend="venv")
 64 | def coverage_report(session):
 65 |     venv_setup_on_create(
 66 |         session,
 67 |         lambda s: s.run("pdm", "sync", "-G", "test", external=True),
 68 |     )
 69 |     session.run("coverage", "report")
 70 |     session.run("coverage", "xml")
 71 |     session.run("coverage", "html")
 72 |     session.log(
 73 |         f">> open file:/{(Path() / 'htmlcov/index.html').absolute()} to see coverage"
 74 |     )
 75 | 
 76 | 
 77 | @nox.session(python=pythons, venv_backend="venv")
 78 | def test_mypy_plugin(session):
 79 |     venv_setup_on_create(
 80 |         session,
 81 |         lambda s: s.run("pdm", "sync", "-G", "test-mypy-plugin", external=True),
 82 |     )
 83 | 
 84 |     session.run(
 85 |         "pytest",
 86 |         "-vv",
 87 |         "--cov=data_extractor/contrib/mypy",
 88 |         "--cov-append",
 89 |         "--mypy-same-process",
 90 |         "--mypy-ini-file=./tests/mypy.ini",
 91 |         "tests/typesafety",
 92 |         *(session.posargs if session.posargs else tuple()),
 93 |     )
 94 | 
 95 | 
 96 | @nox.session(python=pythons[-1:], venv_backend="venv")
 97 | def build_readme(session):
 98 |     venv_setup_on_create(
 99 |         session,
100 |         lambda s: s.run("pdm", "sync", "-G", "build_readme", external=True),
101 |     )
102 |     session.run(
103 |         "python", "scripts/build_readme.py", "README.template.rst", "README.rst"
104 |     )
105 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "data_extractor"
 3 | authors = [{ name = "林玮 (Jade Lin)", email = "linw1995@icloud.com" }]
 4 | description = "Combine XPath, CSS Selectors and JSONPath for Web data extracting."
 5 | readme = "README.rst"
 6 | classifiers = [
 7 |   "Intended Audience :: Developers",
 8 |   "License :: OSI Approved :: MIT License",
 9 |   "Programming Language :: Python",
10 |   "Programming Language :: Python :: 3",
11 |   "Programming Language :: Python :: 3.10",
12 |   "Programming Language :: Python :: 3.11",
13 |   "Programming Language :: Python :: 3.12",
14 |   "Programming Language :: Python :: 3.13",
15 |   "Development Status :: 5 - Production/Stable",
16 |   "Operating System :: POSIX",
17 |   "Operating System :: MacOS :: MacOS X",
18 |   "Operating System :: Microsoft :: Windows",
19 | ]
20 | keywords = [
21 |   "data-extractor",
22 |   "data-extraction",
23 |   "xpath",
24 |   "css-selectors",
25 |   "jsonpath",
26 | ]
27 | dependencies = []
28 | requires-python = ">=3.10"
29 | license = { text = "MIT" }
30 | dynamic = ["version"]
31 | 
32 | [project.urls]
33 | homepage = "https://github.com/linw1995/data_extractor"
34 | repository = "https://github.com/linw1995/data_extractor"
35 | documentation = "https://data-extractor.readthedocs.io/en/latest/"
36 | 
37 | [project.optional-dependencies]
38 | lxml = ["lxml >= 4.3, < 6"]
39 | cssselect = ["lxml >= 4.3, < 6", "cssselect >= 1.0.3, < 2"]
40 | jsonpath-extractor = ["jsonpath-extractor >= 0.5, < 0.9"]
41 | jsonpath-rw = ["jsonpath-rw >= 1.4, < 2"]
42 | jsonpath-rw-ext = ["jsonpath-rw >= 1.4, < 2", "jsonpath-rw-ext >= 1.2, < 2"]
43 | 
44 | [build-system]
45 | requires = ["pdm-pep517[setuptools]"]
46 | build-backend = "pdm.pep517.api"
47 | 
48 | [tool.commitizen]
49 | name = "cz_conventional_commits"
50 | version = "0.9.0"
51 | tag_format = "v$version"
52 | 
53 | [tool.pdm]
54 | includes = [
55 |   "data_extractor/*.py",
56 |   "data_extractor/py.typed",
57 |   "data_extractor/contrib/",
58 | ]
59 | version = { use_scm = true }
60 | 
61 | [tool.pdm.dev-dependencies]
62 | docs = [
63 |   "lxml >= 4.3, < 6",
64 |   "cssselect >= 1.0.3, < 2",
65 |   "jsonpath-extractor >= 0.5, < 0.9",
66 |   "jsonpath-rw >= 1.4, < 2",
67 |   "jsonpath-rw-ext >= 1.2, < 2",
68 |   "sphinx ~= 7.4",
69 | ]
70 | build_readme = ["click >= 7.1.2, < 8", "docutils >= 0.16", "pygments ~= 2.8"]
71 | test = ["pytest >= 6, < 8", "pytest-cov >= 2.7.1, < 3"]
72 | test-mypy-plugin = [
73 |   "pytest >= 6, < 8",
74 |   "pytest-cov >= 2.7.1, < 3",
75 |   "pytest-mypy-plugins ~= 1.6",
76 |   "mypy~=0.930",
77 | ]
78 | 
79 | [[tool.pdm.source]]
80 | name = "pypi"
81 | url = "https://pypi.org/simple"
82 | verify_ssl = true
83 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | # https://github.com/pytest-dev/pytest/issues/3062
2 | # Don't move the settings below into setup.cfg
3 | [pytest]
4 | testpaths = ./tests
5 | log_format = %(asctime)s - %(name)s - %(levelname)s - %(message)s
6 | xfail_strict=true
7 | 


--------------------------------------------------------------------------------
/scripts/build_readme.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | .. _issues-172: https://github.com/github/markup/issues/172
 4 | 
 5 | This script inlines ``include`` directives, because GitHub markup does not render them. (issues-172_)
 6 | """
 7 | 
 8 | # Standard Library
 9 | from pathlib import Path
10 | from unittest import mock
11 | 
12 | # Third Party Library
13 | import click
14 | import docutils.nodes
15 | import docutils.parsers.rst
16 | import docutils.parsers.rst.directives.misc
17 | import docutils.statemachine
18 | import docutils.utils
19 | 
20 | 
@click.command()
@click.argument("source_file")
@click.argument("target_file")
def build_readme(source_file, target_file):
    """Write SOURCE_FILE to TARGET_FILE with its include directives inlined."""
    # Imported explicitly: the module-level imports never import
    # ``docutils.frontend``, so using it as an attribute of ``docutils`` only
    # worked when another docutils module happened to import it first.
    from docutils.frontend import OptionParser

    old_string2lines = docutils.statemachine.string2lines
    old_run = docutils.parsers.rst.directives.misc.Include.run
    # ``text`` accumulates the flattened document. ``target_text`` holds the
    # source text of the include directive currently being expanded, or None.
    text = ""
    target_text = None

    def string2lines(*args, **kwargs):
        # Record every raw text passed to docutils' string2lines.
        # NOTE(review): this presumably happens once for the main document and
        # once per included file -- confirm against the docutils sources.
        nonlocal text, target_text
        if target_text is not None:
            # An include directive is being run: substitute its source text
            # with the content that is about to be split into lines.
            text = text.replace(target_text, args[0])
            target_text = None
        else:
            text += args[0]

        return old_string2lines(*args, **kwargs)

    def run(self):
        # Remember the directive's own source text so the next string2lines
        # call can replace it with the included content.
        nonlocal target_text
        target_text = self.block_text
        return old_run(self)

    source_file_path: Path = Path.cwd() / source_file
    target_file_path: Path = Path.cwd() / target_file

    # Both patches are only needed while the document is being parsed.
    with (
        mock.patch.object(docutils.statemachine, "string2lines", string2lines),
        mock.patch.object(docutils.parsers.rst.directives.misc.Include, "run", run),
    ):
        parser = docutils.parsers.rst.Parser()
        default_settings = OptionParser(
            components=(docutils.parsers.rst.Parser,)
        ).get_default_values()
        document = docutils.utils.new_document(source_file_path.name, default_settings)
        parser.parse(source_file_path.read_text(encoding="utf-8"), document)

    # Normalise to exactly one trailing newline.
    text = text.rstrip() + "\n"

    # Leave the target untouched when it already has the expected content.
    if (
        target_file_path.exists()
        and target_file_path.read_text(encoding="utf-8") == text
    ):
        return

    target_file_path.write_text(text, encoding="utf-8")


if __name__ == "__main__":
    build_readme()
71 | 


--------------------------------------------------------------------------------
/scripts/export_requirements_txt.py:
--------------------------------------------------------------------------------
 1 | # Standard Library
 2 | import enum
 3 | import shlex
 4 | import subprocess
 5 | 
 6 | from pathlib import Path
 7 | 
 8 | Format = enum.Enum("Format", "requirements setuppy")
 9 | BASE_DIR = Path(__file__).parent / "requirements"
10 | 
11 | 
def fix_end_of_file(text):
    """Return ``text`` with its trailing whitespace replaced by one newline."""
    stripped = text.rstrip()
    return f"{stripped}\n"
14 | 
15 | 
def pdm_export(args, filename, format: Format):
    """Run ``pdm export`` and store its output in ``filename``.

    :param args: extra command-line arguments for ``pdm export``, one list
        item per argument.
    :param filename: path of the file to create or update.
    :param format: export format; its name is passed to ``pdm export -f``.
    :raises subprocess.CalledProcessError: if ``pdm export`` exits non-zero.
    :raises RuntimeError: if ``filename`` did not exist before this call. The
        file is still written first.
    """
    # Pass the arguments as a list instead of joining and re-splitting a
    # string, so arguments containing spaces or quotes reach pdm unchanged.
    command = ["pdm", "export", "-f", format.name, *args]
    output = subprocess.check_output(command, encoding="utf-8")
    output = fix_end_of_file(output)
    if format is Format.setuppy:
        output = "\n".join(
            ['# This a dummy setup.py to enable GitHub "Used By" stats', output]
        )
    p = Path(filename)
    p.parent.mkdir(parents=True, exist_ok=True)
    is_new = not p.exists()
    # Read and write with the same encoding the command output was decoded
    # with, rather than the platform default. Only rewrite on a real change.
    if is_new or p.read_text(encoding="utf-8") != output:
        p.write_text(output, encoding="utf-8")
    if is_new:
        # NOTE(review): presumably raised so the caller notices that a file
        # was newly created -- confirm the intent.
        raise RuntimeError("Create a new file")
33 | 
34 | 
# (extra ``pdm export`` arguments, output file name), exported in this order.
_EXPORTS = (
    (["--prod"], "requirements-mini.txt"),
    (["--prod", "-G:all"], "requirements.txt"),
    (["-G:all"], "requirements-dev.txt"),
    (["-G", "docs"], "requirements-docs.txt"),
)

for _export_args, _target_name in _EXPORTS:
    pdm_export(
        args=_export_args,
        filename=BASE_DIR / _target_name,
        format=Format.requirements,
    )
# pdm_export(args=[], filename=BASE_DIR / "setup.py", format=Format.setuppy)
59 | 


--------------------------------------------------------------------------------
/scripts/requirements/requirements-docs.txt:
--------------------------------------------------------------------------------
  1 | # This file is @generated by PDM.
  2 | # Please do not edit it manually.
  3 | 
  4 | alabaster==0.7.16 \
  5 |     --hash=sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65 \
  6 |     --hash=sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92
  7 | babel==2.16.0 \
  8 |     --hash=sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b \
  9 |     --hash=sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316
 10 | certifi==2024.12.14 \
 11 |     --hash=sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56 \
 12 |     --hash=sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db
 13 | charset-normalizer==3.4.0 \
 14 |     --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \
 15 |     --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \
 16 |     --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \
 17 |     --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \
 18 |     --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \
 19 |     --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \
 20 |     --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \
 21 |     --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \
 22 |     --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \
 23 |     --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \
 24 |     --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \
 25 |     --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \
 26 |     --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \
 27 |     --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \
 28 |     --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \
 29 |     --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \
 30 |     --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \
 31 |     --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \
 32 |     --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \
 33 |     --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \
 34 |     --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \
 35 |     --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \
 36 |     --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \
 37 |     --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \
 38 |     --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \
 39 |     --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \
 40 |     --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \
 41 |     --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \
 42 |     --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \
 43 |     --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \
 44 |     --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \
 45 |     --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \
 46 |     --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \
 47 |     --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \
 48 |     --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \
 49 |     --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \
 50 |     --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \
 51 |     --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \
 52 |     --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \
 53 |     --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \
 54 |     --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \
 55 |     --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \
 56 |     --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \
 57 |     --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \
 58 |     --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \
 59 |     --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \
 60 |     --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \
 61 |     --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \
 62 |     --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \
 63 |     --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \
 64 |     --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \
 65 |     --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \
 66 |     --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \
 67 |     --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \
 68 |     --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \
 69 |     --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \
 70 |     --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \
 71 |     --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \
 72 |     --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \
 73 |     --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \
 74 |     --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \
 75 |     --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482
 76 | colorama==0.4.6; sys_platform == "win32" \
 77 |     --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
 78 |     --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
 79 | cssselect==1.2.0 \
 80 |     --hash=sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc \
 81 |     --hash=sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e
 82 | decorator==5.1.1 \
 83 |     --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \
 84 |     --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186
 85 | docutils==0.21.2 \
 86 |     --hash=sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f \
 87 |     --hash=sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2
 88 | idna==3.10 \
 89 |     --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
 90 |     --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
 91 | imagesize==1.4.1 \
 92 |     --hash=sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b \
 93 |     --hash=sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a
 94 | jinja2==3.1.5 \
 95 |     --hash=sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb \
 96 |     --hash=sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb
 97 | jsonpath-extractor==0.8.0 \
 98 |     --hash=sha256:08c53808f981fbd27f3488687940607b6213da38cc8c67e56cb41610acd53783 \
 99 |     --hash=sha256:e82fcd6ae89123eb5ea09a2afb76d2884346369d0cd0c9509efff65c49fd15b6
100 | jsonpath-rw==1.4.0 \
101 |     --hash=sha256:05c471281c45ae113f6103d1268ec7a4831a2e96aa80de45edc89b11fac4fbec
102 | jsonpath-rw-ext==1.2.2 \
103 |     --hash=sha256:0947e018c4e6d46f9d04c56487793c702eb225fa252891aa4ed41a9ca26f3d84 \
104 |     --hash=sha256:a9e44e803b6d87d135b09d1e5af0db4d4cf97ba62711a80aa51c8c721980a994
105 | lxml==5.3.0 \
106 |     --hash=sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3 \
107 |     --hash=sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002 \
108 |     --hash=sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd \
109 |     --hash=sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832 \
110 |     --hash=sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e \
111 |     --hash=sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30 \
112 |     --hash=sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51 \
113 |     --hash=sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4 \
114 |     --hash=sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4 \
115 |     --hash=sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86 \
116 |     --hash=sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8 \
117 |     --hash=sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f \
118 |     --hash=sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03 \
119 |     --hash=sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e \
120 |     --hash=sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99 \
121 |     --hash=sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7 \
122 |     --hash=sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d \
123 |     --hash=sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22 \
124 |     --hash=sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492 \
125 |     --hash=sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b \
126 |     --hash=sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f \
127 |     --hash=sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a \
128 |     --hash=sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a \
129 |     --hash=sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4 \
130 |     --hash=sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442 \
131 |     --hash=sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b \
132 |     --hash=sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c \
133 |     --hash=sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1 \
134 |     --hash=sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be \
135 |     --hash=sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367 \
136 |     --hash=sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e \
137 |     --hash=sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16 \
138 |     --hash=sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d \
139 |     --hash=sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83 \
140 |     --hash=sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba \
141 |     --hash=sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763 \
142 |     --hash=sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff \
143 |     --hash=sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b \
144 |     --hash=sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c \
145 |     --hash=sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8 \
146 |     --hash=sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f \
147 |     --hash=sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a \
148 |     --hash=sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce \
149 |     --hash=sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1 \
150 |     --hash=sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330 \
151 |     --hash=sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18 \
152 |     --hash=sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff \
153 |     --hash=sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c \
154 |     --hash=sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179 \
155 |     --hash=sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080 \
156 |     --hash=sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d \
157 |     --hash=sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32 \
158 |     --hash=sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a \
159 |     --hash=sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79 \
160 |     --hash=sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3 \
161 |     --hash=sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5 \
162 |     --hash=sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f \
163 |     --hash=sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d \
164 |     --hash=sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3 \
165 |     --hash=sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9 \
166 |     --hash=sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957 \
167 |     --hash=sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb \
168 |     --hash=sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656 \
169 |     --hash=sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b \
170 |     --hash=sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d \
171 |     --hash=sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd \
172 |     --hash=sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859 \
173 |     --hash=sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a \
174 |     --hash=sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005 \
175 |     --hash=sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654 \
176 |     --hash=sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80 \
177 |     --hash=sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec \
178 |     --hash=sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7 \
179 |     --hash=sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965 \
180 |     --hash=sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8
181 | markupsafe==3.0.2 \
182 |     --hash=sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4 \
183 |     --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \
184 |     --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \
185 |     --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \
186 |     --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \
187 |     --hash=sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca \
188 |     --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \
189 |     --hash=sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832 \
190 |     --hash=sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b \
191 |     --hash=sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579 \
192 |     --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \
193 |     --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \
194 |     --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \
195 |     --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \
196 |     --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \
197 |     --hash=sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb \
198 |     --hash=sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e \
199 |     --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \
200 |     --hash=sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a \
201 |     --hash=sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d \
202 |     --hash=sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b \
203 |     --hash=sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8 \
204 |     --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \
205 |     --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \
206 |     --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \
207 |     --hash=sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d \
208 |     --hash=sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93 \
209 |     --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \
210 |     --hash=sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158 \
211 |     --hash=sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84 \
212 |     --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \
213 |     --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \
214 |     --hash=sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171 \
215 |     --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \
216 |     --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \
217 |     --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \
218 |     --hash=sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d \
219 |     --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \
220 |     --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \
221 |     --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \
222 |     --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \
223 |     --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \
224 |     --hash=sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798 \
225 |     --hash=sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c \
226 |     --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \
227 |     --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \
228 |     --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \
229 |     --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \
230 |     --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \
231 |     --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430 \
232 |     --hash=sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50
233 | packaging==24.2 \
234 |     --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \
235 |     --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f
236 | pbr==6.1.0 \
237 |     --hash=sha256:788183e382e3d1d7707db08978239965e8b9e4e5ed42669bf4758186734d5f24 \
238 |     --hash=sha256:a776ae228892d8013649c0aeccbb3d5f99ee15e005a4cbb7e61d55a067b28a2a
239 | ply==3.11 \
240 |     --hash=sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3 \
241 |     --hash=sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce
242 | pygments==2.18.0 \
243 |     --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \
244 |     --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a
245 | requests==2.32.3 \
246 |     --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \
247 |     --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6
248 | six==1.17.0 \
249 |     --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
250 |     --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
251 | snowballstemmer==2.2.0 \
252 |     --hash=sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1 \
253 |     --hash=sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a
254 | sphinx==7.4.7 \
255 |     --hash=sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe \
256 |     --hash=sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239
257 | sphinxcontrib-applehelp==2.0.0 \
258 |     --hash=sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1 \
259 |     --hash=sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5
260 | sphinxcontrib-devhelp==2.0.0 \
261 |     --hash=sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad \
262 |     --hash=sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2
263 | sphinxcontrib-htmlhelp==2.1.0 \
264 |     --hash=sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8 \
265 |     --hash=sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9
266 | sphinxcontrib-jsmath==1.0.1 \
267 |     --hash=sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178 \
268 |     --hash=sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8
269 | sphinxcontrib-qthelp==2.0.0 \
270 |     --hash=sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab \
271 |     --hash=sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb
272 | sphinxcontrib-serializinghtml==2.0.0 \
273 |     --hash=sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331 \
274 |     --hash=sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d
275 | tomli==2.2.1; python_version < "3.11" \
276 |     --hash=sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6 \
277 |     --hash=sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd \
278 |     --hash=sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c \
279 |     --hash=sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b \
280 |     --hash=sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8 \
281 |     --hash=sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6 \
282 |     --hash=sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77 \
283 |     --hash=sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff \
284 |     --hash=sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea \
285 |     --hash=sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192 \
286 |     --hash=sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249 \
287 |     --hash=sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee \
288 |     --hash=sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4 \
289 |     --hash=sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98 \
290 |     --hash=sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8 \
291 |     --hash=sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4 \
292 |     --hash=sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281 \
293 |     --hash=sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744 \
294 |     --hash=sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69 \
295 |     --hash=sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13 \
296 |     --hash=sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140 \
297 |     --hash=sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e \
298 |     --hash=sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e \
299 |     --hash=sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc \
300 |     --hash=sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff \
301 |     --hash=sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec \
302 |     --hash=sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2 \
303 |     --hash=sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222 \
304 |     --hash=sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106 \
305 |     --hash=sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272 \
306 |     --hash=sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a \
307 |     --hash=sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7
308 | typing-extensions==3.10.0.2 \
309 |     --hash=sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e \
310 |     --hash=sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34
311 | urllib3==2.3.0 \
312 |     --hash=sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df \
313 |     --hash=sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d
314 | --index-url https://pypi.org/simple
315 | 


--------------------------------------------------------------------------------
/scripts/requirements/requirements-mini.txt:
--------------------------------------------------------------------------------
1 | # This file is @generated by PDM.
2 | # Please do not edit it manually.
3 | 
4 | --index-url https://pypi.org/simple
5 | 


--------------------------------------------------------------------------------
/scripts/requirements/requirements.txt:
--------------------------------------------------------------------------------
  1 | # This file is @generated by PDM.
  2 | # Please do not edit it manually.
  3 | 
  4 | cssselect==1.2.0 \
  5 |     --hash=sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc \
  6 |     --hash=sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e
  7 | decorator==5.1.1 \
  8 |     --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \
  9 |     --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186
 10 | jsonpath-extractor==0.8.0 \
 11 |     --hash=sha256:08c53808f981fbd27f3488687940607b6213da38cc8c67e56cb41610acd53783 \
 12 |     --hash=sha256:e82fcd6ae89123eb5ea09a2afb76d2884346369d0cd0c9509efff65c49fd15b6
 13 | jsonpath-rw==1.4.0 \
 14 |     --hash=sha256:05c471281c45ae113f6103d1268ec7a4831a2e96aa80de45edc89b11fac4fbec
 15 | jsonpath-rw-ext==1.2.2 \
 16 |     --hash=sha256:0947e018c4e6d46f9d04c56487793c702eb225fa252891aa4ed41a9ca26f3d84 \
 17 |     --hash=sha256:a9e44e803b6d87d135b09d1e5af0db4d4cf97ba62711a80aa51c8c721980a994
 18 | lxml==5.3.0 \
 19 |     --hash=sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3 \
 20 |     --hash=sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002 \
 21 |     --hash=sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd \
 22 |     --hash=sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832 \
 23 |     --hash=sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e \
 24 |     --hash=sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30 \
 25 |     --hash=sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51 \
 26 |     --hash=sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4 \
 27 |     --hash=sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4 \
 28 |     --hash=sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86 \
 29 |     --hash=sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8 \
 30 |     --hash=sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f \
 31 |     --hash=sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03 \
 32 |     --hash=sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e \
 33 |     --hash=sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99 \
 34 |     --hash=sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7 \
 35 |     --hash=sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d \
 36 |     --hash=sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22 \
 37 |     --hash=sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492 \
 38 |     --hash=sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b \
 39 |     --hash=sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f \
 40 |     --hash=sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a \
 41 |     --hash=sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a \
 42 |     --hash=sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4 \
 43 |     --hash=sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442 \
 44 |     --hash=sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b \
 45 |     --hash=sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c \
 46 |     --hash=sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1 \
 47 |     --hash=sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be \
 48 |     --hash=sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367 \
 49 |     --hash=sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e \
 50 |     --hash=sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16 \
 51 |     --hash=sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d \
 52 |     --hash=sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83 \
 53 |     --hash=sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba \
 54 |     --hash=sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763 \
 55 |     --hash=sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff \
 56 |     --hash=sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b \
 57 |     --hash=sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c \
 58 |     --hash=sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8 \
 59 |     --hash=sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f \
 60 |     --hash=sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a \
 61 |     --hash=sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce \
 62 |     --hash=sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1 \
 63 |     --hash=sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330 \
 64 |     --hash=sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18 \
 65 |     --hash=sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff \
 66 |     --hash=sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c \
 67 |     --hash=sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179 \
 68 |     --hash=sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080 \
 69 |     --hash=sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d \
 70 |     --hash=sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32 \
 71 |     --hash=sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a \
 72 |     --hash=sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79 \
 73 |     --hash=sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3 \
 74 |     --hash=sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5 \
 75 |     --hash=sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f \
 76 |     --hash=sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d \
 77 |     --hash=sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3 \
 78 |     --hash=sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9 \
 79 |     --hash=sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957 \
 80 |     --hash=sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb \
 81 |     --hash=sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656 \
 82 |     --hash=sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b \
 83 |     --hash=sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d \
 84 |     --hash=sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd \
 85 |     --hash=sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859 \
 86 |     --hash=sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a \
 87 |     --hash=sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005 \
 88 |     --hash=sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654 \
 89 |     --hash=sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80 \
 90 |     --hash=sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec \
 91 |     --hash=sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7 \
 92 |     --hash=sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965 \
 93 |     --hash=sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8
 94 | pbr==6.1.0 \
 95 |     --hash=sha256:788183e382e3d1d7707db08978239965e8b9e4e5ed42669bf4758186734d5f24 \
 96 |     --hash=sha256:a776ae228892d8013649c0aeccbb3d5f99ee15e005a4cbb7e61d55a067b28a2a
 97 | ply==3.11 \
 98 |     --hash=sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3 \
 99 |     --hash=sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce
100 | six==1.17.0 \
101 |     --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
102 |     --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
103 | typing-extensions==3.10.0.2 \
104 |     --hash=sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e \
105 |     --hash=sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34
106 | --index-url https://pypi.org/simple
107 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | max-line-length = 88
 3 | extend-ignore = E203, W503
 4 | 
 5 | [isort]
 6 | profile=black
 7 | lines_between_types=1
 8 | 
 9 | import_heading_stdlib=Standard Library
10 | import_heading_thirdparty=Third Party Library
11 | import_heading_firstparty=First Party Library
12 | import_heading_localfolder=Local Folder
13 | 
14 | [coverage:run]
15 | branch = true
16 | omit =
17 |     site-packages
18 | 
19 | [coverage:report]
20 | precision = 2
21 | # Regexes for lines to exclude from consideration
22 | exclude_lines =
23 |     # Have to re-enable the standard pragma
24 |     pragma: no cover
25 | 
26 |     # Don't complain about missing debug-only code:
27 |     def __repr__
28 |     if self\.debug
29 | 
30 |     # Don't complain if tests don't hit defensive assertion code:
31 |     raise AssertionError
32 |     raise NotImplementedError
33 | 
34 |     # Don't complain if non-runnable code isn't run:
35 |     if 0:
36 |     if __name__ == .__main__.:
37 |     if TYPE_CHECKING:
38 | 
39 |     # type annotations
40 |     @overload
41 | 
42 | 
43 | ignore_errors = True
44 | 
45 | [mypy]
46 | follow_imports = silent
47 | warn_redundant_casts = true
48 | check_untyped_defs = true
49 | disallow_any_generics = false
50 | no_implicit_optional = true
51 | #disallow_untyped_defs = true
52 | #warn_unused_ignores = true
53 | plugins = data_extractor.contrib.mypy:plugin
54 | 
55 | [mypy-lxml.*]
56 | ignore_missing_imports = true
57 | 
58 | [mypy-cssselect.*]
59 | ignore_missing_imports = true
60 | 
61 | [mypy-jsonpath.*]
62 | ignore_missing_imports = true
63 | 
64 | [mypy-jsonpath_rw.*]
65 | ignore_missing_imports = true
66 | 
67 | [mypy-jsonpath_rw_ext.*]
68 | ignore_missing_imports = true
69 | 
70 | [mypy-mypy.*]
71 | ignore_missing_imports = true
72 | 
73 | [mypy-tests.*]
74 | disallow_untyped_defs = false
75 | 
76 | [mypy-pytest.*]
77 | ignore_missing_imports = true
78 | 
79 | [mypy-_pytest.*]
80 | ignore_missing_imports = true
81 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # Standard Library
 2 | import platform
 3 | import sys
 4 | 
 5 | from pathlib import Path
 6 | 
 7 | current_python_version = "%s.%s" % platform.python_version_tuple()[:2]
 8 | 
 9 | # when executing pytest cli, the sys.path will be changed.
10 | # jsonpath-extractor package's module `jsonpath` same as
11 | # the file `jsonpath.py` in f'{sys.prefix}/bin'.
12 | # So need to remove it to avoid import the wrong module.
13 | for p in [
14 |     Path(f"{sys.prefix}/bin/jsonpath.py"),
15 |     Path(f"__pypackages__/{current_python_version}/bin/jsonpath.py"),
16 | ]:
17 |     if p.exists():
18 |         p.unlink()
19 | 
20 | # pdm
21 | 


--------------------------------------------------------------------------------
/tests/assets/sample-rss-2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <rss version="2.0">
 3 |    <channel>
 4 |       <title>Liftoff News</title>
 5 |       <link>http://liftoff.msfc.nasa.gov/</link>
 6 |       <description>Liftoff to Space Exploration.</description>
 7 |       <language>en-us</language>
 8 |       <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
 9 |       <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
10 |       <docs>http://blogs.law.harvard.edu/tech/rss</docs>
11 |       <generator>Weblog Editor 2.0</generator>
12 |       <managingEditor>editor@example.com</managingEditor>
13 |       <webMaster>webmaster@example.com</webMaster>
14 |       <item>
15 |          <title>Star City</title>
16 |          <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
17 |          <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
18 |          <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
19 |          <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
20 |       </item>
21 |       <item>
22 |          <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.</description>
23 |          <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
24 |          <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
25 |       </item>
26 |       <item>
27 |          <title>The Engine That Does More</title>
28 |          <link>http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp</link>
29 |          <description>Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly.  The proposed VASIMR engine would do that.</description>
30 |          <pubDate>Tue, 27 May 2003 08:37:32 GMT</pubDate>
31 |          <guid>http://liftoff.msfc.nasa.gov/2003/05/27.html#item571</guid>
32 |       </item>
33 |       <item>
34 |          <title>Astronauts' Dirty Laundry</title>
35 |          <link>http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp</link>
36 |          <description>Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them.  Instead, astronauts have other options.</description>
37 |          <pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>
38 |          <guid>http://liftoff.msfc.nasa.gov/2003/05/20.html#item570</guid>
39 |       </item>
40 |    </channel>
41 | </rss>
42 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | # Standard Library
 2 | import importlib.util
 3 | 
 4 | from unittest import mock
 5 | 
 6 | # Third Party Library
 7 | import pytest
 8 | 
 9 | # First Party Library
10 | import data_extractor.json
11 | import data_extractor.utils
12 | 
13 | 
14 | @pytest.fixture(
15 |     params=[
16 |         (
17 |             "jsonpath-extractor",
18 |             "jsonpath",
19 |             data_extractor.json.JSONPathExtractor,
20 |         ),
21 |         ("jsonpath-rw", "jsonpath_rw", data_extractor.json.JSONPathRWExtractor),
22 |         (
23 |             "jsonpath-rw-ext",
24 |             "jsonpath_rw_ext",
25 |             data_extractor.json.JSONPathRWExtExtractor,
26 |         ),
27 |     ],
28 |     ids=lambda r: r[1] if r[1] else f"Missing {r[0]!r}",
29 | )
30 | def json_extractor_backend(request):
31 |     package_name, module_name, backend_cls = request.param
32 |     if not importlib.util.find_spec(module_name):
33 |         pytest.skip(f"missing {package_name!r}")
34 |         return
35 | 
36 |     data_extractor.json.json_extractor_backend = backend_cls
37 |     return backend_cls
38 | 
39 | 
40 | @pytest.fixture
41 | def json0():  # sample nested payload used as extraction input by the tests
42 |     return {
43 |         "data": {
44 |             "users": [
45 |                 {"id": 0, "name": "Vang Stout", "gender": "female"},
46 |                 {"id": 1, "name": "Jeannie Gaines", "gender": "male"},
47 |                 {"id": 2, "name": "Guzman Hunter", "gender": "female"},
48 |                 {"id": 3, "name": "Janine Gross"},  # deliberately has no "gender"
49 |                 {"id": 4, "name": "Clarke Patrick", "gender": "male"},
50 |                 {"id": 5, "name": "Whitney Mcfadden"},  # no "gender" either
51 |             ],
52 |             "start": 0,
53 |             "size": 5,
54 |             "total": 100,
55 |         },
56 |         "status": 0,
57 |     }
58 | 
59 | 
60 | @pytest.fixture(params=[False, True], ids=lambda x: f"stack_frame_support={x}")
61 | def stack_frame_support(request):
62 |     if request.param:
63 |         yield True
64 |     else:
65 |         with mock.patch("inspect.currentframe") as mocked:
66 |             mocked.return_value = None
67 |             yield False
68 | 


--------------------------------------------------------------------------------
/tests/mypy.ini:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | follow_imports = silent
 3 | disallow_any_generics = false
 4 | plugins = data_extractor.contrib.mypy:plugin
 5 | 
 6 | [mypy-mypy.*]
 7 | ignore_missing_imports = true
 8 | 
 9 | [mypy-lxml.*]
10 | ignore_missing_imports = true
11 | 
12 | [mypy-cssselect.*]
13 | ignore_missing_imports = true
14 | 
15 | [mypy-jsonpath.*]
16 | ignore_missing_imports = true
17 | 
18 | [mypy-jsonpath_rw.*]
19 | ignore_missing_imports = true
20 | 
21 | [mypy-jsonpath_rw_ext.*]
22 | ignore_missing_imports = true
23 | 
24 | [mypy-tests.*]
25 | ignore_missing_imports = true
26 | disallow_untyped_defs = false
27 | 
28 | [mypy-pytest.*]
29 | ignore_missing_imports = true
30 | 


--------------------------------------------------------------------------------
/tests/test_exceptions.py:
--------------------------------------------------------------------------------
 1 | # Third Party Library
 2 | import pytest
 3 | 
 4 | # First Party Library
 5 | import data_extractor.json
 6 | 
 7 | from data_extractor.exceptions import ExtractError
 8 | from data_extractor.item import Field, Item
 9 | from data_extractor.json import JSONExtractor
10 | 
11 | 
12 | def test_no_needed_packages():
13 |     data_extractor.json.json_extractor_backend = None
14 |     with pytest.raises(RuntimeError):
15 |         JSONExtractor()
16 | 
17 | 
18 | @pytest.mark.usefixtures("json_extractor_backend")
19 | def test_exception_trace(json0):  # checks ExtractError's extractor chain + message
20 |     data = json0
21 | 
22 |     class User(Item):
23 |         uid = Field(JSONExtractor("id"))
24 |         username = Field(JSONExtractor("name"), name="name")
25 |         gender = Field(JSONExtractor("gender"))  # no default is given here
26 | 
27 |     class UserResponse(Item):
28 |         start = Field(JSONExtractor("start"), default=0)
29 |         size = Field(JSONExtractor("size"))
30 |         total = Field(JSONExtractor("total"))
31 |         data = User(JSONExtractor("users[*]"), is_many=True)
32 | 
33 |     extractor = UserResponse(JSONExtractor("data"))
34 | 
35 |     with pytest.raises(ExtractError) as catch:
36 |         extractor.extract(data)
37 | 
38 |     exc = catch.value
39 |     assert len(exc.extractors) == 3  # ordered innermost -> outermost (see below)
40 |     assert exc.extractors[0] is User.gender
41 |     assert exc.extractors[1] is UserResponse.data
42 |     assert exc.extractors[2] is extractor
43 |     assert exc.element == {"id": 3, "name": "Janine Gross"}  # user without "gender"
44 | 
45 |     assert (
46 |         str(exc.args[0])
47 |         == """
48 | ExtractError(Field(JSONExtractor('gender')), element={'id': 3, 'name': 'Janine Gross'})
49 | |-UserResponse(JSONExtractor('data'))
50 |   |-User(JSONExtractor('users[*]'), is_many=True)
51 |     |-Field(JSONExtractor('gender'))
52 |       |-{'id': 3, 'name': 'Janine Gross'}
53 |     """.strip()
54 |     )
55 | 


--------------------------------------------------------------------------------
/tests/test_generic_item.py:
--------------------------------------------------------------------------------
 1 | # Standard Library
 2 | from collections import namedtuple
 3 | 
 4 | # Third Party Library
 5 | import pytest
 6 | 
 7 | # First Party Library
 8 | from data_extractor.item import RV, Field, Item
 9 | from data_extractor.json import JSONExtractor
10 | 
11 | # Local Folder
12 | from .utils import D
13 | 
14 | 
15 | def test_field_with_type():
16 |     StrField = Field[str]
17 |     f = StrField(D())
18 |     assert f.type is str
19 |     assert f.extract(1) == "1"
20 | 
21 |     f = Field[str](D())
22 |     assert f.type is str
23 |     assert f.extract(1) == "1"
24 | 
25 |     assert Field[str](D()).extract(1) == "1"
26 | 
27 | 
28 | def test_field_with_convertor():
29 |     f = Field(D(), convertor=lambda x: str(x).upper())
30 |     assert f.type is None
31 |     assert f.extract("abc") == "ABC"
32 |     f = Field(D(), type=str, convertor=lambda x: str(x).upper())
33 |     assert f.type is str
34 |     assert f.extract("abc") == "ABC"
35 | 
36 | 
37 | @pytest.mark.usefixtures("json_extractor_backend")
38 | def test_item_with_type():
39 |     class Article(Item[RV]):
40 |         title = Field[str](JSONExtractor("title"))
41 | 
42 |     ArticleTuple = namedtuple("ArticleTuple", "title")
43 |     article = Article[ArticleTuple]()
44 |     rv = article.extract({"title": "example"})
45 |     assert isinstance(rv, ArticleTuple)
46 |     assert rv.title == "example"
47 | 


--------------------------------------------------------------------------------
/tests/test_json.py:
--------------------------------------------------------------------------------
  1 | # Standard Library
  2 | import json
  3 | import re
  4 | 
  5 | # Third Party Library
  6 | import pytest
  7 | 
  8 | # First Party Library
  9 | import data_extractor.json
 10 | 
 11 | from data_extractor.exceptions import ExprError, ExtractError
 12 | from data_extractor.json import JSONExtractor
 13 | 
 14 | 
 15 | @pytest.fixture(scope="module")
 16 | def text():  # raw JSON document: "foo" is a list of two objects with "baz"
 17 |     return """
 18 |         {
 19 |             "foo": [
 20 |                 {
 21 |                     "baz": 1
 22 |                 },
 23 |                 {
 24 |                     "baz": 2
 25 |                 }
 26 |             ]
 27 |         }
 28 |     """
 29 | 
 30 | 
 31 | @pytest.fixture(scope="module")
 32 | def element(text):  # the `text` fixture parsed into Python objects
 33 |     return json.loads(text)
 34 | 
 35 | 
 36 | @pytest.mark.usefixtures("json_extractor_backend")
 37 | @pytest.mark.parametrize(
 38 |     "expr,expect",
 39 |     [
 40 |         ("foo[*].baz", [1, 2]),
 41 |         ("foo.baz", []),  # expected: nothing matches -> empty list
 42 |         ("foo[0].baz", [1]),
 43 |         ("foo[1].baz", [2]),
 44 |         ("foo[2].baz", []),  # "foo" has only two entries -> empty list
 45 |     ],
 46 |     ids=repr,
 47 | )
 48 | def test_extract(element, expr, expect):  # extract() returns every match as a list
 49 |     extractor = JSONExtractor(expr)
 50 |     assert expect == extractor.extract(element)
 51 | 
 52 | 
 53 | @pytest.mark.usefixtures("json_extractor_backend")
 54 | @pytest.mark.parametrize(
 55 |     "expr,expect",
 56 |     [
 57 |         ("foo[*].baz", 1),  # several matches -> the first one
 58 |         ("foo.baz", "default"),  # no match -> the supplied default
 59 |         ("foo[0].baz", 1),
 60 |         ("foo[1].baz", 2),
 61 |         ("foo[2].baz", "default"),
 62 |     ],
 63 |     ids=repr,
 64 | )
 65 | def test_extract_first(element, expr, expect):  # first match, else `default`
 66 |     extractor = JSONExtractor(expr)
 67 |     assert expect == extractor.extract_first(element, default="default")
 68 | 
 69 | 
 70 | @pytest.mark.usefixtures("json_extractor_backend")
 71 | @pytest.mark.parametrize("expr", ["foo.baz", "foo[2].baz"], ids=repr)
 72 | def test_extract_first_without_default(element, expr):
 73 |     extractor = JSONExtractor(expr)
 74 | 
 75 |     with pytest.raises(ExtractError) as catch:
 76 |         extractor.extract_first(element)
 77 | 
 78 |     exc = catch.value
 79 |     assert len(exc.extractors) == 1
 80 |     assert exc.extractors[0] is extractor
 81 |     assert exc.element is element
 82 | 
 83 | 
 84 | @pytest.mark.usefixtures("json_extractor_backend")
 85 | @pytest.mark.parametrize("expr", ["foo..", "a[]", ""], ids=repr)
 86 | def test_invalid_jsonpath_expr(element, expr):  # bad expr fails at construction
 87 |     with pytest.raises(ExprError) as catch:
 88 |         JSONExtractor(expr)
 89 | 
 90 |     exc = catch.value
 91 | 
 92 |     if (
 93 |         data_extractor.json.json_extractor_backend
 94 |         is data_extractor.json.JSONPathExtractor
 95 |     ):
 96 |         # JSONExtractor implemented by 'jsonpath-extractor'
 97 |         # only raises SyntaxError
 98 |         assert isinstance(exc.exc, SyntaxError)
 99 |     else:
100 |         # Third Party Library
101 |         from jsonpath_rw.lexer import JsonPathLexerError
102 |         # NOTE(review): listing Exception makes the check below accept any error.
103 |         assert isinstance(exc.exc, (JsonPathLexerError, Exception))
104 | 
105 |     assert re.match(r"ExprError with .+? raised by .+? extracting", str(exc))
106 | 


--------------------------------------------------------------------------------
/tests/test_lxml.py:
--------------------------------------------------------------------------------
  1 | # Standard Library
  2 | import importlib.util
  3 | import re
  4 | 
  5 | # Third Party Library
  6 | import pytest
  7 | 
  8 | # First Party Library
  9 | from data_extractor.exceptions import ExprError, ExtractError
 10 | from data_extractor.lxml import (
 11 |     AttrCSSExtractor,
 12 |     CSSExtractor,
 13 |     TextCSSExtractor,
 14 |     XPathExtractor,
 15 | )
 16 | 
 17 | need_cssselect = pytest.mark.skipif(  # skip marker for CSS-selector tests
 18 |     importlib.util.find_spec("cssselect") is None,
 19 |     reason="Missing 'cssselect'",
 20 | )
 21 | need_lxml = pytest.mark.skipif(  # skip marker for tests needing lxml itself
 22 |     importlib.util.find_spec("lxml") is None, reason="Missing 'lxml'"
 23 | )
 24 | 
 25 | 
 26 | @pytest.fixture(scope="module")
 27 | def text():  # HTML sample: three <li>, only the first two spans carry a class
 28 |     return """
 29 |         <html>
 30 |             <ul>
 31 |                 <li>
 32 |                     <span class='class_a'>a</span>
 33 |                     <i>i text 1</i>
 34 |                     <b>b text 1</b>
 35 |                 </li>
 36 |                 <li>
 37 |                     <span class='class_b'>b</span>
 38 |                     <i>i text 2</i>
 39 |                     <b>b text 2</b>
 40 |                 </li>
 41 |                 <li>
 42 |                     <span>c</span>
 43 |                     <i>i text 3</i>
 44 |                     <b>b text 3</b>
 45 |                 </li>
 46 |             </ul>
 47 |         </html>
 48 |     """
 49 | 
 50 | 
 51 | @pytest.fixture(scope="module")
 52 | def element(text):  # parsed HTML tree; dependent tests are skipped without lxml
 53 |     try:
 54 |         # Third Party Library
 55 |         from lxml.html import fromstring
 56 |     except ImportError:
 57 |         pytest.skip("Missing 'lxml'")
 58 | 
 59 |     return fromstring(text)
 60 | 
 61 | 
 62 | @pytest.mark.parametrize(
 63 |     "Extractor,expr,expect",
 64 |     [
 65 |         pytest.param(TextCSSExtractor, "span.class_a", ["a"], marks=need_cssselect),
 66 |         pytest.param(TextCSSExtractor, "span.class_b", ["b"], marks=need_cssselect),
 67 |         pytest.param(TextCSSExtractor, "span", ["a", "b", "c"], marks=need_cssselect),
 68 |         pytest.param(TextCSSExtractor, "notexits", [], marks=need_cssselect),
 69 |         (XPathExtractor, "//span[@class='class_a']/text()", ["a"]),
 70 |         (XPathExtractor, "//span[@class='class_b']/text()", ["b"]),
 71 |         (XPathExtractor, "//span[@class]/text()", ["a", "b"]),
 72 |         (XPathExtractor, "//span/@class", ["class_a", "class_b"]),
 73 |         (XPathExtractor, "//notexists/text()", []),
 74 |     ],
 75 |     ids=repr,
 76 | )
 77 | def test_extract(element, Extractor, expr, expect):  # all matches, as a list
 78 |     extractor = Extractor(expr)
 79 |     assert expect == extractor.extract(element)
 80 | 
 81 | 
 82 | @pytest.mark.parametrize(
 83 |     "Extractor,expr,expect",
 84 |     [
 85 |         pytest.param(TextCSSExtractor, "span.class_a", "a", marks=need_cssselect),
 86 |         pytest.param(TextCSSExtractor, "span.class_b", "b", marks=need_cssselect),
 87 |         pytest.param(TextCSSExtractor, "span", "a", marks=need_cssselect),
 88 |         pytest.param(TextCSSExtractor, "notexits", "default", marks=need_cssselect),
 89 |         (XPathExtractor, "//span[@class='class_a']/text()", "a"),
 90 |         (XPathExtractor, "//span[@class='class_b']/text()", "b"),
 91 |         (XPathExtractor, "//span[@class]/text()", "a"),
 92 |         (XPathExtractor, "//span/@class", "class_a"),
 93 |         (XPathExtractor, "//notexists/text()", "default"),
 94 |     ],
 95 |     ids=repr,
 96 | )
 97 | def test_extract_first(element, Extractor, expr, expect):  # first match or default
 98 |     extractor = Extractor(expr)
 99 |     assert expect == extractor.extract_first(element, default="default")
100 | 
101 | 
102 | @pytest.mark.parametrize(
103 |     "Extractor,expr",
104 |     [
105 |         pytest.param(TextCSSExtractor, "notexits", marks=need_cssselect),
106 |         (XPathExtractor, "//notexists/text()"),
107 |     ],
108 |     ids=repr,
109 | )
110 | def test_extract_first_without_default(element, Extractor, expr):
111 |     extractor = Extractor(expr)
112 |     with pytest.raises(ExtractError) as catch:
113 |         extractor.extract_first(element)
114 | 
115 |     exc = catch.value
116 |     assert len(exc.extractors) == 1
117 |     assert exc.extractors[0] is extractor
118 |     assert exc.element is element
119 | 
120 | 
121 | @need_cssselect
122 | @pytest.mark.parametrize(
123 |     "expr,attr,expect",
124 |     [
125 |         ("span.class_a", "class", ["class_a"]),
126 |         ("span.class_b", "class", ["class_b"]),
127 |         ("span", "class", ["class_a", "class_b"]),  # third span has no class
128 |         ("span", "notexists", []),
129 |         ("notexists", "class", []),
130 |     ],
131 |     ids=repr,
132 | )
133 | def test_attr_css_extract(element, expr, attr, expect):  # attribute values as list
134 |     extractor = AttrCSSExtractor(expr=expr, attr=attr)
135 |     assert expect == extractor.extract(element)
136 | 
137 | 
138 | @need_cssselect
139 | @pytest.mark.parametrize(
140 |     "expr,attr,expect",
141 |     [
142 |         ("span.class_a", "class", "class_a"),
143 |         ("span.class_b", "class", "class_b"),
144 |         ("span", "class", "class_a"),
145 |         ("span", "notexists", "default"),  # unknown attribute -> default
146 |         ("notexists", "class", "default"),  # unknown element -> default
147 |     ],
148 |     ids=repr,
149 | )
150 | def test_attr_css_extract_first(element, expr, attr, expect):  # first or default
151 |     extractor = AttrCSSExtractor(expr=expr, attr=attr)
152 |     assert expect == extractor.extract_first(element, default="default")
153 | 
154 | 
155 | @need_cssselect
156 | @pytest.mark.parametrize(
157 |     "expr,attr", [("span", "notexists"), ("notexists", "class")], ids=repr
158 | )
159 | def test_attr_css_extract_first_without_default(element, expr, attr):
160 |     extractor = AttrCSSExtractor(expr=expr, attr=attr)
161 |     with pytest.raises(ExtractError) as catch:
162 |         extractor.extract_first(element)
163 | 
164 |     exc = catch.value
165 |     assert len(exc.extractors) == 1
166 |     assert exc.extractors[0] is extractor
167 |     assert exc.element is element
168 | 
169 | 
170 | @need_lxml
171 | @pytest.mark.parametrize("expr", ["///", "/text(", ""])
172 | def test_invalid_xpath_expr(expr):
173 |     with pytest.raises(ExprError) as catch:
174 |         XPathExtractor(expr)
175 | 
176 |     exc = catch.value
177 |     # Third Party Library
178 |     from lxml.etree import XPathError
179 | 
180 |     assert isinstance(exc.exc, XPathError)
181 |     assert re.match(r"ExprError with .+? raised by .+? extracting", str(exc))
182 | 
183 | 
184 | @pytest.mark.parametrize("expr", ["//ns:a"])
185 | def test_invalid_xpath_expr_by_XPathEvalError_from_extract(element, expr):
186 |     extractor = XPathExtractor(expr)
187 |     with pytest.raises(ExprError) as catch:
188 |         extractor.extract(element)
189 | 
190 |     exc = catch.value
191 |     assert exc.extractor is extractor
192 |     # Third Party Library
193 |     from lxml.etree import XPathEvalError
194 | 
195 |     assert isinstance(exc.exc, XPathEvalError)
196 |     assert re.match(r"ExprError with .+? raised by .+? extracting", str(exc))
197 | 
198 | 
199 | @need_cssselect
200 | @pytest.mark.parametrize("expr", ["<", "a##", ""])
201 | def test_invalid_css_selector_expr(element, expr):
202 |     with pytest.raises(ExprError) as catch:
203 |         CSSExtractor(expr)
204 | 
205 |     exc = catch.value
206 |     # Third Party Library
207 |     from cssselect.parser import SelectorError
208 | 
209 |     assert isinstance(exc.exc, SelectorError)
210 |     assert re.match(r"ExprError with .+? raised by .+? extracting", str(exc))
211 | 
212 | 
213 | def test_xpath_result_not_list(element):  # scalar XPath result is still list-wrapped
214 |     extractor = XPathExtractor("normalize-space(//span)")
215 |     assert extractor.extract(element) == ["a"]
216 |     assert extractor.extract_first(element) == "a"
217 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
  1 | # Third Party Library
  2 | # Standard Library
  3 | import importlib.util
  4 | import sys
  5 | 
  6 | # Third Party Library
  7 | import pytest
  8 | 
  9 | # First Party Library
 10 | from data_extractor.core import AbstractSimpleExtractor
 11 | from data_extractor.item import Field, Item
 12 | from data_extractor.json import (
 13 |     JSONPathExtractor,
 14 |     JSONPathRWExtExtractor,
 15 |     JSONPathRWExtractor,
 16 |     _missing_jsonpath,
 17 |     _missing_jsonpath_rw,
 18 |     _missing_jsonpath_rw_ext,
 19 | )
 20 | from data_extractor.lxml import (
 21 |     AttrCSSExtractor,
 22 |     CSSExtractor,
 23 |     TextCSSExtractor,
 24 |     XPathExtractor,
 25 |     _missing_cssselect,
 26 |     _missing_lxml,
 27 | )
 28 | from data_extractor.utils import (
 29 |     LazyStr,
 30 |     Property,
 31 |     getframe,
 32 |     is_complex_extractor,
 33 |     is_extractor,
 34 |     is_simple_extractor,
 35 | )
 36 | 
 37 | 
 38 | def test_lazy_str():
 39 |     string = ""
 40 | 
 41 |     def func():
 42 |         nonlocal string
 43 |         return string
 44 | 
 45 |     ls = LazyStr(func=func)
 46 |     assert str(ls) == ""
 47 | 
 48 |     string = "abc"
 49 |     assert str(ls) == "abc"
 50 | 
 51 | 
 52 | @pytest.fixture(params=[Field(), Item()], ids=repr)
 53 | def complex_extractor(request):  # one Field and one Item instance, in turn
 54 |     return request.param
 55 | 
 56 | 
 57 | @pytest.fixture(
 58 |     params=[  # each entry: a real extractor, or a skipped placeholder param
 59 |         (
 60 |             AttrCSSExtractor(expr="div.class", attr="id")
 61 |             if not _missing_cssselect
 62 |             else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
 63 |         ),
 64 |         (
 65 |             CSSExtractor(expr="div.class")
 66 |             if not _missing_cssselect
 67 |             else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
 68 |         ),
 69 |         (
 70 |             JSONPathExtractor(expr="boo")
 71 |             if not _missing_jsonpath
 72 |             else pytest.param("Missing 'jsonpath-extractor'", marks=pytest.mark.skip())
 73 |         ),
 74 |         (
 75 |             JSONPathRWExtractor(expr="boo")
 76 |             if not _missing_jsonpath_rw
 77 |             else pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip())
 78 |         ),
 79 |         (
 80 |             JSONPathRWExtExtractor(expr="boo")
 81 |             if not _missing_jsonpath_rw_ext
 82 |             else pytest.param("Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip())
 83 |         ),
 84 |         (
 85 |             TextCSSExtractor(expr="div.class")
 86 |             if not _missing_cssselect
 87 |             else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
 88 |         ),
 89 |         (
 90 |             XPathExtractor(expr="//div")
 91 |             if not _missing_lxml
 92 |             else pytest.param("Missing 'lxml'", marks=pytest.mark.skip())
 93 |         ),
 94 |     ],
 95 |     ids=repr,
 96 | )
 97 | def simple_extractor(request):  # hands the selected simple extractor to the test
 98 |     return request.param
 99 | 
100 | 
101 | def test_complex_extractor_is_extractor(complex_extractor):  # Field/Item qualify
102 |     assert is_extractor(complex_extractor)
103 | 
104 | 
105 | def test_simple_extractor_is_extractor(simple_extractor):  # so do simple ones
106 |     assert is_extractor(simple_extractor)
107 | 
108 | 
109 | def test_is_complex_extractor(complex_extractor):  # positive case
110 |     assert is_complex_extractor(complex_extractor)
111 | 
112 | 
113 | def test_is_not_complex_extractor(simple_extractor):  # negative case
114 |     assert not is_complex_extractor(simple_extractor)
115 | 
116 | 
117 | def test_is_simple_extractor(simple_extractor):  # positive case
118 |     assert is_simple_extractor(simple_extractor)
119 | 
120 | 
121 | def test_is_not_simple_extractor(complex_extractor):  # negative case
122 |     assert not is_simple_extractor(complex_extractor)
123 | 
124 | 
125 | @pytest.mark.skipif(
126 |     importlib.util.find_spec("cssselect") is not None,
127 |     reason="'cssselect' installed",
128 | )
129 | def test_missing_cssselect():
130 |     with pytest.raises(RuntimeError) as catch:
131 |         CSSExtractor("a>b")
132 | 
133 |     assert "cssselect" in str(catch.value)
134 | 
135 |     with pytest.raises(RuntimeError) as catch:
136 |         AttrCSSExtractor("a>b", "href")
137 | 
138 |     assert "cssselect" in str(catch.value)
139 | 
140 |     with pytest.raises(RuntimeError) as catch:
141 |         TextCSSExtractor("a>b")
142 | 
143 |     assert "cssselect" in str(catch.value)
144 | 
145 | 
146 | @pytest.mark.skipif(
147 |     importlib.util.find_spec("lxml") is not None, reason="'lxml' installed"
148 | )
149 | def test_missing_lxml():
150 |     with pytest.raises(RuntimeError) as catch:
151 |         XPathExtractor("//boo")
152 | 
153 |     assert "lxml" in str(catch.value)
154 | 
155 | 
156 | @pytest.mark.skipif(
157 |     importlib.util.find_spec("jsonpath") is not None,
158 |     reason="'jsonpath-extractor' installed",
159 | )
160 | def test_missing_jsonpath_extractor():
161 |     with pytest.raises(RuntimeError) as catch:
162 |         JSONPathExtractor("boo")
163 | 
164 |     assert "jsonpath-extractor" in str(catch.value)
165 | 
166 | 
167 | @pytest.mark.skipif(
168 |     importlib.util.find_spec("jsonpath_rw") is not None,
169 |     reason="'jsonpath-rw' installed",
170 | )
171 | def test_missing_jsonpath_rw():
172 |     with pytest.raises(RuntimeError) as catch:
173 |         JSONPathRWExtractor("boo")
174 | 
175 |     assert "jsonpath-rw" in str(catch.value)
176 | 
177 |     with pytest.raises(RuntimeError) as catch:
178 |         JSONPathRWExtExtractor("boo")
179 | 
180 |     assert "jsonpath-rw" in str(catch.value)
181 | 
182 | 
183 | @pytest.mark.skipif(
184 |     not (
185 |         importlib.util.find_spec("jsonpath_rw_ext") is None
186 |         and importlib.util.find_spec("jsonpath_rw") is not None
187 |     ),
188 |     reason="'jsonpath-rw-ext' installed or 'jsonpath-rw' uninstalled",
189 | )
190 | def test_missing_jsonpath_rw_ext():
191 |     with pytest.raises(RuntimeError) as catch:
192 |         JSONPathRWExtExtractor("boo")
193 | 
194 |     assert "jsonpath-rw-ext" in str(catch.value)
195 | 
196 | 
197 | def test_getframe_value_error():  # a depth beyond the recursion limit must fail
198 |     with pytest.raises(ValueError):
199 |         getframe(sys.getrecursionlimit() + 1)
200 | 
201 | 
202 | def test_property_accessing_error():
203 |     class Bar(AbstractSimpleExtractor):
204 |         unset_attribute = Property[None]()
205 | 
206 |         def extract(self, element):
207 |             return super().extract(element)
208 | 
209 |     assert isinstance(Bar.unset_attribute, Property)
210 | 
211 |     with pytest.raises(AttributeError):
212 |         bar = Bar("dummy expr")
213 |         bar.unset_attribute
214 | 
215 | 
216 | def test_property_re_set_error():  # a Property accepts one assignment only
217 |     class Bar(AbstractSimpleExtractor):
218 |         boo = Property[int]()
219 | 
220 |         def extract(self, element):
221 |             return super().extract(element)
222 | 
223 |     bar = Bar("dummy expr")
224 |     bar.boo = 0
225 |     assert bar.boo == 0
226 |     with pytest.raises(AttributeError):
227 |         bar.boo = 1  # second assignment is rejected
228 |     assert bar.boo == 0  # and the first value is kept
229 | 
230 | 
231 | def test_property_change_internal_value_success():  # explicit override path
232 |     class Bar(AbstractSimpleExtractor):
233 |         boo = Property[int]()
234 | 
235 |         def extract(self, element):
236 |             return super().extract(element)
237 | 
238 |     bar = Bar("dummy expr")
239 |     bar.boo = 0
240 |     assert bar.boo == 0
241 |     Property.change_internal_value(bar, "boo", 1)  # replaces an already-set value
242 |     assert bar.boo == 1
243 | 
244 | 
245 | def test_property_change_internal_value_failure():  # target is not a Property
246 |     class Bar(AbstractSimpleExtractor):
247 |         boo = 1  # plain class attribute, deliberately not a Property
248 | 
249 |         def extract(self, element):
250 |             return super().extract(element)
251 | 
252 |     bar = Bar("dummy expr")
253 |     with pytest.raises(AttributeError):
254 |         Property.change_internal_value(bar, "boo", 1)
255 | 


--------------------------------------------------------------------------------
/tests/typesafety/conftest.py:
--------------------------------------------------------------------------------
 1 | # Standard Library
 2 | from typing import List
 3 | 
 4 | # Third Party Library
 5 | import pytest
 6 | 
 7 | from _pytest.nodes import Node
 8 | 
 9 | xfail: List[str] = []
10 | 
11 | 
12 | def pytest_collection_modifyitems(config, items: List[Node]):
13 |     for item in items:
14 |         if item.name in xfail:
15 |             item.add_marker(pytest.mark.xfail(strict=True))
16 | 


--------------------------------------------------------------------------------
/tests/typesafety/test_extracted_typed_dict.yml:
--------------------------------------------------------------------------------
 1 | - case: item_extracted_result_is_typeddict  # extract() is revealed as a TypedDict
 2 |   skip: sys.version_info < (3, 8)  # compare (major, minor), not minor alone
 3 |   main: |
 4 |     from tests.utils import D
 5 |     from data_extractor.item import Item, Field
 6 | 
 7 |     class Point2D(Item):
 8 |         x = Field[int](D())
 9 |         y = Field[int](D())
10 |         _dummy = Field(D())
11 |         _dummy_val = 1
12 | 
13 |     p = Point2D(D())
14 |     rv = p.extract({"x": 1, "y": 3})
15 |     reveal_type(rv)
16 |   out: |
17 |     main:12: note: Revealed type is "TypedDict({'x': builtins.int, 'y': builtins.int, '_dummy': Any})"
18 | - case: item_extracted_many_results_are_typeddict  # is_many=True -> list of TypedDict
19 |   skip: sys.version_info < (3, 8)  # compare (major, minor), not minor alone
20 |   main: |
21 |     from tests.utils import D
22 |     from data_extractor.item import Item, Field
23 | 
24 |     class Point2D(Item):
25 |         def distance(self, point) -> int:
26 |             return 1
27 |         _dummy = Field(D())
28 |         _dummy_val = 1
29 |         x = Field[int](D())
30 |         y = Field[int](D())
31 | 
32 |     p = Point2D(D(), is_many=True)
33 |     rvs = p.extract([{"x": 1, "y": 3}])
34 |     reveal_type(rvs)
35 |   out: |
36 |     main:14: note: Revealed type is "builtins.list[TypedDict({'_dummy': Any, 'x': builtins.int, 'y': builtins.int})]"
37 | - case: item_in_place_extracting  # extract() called directly on a fresh instance
38 |   skip: sys.version_info < (3, 8)  # compare (major, minor), not minor alone
39 |   main: |
40 |     from tests.utils import D
41 |     from data_extractor.item import Item, Field
42 | 
43 |     class Point2D(Item):
44 |         def distance(self, point) -> int:
45 |             return 1
46 |         _dummy = Field(D())
47 |         _dummy_val = 1
48 |         x = Field[int](D())
49 |         y = Field[int](D())
50 | 
51 |     rvs = Point2D(D(), is_many=True).extract([{"x": 1, "y": 3}])
52 |     reveal_type(rvs)
53 |     rv = Point2D(D()).extract([{"x": 1, "y": 3}])
54 |     reveal_type(rv)
55 |   out: |
56 |     main:13: note: Revealed type is "builtins.list[TypedDict({'_dummy': Any, 'x': builtins.int, 'y': builtins.int})]"
57 |     main:15: note: Revealed type is "TypedDict({'_dummy': Any, 'x': builtins.int, 'y': builtins.int})"
58 | - case: name_parameter_overwrite_typeddict_type  # name= sets the TypedDict key
59 |   skip: sys.version_info < (3, 8)  # compare (major, minor), not minor alone
60 |   main: |
61 |     from tests.utils import D
62 |     from data_extractor.item import Item, Field
63 | 
64 |     class NamedPoint(Item):
65 |         x = Field[int](D())
66 |         y = Field[int](D())
67 |         name_ = Field[str](D(), name="name")
68 | 
69 |     p = NamedPoint(D())
70 |     rv = p.extract([{"x": 1, "y": 3, "name": "A"}])
71 |     reveal_type(rv)
72 |   out: |
73 |     main:11: note: Revealed type is "TypedDict({'x': builtins.int, 'y': builtins.int, 'name': builtins.str})"
74 | 


--------------------------------------------------------------------------------
/tests/typesafety/test_generic.yml:
--------------------------------------------------------------------------------
  1 | - case: field_extract_without_typing  # unparameterized Field reveals Any
  2 |   main: |
  3 |     from tests.utils import D
  4 |     from data_extractor.item import Field
  5 | 
  6 |     f = Field(D())
  7 |     rv = f.extract(1)
  8 |     reveal_type(rv)
  9 |   out: |
 10 |     main:6: note: Revealed type is "Any"
 11 | - case: field_extract_with_typing  # subscript, type= and annotation all fix the type
 12 |   main: |
 13 |     from tests.utils import D
 14 |     from data_extractor.item import Field
 15 | 
 16 |     f_str = Field[str](D())
 17 |     rv_str = f_str.extract(1)
 18 |     reveal_type(rv_str)
 19 | 
 20 |     f_int = Field(D(), type=int)
 21 |     rv_int = f_int.extract("1")
 22 |     reveal_type(rv_int)
 23 | 
 24 |     f_str_2: Field[str] = Field(D())
 25 |     rv_str_2 = f_str_2.extract("1")
 26 |     reveal_type(rv_str_2)
 27 | 
 28 |     reveal_type(Field[str](D()).extract(1))
 29 |   out: |
 30 |     main:6: note: Revealed type is "builtins.str"
 31 |     main:10: note: Revealed type is "builtins.int"
 32 |     main:14: note: Revealed type is "builtins.str"
 33 |     main:16: note: Revealed type is "builtins.str"
 34 | - case: field_extract_with_typing_alias  # an alias of Field[str] keeps the type
 35 |   main: |
 36 |     from tests.utils import D
 37 |     from data_extractor.item import Field
 38 | 
 39 |     StrField = Field[str]
 40 |     f = StrField(D())
 41 |     rv = f.extract(1)
 42 |     reveal_type(rv)
 43 |   out: |
 44 |     main:7: note: Revealed type is "builtins.str"
 45 | - case: field_type_hinting_conflict_with_type_param  # type= must agree with Field[T]
 46 |   main: |
 47 |     from tests.utils import D
 48 |     from data_extractor.item import Field
 49 | 
 50 |     f_1 = Field[str](D(), type=int)
 51 |     reveal_type(f_1)
 52 | 
 53 |     f_2: Field[str] = Field(D(), type=int)
 54 |     reveal_type(f_2)
 55 |   out: |
 56 |     main:4: error: Argument "type" to "Field" has incompatible type "Type[int]"; expected "Optional[Type[str]]"  [arg-type]
 57 |     main:5: note: Revealed type is "data_extractor.item.Field[builtins.str]"
 58 |     main:7: error: Argument "type" to "Field" has incompatible type "Type[int]"; expected "Optional[Type[str]]"  [arg-type]
 59 |     main:8: note: Revealed type is "data_extractor.item.Field[builtins.str]"
 60 | - case: field_type_hinting_conflict_with_convertor_param  # convertor must return T
 61 |   main: |
 62 |     from tests.utils import D
 63 |     from data_extractor.item import Field
 64 | 
 65 |     f_1 = Field[str](D(), convertor=int)
 66 |     reveal_type(f_1)
 67 | 
 68 |     f_2: Field[str] = Field(D(), convertor=int)
 69 |     reveal_type(f_2)
 70 |   out: |
 71 |     main:4: error: Argument "convertor" to "Field" has incompatible type "Type[int]"; expected "Optional[Callable[[Any], str]]"  [arg-type]
 72 |     main:5: note: Revealed type is "data_extractor.item.Field[builtins.str]"
 73 |     main:7: error: Argument "convertor" to "Field" has incompatible type "Type[int]"; expected "Optional[Callable[[Any], str]]"  [arg-type]
 74 |     main:8: note: Revealed type is "data_extractor.item.Field[builtins.str]"
 75 | - case: field_extract_with_typing_while_trying_to_change_type  # f.type = int is an error
 76 |   main: |
 77 |     from tests.utils import D
 78 |     from data_extractor.item import Field
 79 | 
 80 |     f = Field[str](D())
 81 |     rv = f.extract(1)
 82 |     reveal_type(rv)
 83 |     f.type = int  # unable to change
 84 |     rv = f.extract("1")
 85 |     reveal_type(rv)
 86 |   out: |
 87 |     main:6: note: Revealed type is "builtins.str"
 88 |     main:7: error: Incompatible types in assignment (expression has type "Type[int]", variable has type "Optional[Type[str]]")  [assignment]
 89 |     main:9: note: Revealed type is "builtins.str"
 90 | - case: field_extract_with_flag_is_many  # is_many=True: extract() is expected to reveal list[str], both via a named variable and via an inline call
 91 |   main: |
 92 |     from tests.utils import D
 93 |     from data_extractor.item import Field
 94 | 
 95 |     f = Field[str](D(), is_many=True)
 96 |     rvs = f.extract([1])
 97 |     reveal_type(rvs)
 98 | 
 99 |     reveal_type(Field[str](D(), is_many=True).extract([1]))
100 |   out: |
101 |     main:6: note: Revealed type is "builtins.list[builtins.str]"
102 |     main:8: note: Revealed type is "builtins.list[builtins.str]"
103 | - case: field_extract_while_trying_to_change_the_flag_of_is_many  # setting f.is_many = False after construction: no diagnostic is listed for it and extract() is still expected to reveal list[str]
104 |   main: |
105 |     from tests.utils import D
106 |     from data_extractor.item import Field
107 | 
108 |     f = Field[str](D(), is_many=True)
109 |     rvs = f.extract([1])
110 |     reveal_type(rvs)
111 | 
112 |     f.is_many = False  # unable to change
113 |     rv = f.extract(1)
114 |     reveal_type(rv)
115 |   out: |
116 |     main:6: note: Revealed type is "builtins.list[builtins.str]"
117 |     main:10: note: Revealed type is "builtins.list[builtins.str]"
118 | - case: subclass_field_extract_with_flag_is_many  # generic Field subclass: extract() is expected to reveal str without is_many and list[str] with is_many=True
119 |   main: |
120 |     from tests.utils import D
121 |     from data_extractor import Field, RV
122 | 
123 |     class CField(Field[RV]):
124 |         pass
125 | 
126 |     f1 = CField[str](D())
127 |     rv = f1.extract([1])
128 |     reveal_type(rv)
129 |     f2 = CField[str](D(), is_many=True)
130 |     rvs = f2.extract([1])
131 |     reveal_type(rvs)
132 |   out: |
133 |     main:9: note: Revealed type is "builtins.str"
134 |     main:12: note: Revealed type is "builtins.list[builtins.str]"
135 | - case: item_extract_with_flag_is_many  # field-less Item subclass: extract() is expected to reveal TypedDict({}) and, with is_many=True, list[TypedDict({})]
136 |   main: |
137 |     from tests.utils import D
138 |     from data_extractor import RV, Item
139 | 
140 |     class C(Item[RV]):
141 |         pass
142 | 
143 |     f1 = C(D())
144 |     rv = f1.extract([1])
145 |     reveal_type(rv)
146 |     f2 = C(D(), is_many=True)
147 |     rvs = f2.extract([1])
148 |     reveal_type(rvs)
149 |   out: |
150 |     main:9: note: Revealed type is "TypedDict({})"
151 |     main:12: note: Revealed type is "builtins.list[TypedDict({})]"
152 | - case: field_is_many_work_with_assign_expr  # chained assignment f2 = f1 = Field(..., is_many=True): both names are expected to reveal list[Any] from extract()
153 |   main: |
154 |     from tests.utils import D
155 |     from data_extractor import Field
156 | 
157 |     f2 = f1 = Field(D(), is_many=True)
158 |     rv1 = f1.extract([1])
159 |     reveal_type(rv1)
160 |     rv2 = f2.extract([1])
161 |     reveal_type(rv2)
162 |   out: |
163 |     main:6: note: Revealed type is "builtins.list[Any]"
164 |     main:8: note: Revealed type is "builtins.list[Any]"
165 | - case: field_is_many_work_with_assign_expr_in_classdef  # chained assignment inside a class body: B.f1 and B.f2 are both expected to reveal list[Any] from extract()
166 |   main: |
167 |     from tests.utils import D
168 |     from data_extractor import Field
169 | 
170 |     class B:
171 |         f2 = f1 = Field(D(), is_many=True)
172 | 
173 |     rv1 = B.f1.extract([1])
174 |     reveal_type(rv1)
175 |     rv2 = B.f2.extract([1])
176 |     reveal_type(rv2)
177 |   out: |
178 |     main:8: note: Revealed type is "builtins.list[Any]"
179 |     main:10: note: Revealed type is "builtins.list[Any]"
180 | - case: field_is_many_work_with_member_assign_expr  # chained assignment through class attributes: C.f1, f2 and C.f3 are expected to reveal list[Any]; assigning to C.f (declared int) and to undefined name c are expected errors
181 |   main: |
182 |     from typing import Any
183 | 
184 |     from tests.utils import D
185 |     from data_extractor import Field
186 | 
187 |     class C:
188 |         def bar(self):
189 |             pass
190 | 
191 |         f: int = 0
192 |         f1: Field
193 |         f3: Any
194 | 
195 |     C.f3 = f2 = C.f1 = Field(D(), is_many=True)
196 |     rv1 = C.f1.extract([1])
197 |     reveal_type(rv1)
198 |     rv2 = f2.extract([1])
199 |     reveal_type(rv2)
200 |     rv3 = C.f3.extract([1])
201 |     reveal_type(rv3)
202 | 
203 |     C.f = Field(D())
204 |     c.f = Field(D())
205 |   out: |
206 |     main:16: note: Revealed type is "builtins.list[Any]"
207 |     main:18: note: Revealed type is "builtins.list[Any]"
208 |     main:20: note: Revealed type is "builtins.list[Any]"
209 |     main:22: error: Incompatible types in assignment (expression has type "Field[Any]", variable has type "int")  [assignment]
210 |     main:23: error: Name "c" is not defined  [name-defined]
211 | - case: disallow_any_generic  # with disallow_any_generics=true: bare Field() is expected to need an annotation ([var-annotated]), while Field[int]() reveals int from extract()
212 |   main: |
213 |     from data_extractor import Field
214 |     f1 = Field()
215 |     f2 = Field[int]()
216 |     rv = f2.extract([1])
217 |     reveal_type(rv)
218 |   mypy_config: |
219 |     disallow_any_generics=true
220 |   out: |
221 |     main:2: error: Need type annotation for "f1"  [var-annotated]
222 |     main:5: note: Revealed type is "builtins.int"
223 | - case: extractor_cls_as_func_argument  # extractor class passed as a parameter: unannotated CF is expected to reveal Any (unchecked function); Type[Field[int]] with is_many=False reveals int
224 |   main: |
225 |     from typing import Type
226 |     from data_extractor import Field
227 | 
228 |     def bar1(CF):
229 |         f1 = CF(is_many=True)
230 |         rv1 = f1.extract([1])
231 |         reveal_type(rv1)
232 | 
233 |     def bar2(CF: Type[Field[int]]):
234 |         f2 = CF(is_many=False)
235 |         rv2 = f2.extract([1])
236 |         reveal_type(rv2)
237 |   out: |
238 |     main:7: note: Revealed type is "Any"
239 |     main:7: note: 'reveal_type' always outputs 'Any' in unchecked functions
240 |     main:12: note: Revealed type is "builtins.int"
241 | - case: item_classdef_not_effects_normal_function_call  # Item class body followed by an ordinary call (inspect.currentframe()); no `out` key is visible for this case - NOTE(review): presumably no diagnostics are expected, confirm
242 |   main: |
243 |     import inspect
244 |     from data_extractor import Item, Field
245 | 
246 |     class User(Item):
247 |         uid = Field()
248 | 
249 |     _ = inspect.currentframe()
250 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | # First Party Library
 2 | from data_extractor.core import AbstractSimpleExtractor
 3 | 
 4 | 
 5 | class DumyExtractor(AbstractSimpleExtractor):  # stub extractor: extract() hands back its input wrapped in a one-element list
 6 |     def __init__(self, expr=""):  # expr defaults to an empty string so the class can be built with no arguments
 7 |         super().__init__(expr)  # forward expr unchanged to the base-class initializer
 8 | 
 9 |     def extract(self, element):
10 |         return [element]  # no parsing or querying: the element itself is the single result
11 | 
12 | 
13 | D = DumyExtractor  # short alias for DumyExtractor
14 | 


--------------------------------------------------------------------------------