├── .coveragerc
├── .github
│   ├── CONTRIBUTING.md
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── documentation-improvement.md
│   │   ├── feature_request.md
│   │   └── submit-question.md
│   ├── config.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── ci-tests.yml
│       └── publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── LICENSE.txt
├── Makefile
├── README.md
├── asv_bench
│   ├── README.md
│   └── benchmarks
│       ├── __init__.py
│       ├── dataframe_schema.py
│       └── series_schema.py
├── docs
│   ├── Makefile
│   ├── make.bat
│   └── source
│       ├── _static
│       │   ├── custom.js
│       │   ├── default.css
│       │   ├── docsearch_config.js_t
│       │   ├── pandera-banner.png
│       │   ├── pandera-favicon.png
│       │   └── pandera-logo.png
│       ├── _templates
│       │   ├── class.rst
│       │   ├── dtype.rst
│       │   ├── model_component_class.rst
│       │   ├── module.rst
│       │   ├── page.html
│       │   ├── sidebar
│       │   │   └── search.html
│       │   └── strategies_module.rst
│       ├── checks.md
│       ├── conf.py
│       ├── configuration.md
│       ├── dask.md
│       ├── data_format_conversion.md
│       ├── data_synthesis_strategies.md
│       ├── dataframe_models.md
│       ├── dataframe_schemas.md
│       ├── decorators.md
│       ├── drop_invalid_rows.md
│       ├── dtype_validation.md
│       ├── dtypes.md
│       ├── error_report.md
│       ├── extensions.md
│       ├── fastapi.md
│       ├── frictionless.md
│       ├── fugue.md
│       ├── geopandas.md
│       ├── hypothesis.md
│       ├── index.md
│       ├── integrations.md
│       ├── jupyterlite_config.json
│       ├── lazy_validation.md
│       ├── modin.md
│       ├── mypy_integration.md
│       ├── notebooks
│       │   └── try_pandera.ipynb
│       ├── parsers.md
│       ├── polars.md
│       ├── pydantic_integration.md
│       ├── pyspark.md
│       ├── pyspark_sql.md
│       ├── reference
│       │   ├── core.rst
│       │   ├── dataframe_models.rst
│       │   ├── decorators.rst
│       │   ├── dtypes.rst
│       │   ├── errors.rst
│       │   ├── extensions.rst
│       │   ├── index.md
│       │   ├── io.rst
│       │   ├── schema_inference.rst
│       │   └── strategies.rst
│       ├── schema_inference.md
│       ├── series_schemas.md
│       └── supported_libraries.md
├── environment.yml
├── mypy.ini
├── noxfile.py
├── pandera
│   ├── __init__.py
│   ├── _pandas_deprecated.py
│   ├── _patch_numpy2.py
│   ├── accessors
│   │   ├── __init__.py
│   │   ├── dask_accessor.py
│   │   ├── modin_accessor.py
│   │   ├── pandas_accessor.py
│   │   ├── polars_accessor.py
│   │   ├── pyspark_accessor.py
│   │   └── pyspark_sql_accessor.py
│   ├── api
│   │   ├── __init__.py
│   │   ├── base
│   │   │   ├── __init__.py
│   │   │   ├── checks.py
│   │   │   ├── error_handler.py
│   │   │   ├── model.py
│   │   │   ├── model_components.py
│   │   │   ├── model_config.py
│   │   │   ├── parsers.py
│   │   │   ├── schema.py
│   │   │   └── types.py
│   │   ├── checks.py
│   │   ├── dataframe
│   │   │   ├── __init__.py
│   │   │   ├── components.py
│   │   │   ├── container.py
│   │   │   ├── model.py
│   │   │   ├── model_components.py
│   │   │   └── model_config.py
│   │   ├── extensions.py
│   │   ├── function_dispatch.py
│   │   ├── hypotheses.py
│   │   ├── pandas
│   │   │   ├── __init__.py
│   │   │   ├── array.py
│   │   │   ├── components.py
│   │   │   ├── container.py
│   │   │   ├── model.py
│   │   │   ├── model_config.py
│   │   │   └── types.py
│   │   ├── parsers.py
│   │   ├── polars
│   │   │   ├── __init__.py
│   │   │   ├── components.py
│   │   │   ├── container.py
│   │   │   ├── model.py
│   │   │   ├── model_config.py
│   │   │   ├── types.py
│   │   │   └── utils.py
│   │   └── pyspark
│   │       ├── __init__.py
│   │       ├── column_schema.py
│   │       ├── components.py
│   │       ├── container.py
│   │       ├── model.py
│   │       ├── model_components.py
│   │       ├── model_config.py
│   │       └── types.py
│   ├── backends
│   │   ├── __init__.py
│   │   ├── base
│   │   │   ├── __init__.py
│   │   │   ├── builtin_checks.py
│   │   │   └── builtin_hypotheses.py
│   │   ├── pandas
│   │   │   ├── __init__.py
│   │   │   ├── array.py
│   │   │   ├── base.py
│   │   │   ├── builtin_checks.py
│   │   │   ├── builtin_hypotheses.py
│   │   │   ├── checks.py
│   │   │   ├── components.py
│   │   │   ├── container.py
│   │   │   ├── error_formatters.py
│   │   │   ├── hypotheses.py
│   │   │   ├── parsers.py
│   │   │   └── register.py
│   │   ├── polars
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── builtin_checks.py
│   │   │   ├── checks.py
│   │   │   ├── components.py
│   │   │   ├── container.py
│   │   │   ├── error_formatters.py
│   │   │   └── register.py
│   │   ├── pyspark
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── builtin_checks.py
│   │   │   ├── checks.py
│   │   │   ├── column.py
│   │   │   ├── components.py
│   │   │   ├── container.py
│   │   │   ├── decorators.py
│   │   │   ├── error_formatters.py
│   │   │   ├── register.py
│   │   │   └── utils.py
│   │   └── utils.py
│   ├── config.py
│   ├── constants.py
│   ├── decorators.py
│   ├── dtypes.py
│   ├── engines
│   │   ├── __init__.py
│   │   ├── engine.py
│   │   ├── geopandas_engine.py
│   │   ├── numpy_engine.py
│   │   ├── pandas_engine.py
│   │   ├── polars_engine.py
│   │   ├── pyarrow_engine.py
│   │   ├── pyspark_engine.py
│   │   ├── type_aliases.py
│   │   └── utils.py
│   ├── errors.py
│   ├── extensions.py
│   ├── external_config.py
│   ├── import_utils.py
│   ├── inspection_utils.py
│   ├── io
│   │   ├── __init__.py
│   │   └── pandas_io.py
│   ├── mypy.py
│   ├── pandas.py
│   ├── polars.py
│   ├── py.typed
│   ├── pyspark.py
│   ├── schema_inference
│   │   ├── __init__.py
│   │   └── pandas.py
│   ├── schema_statistics
│   │   ├── __init__.py
│   │   └── pandas.py
│   ├── strategies
│   │   ├── __init__.py
│   │   ├── base_strategies.py
│   │   └── pandas_strategies.py
│   ├── system.py
│   ├── typing
│   │   ├── __init__.py
│   │   ├── common.py
│   │   ├── dask.py
│   │   ├── fastapi.py
│   │   ├── formats.py
│   │   ├── geopandas.py
│   │   ├── modin.py
│   │   ├── pandas.py
│   │   ├── polars.py
│   │   ├── pyspark.py
│   │   └── pyspark_sql.py
│   ├── utils.py
│   └── validation_depth.py
├── pyproject.toml
├── requirements.txt
├── scripts
│   └── generate_pip_deps_from_conda.py
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── base
    │   └── test_base_schema.py
    ├── conftest.py
    ├── dask
    │   ├── __init__.py
    │   ├── test_dask.py
    │   ├── test_dask_accessor.py
    │   └── test_dask_not_installed.py
    ├── fastapi
    │   ├── __init__.py
    │   ├── app.py
    │   ├── models.py
    │   └── test_app.py
    ├── geopandas
    │   ├── test_engine.py
    │   ├── test_from_to_format_conversions.py
    │   ├── test_geopandas.py
    │   └── test_pydantic.py
    ├── hypotheses
    │   ├── __init__.py
    │   └── test_hypotheses.py
    ├── io
    │   ├── __init__.py
    │   └── test_pandas_io.py
    ├── modin
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_logical_dtypes.py
    │   ├── test_modin_accessor.py
    │   └── test_schemas_on_modin.py
    ├── mypy
    │   ├── config
    │   │   ├── no_plugin.ini
    │   │   └── plugin_mypy.ini
    │   ├── pandas_modules
    │   │   ├── pandas_concat.py
    │   │   ├── pandas_dataframe.py
    │   │   ├── pandas_index.py
    │   │   ├── pandas_series.py
    │   │   ├── pandas_time.py
    │   │   ├── pandera_inheritance.py
    │   │   ├── pandera_types.py
    │   │   └── python_slice.py
    │   └── test_pandas_static_type_checking.py
    ├── pandas
    │   ├── __init__.py
    │   ├── checks_fixtures.py
    │   ├── conftest.py
    │   ├── modules
    │   │   ├── __init__.py
    │   │   └── validate_on_init.py
    │   ├── test__pandas_deprecated__test_model.py
    │   ├── test__pandas_deprecated__test_schemas.py
    │   ├── test_checks.py
    │   ├── test_checks_builtin.py
    │   ├── test_config.py
    │   ├── test_decorators.py
    │   ├── test_docs_setting_column_widths.py
    │   ├── test_dtypes.py
    │   ├── test_engine.py
    │   ├── test_engine_utils.py
    │   ├── test_errors.py
    │   ├── test_extension_modules.py
    │   ├── test_extensions.py
    │   ├── test_from_to_format_conversions.py
    │   ├── test_logical_dtypes.py
    │   ├── test_model.py
    │   ├── test_model_components.py
    │   ├── test_multithreaded.py
    │   ├── test_numpy_engine.py
    │   ├── test_pandas_accessor.py
    │   ├── test_pandas_config.py
    │   ├── test_pandas_engine.py
    │   ├── test_pandas_parallel.py
    │   ├── test_parsers.py
    │   ├── test_pydantic.py
    │   ├── test_pydantic_dtype.py
    │   ├── test_schema_components.py
    │   ├── test_schema_inference.py
    │   ├── test_schema_statistics.py
    │   ├── test_schemas.py
    │   ├── test_typing.py
    │   └── test_validation_depth.py
    ├── polars
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_polars_builtin_checks.py
    │   ├── test_polars_check.py
    │   ├── test_polars_components.py
    │   ├── test_polars_config.py
    │   ├── test_polars_container.py
    │   ├── test_polars_dataframe_generic.py
    │   ├── test_polars_decorators.py
    │   ├── test_polars_dtypes.py
    │   ├── test_polars_model.py
    │   ├── test_polars_parallel.py
    │   ├── test_polars_pydantic.py
    │   ├── test_polars_strategies.py
    │   └── test_polars_typing.py
    ├── pyspark
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_pyspark_accessor.py
    │   ├── test_pyspark_check.py
    │   ├── test_pyspark_config.py
    │   ├── test_pyspark_container.py
    │   ├── test_pyspark_decorators.py
    │   ├── test_pyspark_dtypes.py
    │   ├── test_pyspark_engine.py
    │   ├── test_pyspark_error.py
    │   ├── test_pyspark_model.py
    │   └── test_schemas_on_pyspark_pandas.py
    ├── strategies
    │   ├── __init__.py
    │   └── test_strategies.py
    └── test_inspection_utils.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source = pandera
3 |
4 | [report]
5 | exclude_lines =
6 | if self.debug:
7 | pragma: no cover
8 | raise NotImplementedError
9 | if __name__ == .__main__.:
10 | ignore_errors = True
11 | omit =
12 | tests/*
13 | pandera/mypy.py
14 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [cosmicBboy]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | - [ ] I have checked that this issue has not already been reported.
14 | - [ ] I have confirmed this bug exists on the latest version of pandera.
15 | - [ ] (optional) I have confirmed this bug exists on the main branch of pandera.
16 |
17 | **Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug.
18 |
19 | #### Code Sample, a copy-pastable example
20 |
21 | ```python
22 | # Your code here
23 |
24 | ```
25 |
26 | #### Expected behavior
27 | A clear and concise description of what you expected to happen.
28 |
29 | #### Desktop (please complete the following information):
30 |
31 | - OS: [e.g. iOS]
32 | - Browser: [e.g. chrome, safari]
33 | - Version: [e.g. 22]
34 |
35 | #### Screenshots
36 | If applicable, add screenshots to help explain your problem.
37 |
38 | #### Additional context
39 | Add any other context about the problem here.
40 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation-improvement.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Documentation Improvement
3 | about: Report wrong or missing documentation
4 | title: ''
5 | labels: docs
6 | assignees: ''
7 |
8 | ---
9 |
10 | #### Location of the documentation
11 |
12 | [this should provide the location of the documentation, e.g. "pandera.api.pandas.container.DataFrameSchema" or the URL of the documentation, e.g. "https://pandera.readthedocs.io/en/stable/dataframe_schemas.html#column-validation"]
13 |
14 | **Note**: You can check the latest versions of the docs on `master` [here](https://pandera.readthedocs.io/en/latest/).
15 |
16 | #### Documentation problem
17 |
18 | [this should provide a description of what documentation you believe needs to be fixed/improved]
19 |
20 | #### Suggested fix for documentation
21 |
22 | [this should explain the suggested fix and **why** it's better than the existing documentation]
23 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is.
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/submit-question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Submit Question
3 | about: Ask a general question about pandera
4 | title: ''
5 | labels: question
6 | assignees: ''
7 |
8 | ---
9 |
10 | #### Question about pandera
11 |
12 | **Note**: If you'd still like to submit a question, please read [this guide](
13 | https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question.
14 |
15 | ```python
16 | # Your code here, if applicable
17 |
18 | ```
19 |
--------------------------------------------------------------------------------
/.github/config.yml:
--------------------------------------------------------------------------------
1 | # Comment to be posted on PRs from first-time contributors in your repository
2 | newPRWelcomeComment: |
3 | Thank you for opening this pull request! 🙌
4 |
5 | These tips will help get your PR across the finish line:
6 |
7 | - If you haven't already, check out the [Contributing Guide](https://pandera.readthedocs.io/en/stable/CONTRIBUTING.html)
8 | - Sign off your commits (Reference: [DCO Guide](https://github.com/src-d/guide/blob/master/developer-community/fix-DCO.md)).
9 |
10 | # Comment to be posted to on pull requests merged by a first time user
11 | firstPRMergeComment: >
12 | Congrats on merging your first pull request! 🎉
13 |
14 | # Comment to be posted on first-time issues
15 | newIssueWelcomeComment: >
16 | Thank you for opening your first issue here! 🛠
17 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "github-actions"
4 | directory: "/"
5 | schedule:
6 | interval: "monthly"
7 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python Package
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | build_wheel_and_sdist:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v4
12 | with:
13 | fetch-depth: "0"
14 | - name: Set up Python
15 | uses: actions/setup-python@v5
16 | with:
17 | python-version: "3.x"
18 | - name: Install dependencies
19 | run: |
20 | python -m pip install --upgrade pip
21 | pip install build twine
22 | - name: Build wheel and sdist
23 | run: python -m build
24 | shell: bash
25 | - uses: actions/upload-artifact@v4
26 | with:
27 | name: pandera-artifact
28 | path: ./dist
29 |
30 | pypi-publish:
31 | name: Upload release to PyPI
32 | needs: [build_wheel_and_sdist]
33 | runs-on: ubuntu-latest
34 | permissions:
35 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
36 | environment: release
37 | steps:
38 | - uses: actions/download-artifact@v4
39 | with:
40 | name: pandera-artifact
41 | path: dist
42 | - run: ls dist
43 | - name: Publish package distributions to PyPI
44 | uses: pypa/gh-action-pypi-publish@release/v1
45 | with:
46 | attestations: false
47 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | pandera/_version.py
2 | uv.lock
3 | *.db
4 | .vscode
5 | dask-worker-space
6 | spark-warehouse
7 | docs/source/_contents
8 | docs/jupyter_execute
9 | **.DS_Store
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # Environments
95 | .env
96 | .venv
97 | env/
98 | venv/
99 | ENV/
100 | env.bak/
101 | venv.bak/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 |
116 | # Pycharm settings
117 | .idea
118 |
119 | # Airspeed Velocity Benchmarks
120 | /asv_bench/html/
121 | /asv_bench/results/
122 |
123 | # Docs
124 | docs/source/reference/generated
125 |
126 | # Nox
127 | .nox
128 | .nox-*
129 |
130 | # ignore markdown files copied from .github
131 | docs/source/CONTRIBUTING.md
132 | .aider*
133 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: (^asv_bench|setup.py|requirements-dev.txt)
2 |
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v4.1.0
6 | hooks:
7 | - id: check-ast
8 | description: Simply check whether files parse as valid python
9 | - id: check-case-conflict
10 | description: Check for files that would conflict in case-insensitive filesystems
11 | - id: check-merge-conflict
12 | description: Check for files that contain merge conflict strings
13 | - id: check-yaml
14 | description: Attempts to load all yaml files to verify syntax
15 | - id: debug-statements
16 | description: Check for debugger imports and py37+ breakpoint() calls in python source
17 | - id: end-of-file-fixer
18 | description: Makes sure files end in a newline and only a newline
19 | - id: trailing-whitespace
20 | description: Trims trailing whitespace
21 | - id: mixed-line-ending
22 | description: Replaces or checks mixed line ending
23 |
24 | - repo: https://github.com/pre-commit/mirrors-isort
25 | rev: v5.10.1
26 | hooks:
27 | - id: isort
28 | args: ["--line-length=79", "--skip=docs/source/conf.py", "--diff"]
29 |
30 | - repo: https://github.com/ikamensh/flynt
31 | rev: "0.76"
32 | hooks:
33 | - id: flynt
34 |
35 | - repo: https://github.com/psf/black
36 | rev: 24.4.2
37 | hooks:
38 | - id: black
39 |
40 | - repo: https://github.com/asottile/pyupgrade
41 | rev: v3.19.1
42 | hooks:
43 | - id: pyupgrade
44 | args: [--py38-plus, --keep-runtime-typing]
45 |
46 | - repo: https://github.com/pycqa/pylint
47 | rev: v3.3.6
48 | hooks:
49 | - id: pylint
50 | args: ["--disable=import-error"]
51 | exclude: (^docs/|^scripts)
52 |
53 | - repo: https://github.com/pre-commit/mirrors-mypy
54 | rev: v1.10.0
55 | hooks:
56 | - id: mypy
57 | additional_dependencies:
58 | - types-click
59 | - types-pytz
60 | - types-pyyaml
61 | - types-requests
62 | - types-setuptools
63 | - polars
64 | args: ["pandera", "tests", "scripts"]
65 | exclude: (^docs/|^tests/mypy/modules/)
66 | pass_filenames: false
67 | require_serial: true
68 | verbose: true
69 |
70 | - repo: https://github.com/codespell-project/codespell
71 | rev: v2.4.1
72 | hooks:
73 | - id: codespell
74 | additional_dependencies:
75 | - tomli
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [BASIC]
2 | ignore=mypy.py,noxfile.py,pandera/accessors/pyspark_sql_accessor.py,pandera/engines/pyspark_engine.py,pandera/pyspark.py,pandera/typing/pyspark_sql.py,
3 | ignore-patterns=pandera/api/pyspark/*,tests/pyspark/*
4 | good-names=
5 | T,
6 | F,
7 | logger,
8 | df,
9 | fn,
10 | i,
11 | e,
12 | x,
13 | f,
14 | k,
15 | v,
16 | fp,
17 | bar,
18 | eq,
19 | ne,
20 | gt,
21 | ge,
22 | lt,
23 | le,
24 | dt,
25 | tz,
26 | TBaseModel,
27 | TArraySchemaBase,
28 | TDataFrameModel,
29 | _DataType
30 |
31 | [MESSAGES CONTROL]
32 | disable=
33 | # C0330 conflicts with black: https://github.com/psf/black/issues/48
34 | R0913,
35 | duplicate-code,
36 | too-many-instance-attributes,
37 | no-else-return,
38 | inconsistent-return-statements,
39 | protected-access,
40 | too-many-ancestors,
41 | too-many-lines,
42 | too-few-public-methods,
43 | line-too-long,
44 | ungrouped-imports,
45 | function-redefined,
46 | arguments-differ,
47 | unnecessary-dunder-call,
48 | use-dict-literal,
49 | invalid-name,
50 | import-outside-toplevel,
51 | missing-class-docstring,
52 | missing-function-docstring,
53 | fixme,
54 | too-many-locals,
55 | redefined-outer-name,
56 | logging-fstring-interpolation,
57 | multiple-statements,
58 | cyclic-import,
59 | too-many-positional-arguments,
60 | too-many-function-args,
61 | # Due to custom `immutable` decorator replacing `dataclasses.dataclass`
62 | invalid-field-call
63 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | build:
9 | os: ubuntu-24.04
10 | apt_packages:
11 | # Install OpenJDK as Java backend to run PySpark examples.
12 | - openjdk-11-jre-headless
13 | tools:
14 | python: "3.11"
15 | jobs:
16 | post_install:
17 | - pip install uv
18 | - UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --all-extras --all-groups --link-mode=copy
19 |
20 | sphinx:
21 | configuration: docs/source/conf.py
22 |
23 |
24 | # Optionally build your docs in additional formats such as PDF and ePub
25 | formats: []
26 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Niels Bantilan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: tests clean clean-pyc upload-pypi-test upload-pypi requirements docs \
2 | code-cov docs-clean requirements-dev.txt
3 |
4 | clean:
5 | python setup.py clean
6 |
7 | clean-pyc:
8 | find . -name '*.pyc' -exec rm {} \;
9 |
10 | upload-pypi-test:
11 | python setup.py sdist bdist_wheel && \
12 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* && \
13 | rm -rf dist
14 |
15 | upload-pypi:
16 | python setup.py sdist bdist_wheel && \
17 | twine upload dist/* && \
18 | rm -rf dist
19 |
20 | .PHONY: install-uv
21 | install-uv:
22 | pip install uv
23 |
24 | setup: install-uv
25 | uv sync --all-extras
26 |
27 | setup-macos: install-uv
28 | uv sync --all-extras
29 | uv pip install polars-lts-cpu
30 |
31 | docs-clean:
32 | rm -rf docs/source/reference/generated docs/**/generated docs/**/methods docs/_build docs/source/_contents
33 |
34 | docs: docs-clean
35 | python -m sphinx -W -E "docs/source" "docs/_build" && make -C docs doctest
36 |
37 | quick-docs:
38 | python -m sphinx -E "docs/source" "docs/_build" && make -C docs doctest
39 |
40 | code-cov:
41 | pytest --cov-report=html --cov=pandera tests/
42 |
43 | NOX_FLAGS ?= "-r"
44 |
45 | deps-from-environment.yml:
46 | python scripts/generate_pip_deps_from_conda.py
47 |
48 | unit-tests:
49 | pytest tests/core
50 |
51 | nox-tests:
52 | nox -db uv -s tests ${NOX_FLAGS}
53 |
--------------------------------------------------------------------------------
/asv_bench/README.md:
--------------------------------------------------------------------------------
1 | # Airspeed Velocity
2 |
3 | `pandera`'s performance benchmarks over time can be [viewed on this airspeed-velocity dashboard](https://pandera-dev.github.io/pandera-asv-logs/).
4 |
5 | The [config](https://github.com/pandera-dev/pandera-asv-logs/tree/master/asv_bench/asv.conf.json) and [results](https://github.com/pandera-dev/pandera-asv-logs/tree/master/results) files are tracked in the [pandera-asv-logs](https://github.com/pandera-dev/pandera-asv-logs) repo to keep build artifacts out of the main repo.
6 |
7 | The [benchmarks](https://github.com/pandera-dev/pandera/tree/master/benchmarks/) are tracked in the main [pandera repo](https://github.com/pandera-dev/pandera).
8 |
9 | ## Running `asv`
10 |
11 | Ensure both the `pandera` and `pandera-asv-logs` repos are checked out to the same parent directory.
12 |
13 | From the `pandera-asv-logs` repo, run:
14 | ```
15 | asv run ALL --config asv_bench/asv.conf.json
16 | ```
17 |
18 | ## Publishing results
19 |
20 | To build the html and preview the results:
21 | ```
22 | asv publish --config asv_bench/asv.conf.json
23 | asv preview --config asv_bench/asv.conf.json
24 | ```
25 |
26 | The `.json` results files are committed or PR'd into the master branch of `pandera-asv-logs`.
27 |
28 | The published html is pushed directly to the gh-pages branch of `pandera-asv-logs` by running:
29 |
30 | ```
31 | asv gh-pages --rewrite --config asv_bench/asv.conf.json
32 | ```
33 |
34 | The `--rewrite` flag overwrites the existing `gh-pages`, avoiding duplication of data.
35 |
36 | The `asv` docs are [here](https://asv.readthedocs.io/en/stable/index.html).
37 |
--------------------------------------------------------------------------------
/asv_bench/benchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/asv_bench/benchmarks/__init__.py
--------------------------------------------------------------------------------
/asv_bench/benchmarks/dataframe_schema.py:
--------------------------------------------------------------------------------
1 | # Airspeed Velocity Benchmarks for pandera
2 | import pandas as pd
3 |
4 | from pandera.pandas import (
5 | Column,
6 | DataFrameSchema,
7 | Bool,
8 | Category,
9 | Check,
10 | DateTime,
11 | Float,
12 | Int,
13 | Object,
14 | String,
15 | Timedelta,
16 | check_input,
17 | check_output,
18 | )
19 |
20 |
21 | class Validate:
22 | """
23 | Benchmarking schema.validate
24 | """
25 |
26 | def setup(self):
27 | self.schema = DataFrameSchema(
28 | {
29 | "a": Column(Int),
30 | "b": Column(Float),
31 | "c": Column(String),
32 | "d": Column(Bool),
33 | "e": Column(Category),
34 | "f": Column(Object),
35 | "g": Column(DateTime),
36 | "i": Column(Timedelta),
37 | },
38 | )
39 | self.df = pd.DataFrame(
40 | {
41 | "a": [1, 2, 3],
42 | "b": [1.1, 2.5, 9.9],
43 | "c": ["z", "y", "x"],
44 | "d": [True, True, False],
45 | "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
46 | "f": [(3,), (2,), (1,)],
47 | "g": [
48 | pd.Timestamp("2015-02-01"),
49 | pd.Timestamp("2015-02-02"),
50 | pd.Timestamp("2015-02-03"),
51 | ],
52 | "i": [
53 | pd.Timedelta(1, unit="D"),
54 | pd.Timedelta(5, unit="D"),
55 | pd.Timedelta(9, unit="D"),
56 | ],
57 | }
58 | )
59 |
60 | def time_df_schema(self):
61 | self.schema.validate(self.df)
62 |
63 | def mem_df_schema(self):
64 | self.schema.validate(self.df)
65 |
66 | def peakmem_df_schema(self):
67 | self.schema.validate(self.df)
68 |
69 |
70 | class Decorators:
71 | """
72 | Benchmarking input and output decorator performance.
73 | """
74 |
75 | def transformer(df):
76 | return df.assign(column2=[1, 2, 3])
77 |
78 | def setup(self):
79 | self.in_schema = DataFrameSchema({"column1": Column(String)})
80 | self.out_schema = DataFrameSchema({"column2": Column(Int)})
81 | self.df = pd.DataFrame({"column1": ["a", "b", "c"]})
82 |
83 | def time_check_input(self):
84 | @check_input(self.in_schema)
85 | def transform_first_arg(self):
86 | return Decorators.transformer(self.df)
87 |
88 | def mem_check_input(self):
89 | @check_input(self.in_schema)
90 | def transform_first_arg(self):
91 | return Decorators.transformer(self.df)
92 |
93 | def peakmem_check_input(self):
94 | @check_input(self.in_schema)
95 | def transform_first_arg(self):
96 | return Decorators.transformer(self.df)
97 |
98 | def time_check_output(self):
99 | @check_output(self.out_schema)
100 | def transform_first_arg(self):
101 | return Decorators.transformer(self.df)
102 |
103 | def mem_check_output(self):
104 | @check_output(self.out_schema)
105 | def transform_first_arg(self):
106 | return Decorators.transformer(self.df)
107 |
108 | def peakmem_check_output(self):
109 | @check_output(self.out_schema)
110 | def transform_first_arg(self):
111 | return Decorators.transformer(self.df)
112 |
--------------------------------------------------------------------------------
/asv_bench/benchmarks/series_schema.py:
--------------------------------------------------------------------------------
1 | # Airspeed Velocity Benchmarks for pandera
2 | import pandas as pd
3 |
4 | from pandera.pandas import (
5 | Column,
6 | DataFrameSchema,
7 | SeriesSchema,
8 | Bool,
9 | Category,
10 | Check,
11 | DateTime,
12 | Float,
13 | Int,
14 | Object,
15 | String,
16 | Timedelta,
17 | String,
18 | )
19 |
20 |
21 | class Validate:
22 | """
23 | Benchmarking Series schema.validate
24 | """
25 |
26 | def setup(self):
27 | self.schema = SeriesSchema(
28 | String,
29 | checks=[
30 | Check(lambda s: s.str.startswith("foo")),
31 | Check(lambda s: s.str.endswith("bar")),
32 | Check(lambda x: len(x) > 3, element_wise=True),
33 | ],
34 | nullable=False,
35 | unique=False,
36 | name="my_series",
37 | )
38 | self.series = pd.Series(["foobar", "foobar", "foobar"], name="my_series")
39 |
40 | def time_series_schema(self):
41 | self.schema.validate(self.series)
42 |
43 | def mem_series_schema(self):
44 | self.schema.validate(self.series)
45 |
46 | def peakmem_series_schema(self):
47 | self.schema.validate(self.series)
48 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/_static/custom.js:
--------------------------------------------------------------------------------
1 | // Add event listener for DOMContentLoaded event
2 | window.addEventListener("DOMContentLoaded", function() {
3 | // Select all elements with class "external"
4 | var externalLinks = document.querySelectorAll("a.external");
5 |
6 | // Loop through each element with class "external"
7 | externalLinks.forEach(function(link) {
8 | // Set the target attribute to "_blank"
9 | link.setAttribute("target", "_blank");
10 | });
11 | });
12 |
13 |
14 | function setHtmlDataTheme() {
15 | // Set theme at the root html element
16 | setTimeout(() => {
17 | const theme = document.body.dataset.theme;
18 | const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
19 |
20 | if (theme === "auto") {
21 | document.documentElement.dataset.theme = prefersDark ? "dark" : "light";
22 | } else {
23 | document.documentElement.dataset.theme = theme;
24 | }
25 | }, 10)
26 | }
27 |
28 | function setupAlgoliaTheme() {
29 | // To get darkmode in the algolia search modal, we need to set the theme in
30 | // the root html element. This function propagates the theme set by furo
31 | // that's set in the body element.
32 | const buttons = document.getElementsByClassName("theme-toggle");
33 |
34 | // set for initial document load
35 | setHtmlDataTheme();
36 |
37 | // listen for when theme button is clicked.
38 | Array.from(buttons).forEach((btn) => {
39 | btn.addEventListener("click", setHtmlDataTheme);
40 | });
41 | }
42 |
43 | function main() {
44 | setupAlgoliaTheme()
45 | }
46 |
47 | document.addEventListener('DOMContentLoaded', main);
48 | window.addEventListener('keydown', (event) => {
49 | if (event.code === "Escape") {
50 | // make sure to prevent default behavior with escape key so that algolia
51 | // modal can be closed properly.
52 | event.preventDefault();
53 | }
54 | });
55 |
--------------------------------------------------------------------------------
/docs/source/_static/docsearch_config.js_t:
--------------------------------------------------------------------------------
1 | docsearch({
2 | container: "{{ docsearch_container|default('#docsearch') }}",
3 | appId: "{{ docsearch_app_id }}",
4 | apiKey: "{{ docsearch_api_key }}",
5 | indexName: "{{ docsearch_index_name }}",
6 | {%- if docsearch_search_parameters %}
7 | searchParameters: {
8 | {% for key, value in docsearch_search_parameters.items() %}
9 | {{ key }}: {% if value is string %}"{{ value }}"{% else %}{{ value }}{% endif %}{% if not loop.last %},{% endif %}
10 | {% endfor %}
11 | }
12 | {%- endif %}
13 | });
14 |
--------------------------------------------------------------------------------
/docs/source/_static/pandera-banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-banner.png
--------------------------------------------------------------------------------
/docs/source/_static/pandera-favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-favicon.png
--------------------------------------------------------------------------------
/docs/source/_static/pandera-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-logo.png
--------------------------------------------------------------------------------
/docs/source/_templates/class.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 |
3 | .. currentmodule:: {{ module }}
4 |
5 | .. autoclass:: {{ objname }}
6 |
7 | {% block attributes %}
8 | {% if attributes %}
9 | .. rubric:: Attributes
10 |
11 | .. autosummary::
12 | :nosignatures:
13 |
14 | {% for item in attributes %}
15 | ~{{ name }}.{{ item }}
16 | {%- endfor %}
17 |
18 | {% endif %}
19 | {% endblock %}
20 |
21 | {% block methods %}
22 | {% if methods %}
23 | .. rubric:: Methods
24 |
25 | {% for item in methods %}
26 | {%- if item not in inherited_members %}
27 | .. automethod:: {{ item }}
28 | {% endif %}
29 | {%- endfor %}
30 |
31 | {% endif %}
32 |
33 | {%- if members and '__call__' in members %}
34 | .. automethod:: __call__
35 | {%- endif %}
36 |
37 | {% endblock %}
38 |
--------------------------------------------------------------------------------
/docs/source/_templates/dtype.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 |
3 | .. currentmodule:: {{ module }}
4 |
5 | .. autoclass:: {{ objname }}
6 |
7 | {% block attributes %}
8 | {% if attributes %}
9 | .. rubric:: Attributes
10 |
11 | .. autosummary::
12 | :nosignatures:
13 |
14 | {% for item in attributes %}
15 | ~{{ name }}.{{ item }}
16 | {%- endfor %}
17 |
18 | {% endif %}
19 | {% endblock %}
20 |
21 | {% block methods %}
22 | {% if methods %}
23 | .. rubric:: Methods
24 |
25 | {% for item in methods %}
26 | .. automethod:: {{ item }}
27 | {%- endfor %}
28 |
29 | {%- if members and '__call__' in members %}
30 | .. automethod:: __call__
31 | {%- endif %}
32 |
33 | {%- endif %}
34 | {% endblock %}
35 |
--------------------------------------------------------------------------------
/docs/source/_templates/model_component_class.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 |
3 | .. currentmodule:: {{ module }}
4 |
5 | .. autoclass:: {{ objname }}
6 | :show-inheritance:
7 | :exclude-members:
8 |
9 | {% block attributes %}
10 | {% if attributes %}
11 | .. rubric:: Attributes
12 |
13 | .. autosummary::
14 | :nosignatures:
15 |
16 | {% for item in attributes %}
17 | ~{{ name }}.{{ item }}
18 | {%- endfor %}
19 |
20 | {% endif %}
21 | {% endblock %}
22 |
--------------------------------------------------------------------------------
/docs/source/_templates/module.rst:
--------------------------------------------------------------------------------
1 | .. empty
2 |
3 | {{ fullname | escape | underline }}
4 |
5 | .. currentmodule:: {{ fullname }}
6 |
7 | .. automodule:: {{ fullname }}
8 |
9 | {% block classes %}
10 |
11 | {% for item in classes %}
12 | .. autoclass:: {{ item }}
13 | :members:
14 | :member-order: bysource
15 | :show-inheritance:
16 | :exclude-members:
17 | {%- endfor %}
18 |
19 | {% endblock %}
20 |
21 | {% block functions %}
22 |
23 | {% for item in functions %}
24 | .. autofunction:: {{ item }}
25 | {%- endfor %}
26 |
27 | {% endblock %}
28 |
--------------------------------------------------------------------------------
/docs/source/_templates/page.html:
--------------------------------------------------------------------------------
1 | {% extends "!page.html" %}
2 |
3 | {% block body -%}
4 | {{ super() }}
5 |
6 |
11 |
12 | {%- endblock %}
13 |
--------------------------------------------------------------------------------
/docs/source/_templates/sidebar/search.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/docs/source/_templates/strategies_module.rst:
--------------------------------------------------------------------------------
1 | .. empty
2 |
3 | {{ fullname | escape | underline }}
4 |
5 | .. currentmodule:: {{ fullname }}
6 |
7 | .. automodule:: {{ fullname }}
8 |
9 | {% block functions %}
10 |
11 | {% for item in functions %}
12 | {% if item not in ["null_dataframe_masks", "null_field_masks", "set_pandas_index", "strategy_import_error"] %}
13 | .. autofunction:: {{ item }}
14 | {% endif %}
15 | {%- endfor %}
16 |
17 | {% endblock %}
18 |
--------------------------------------------------------------------------------
/docs/source/configuration.md:
--------------------------------------------------------------------------------
1 | (configuration)=
2 |
3 | # Configuration
4 |
5 | *New in version 0.17.3*
6 |
7 | `pandera` provides a global config, `~pandera.config.PanderaConfig`, which is
8 | available as `pandera.config.CONFIG`. In custom code it can be modified
9 | temporarily with the `~pandera.config.config_context` context manager and
10 | fetched with `~pandera.config.get_config_context`.
11 |
12 | This configuration can also be set using environment variables.
13 |
14 | ## Validation depth
15 |
16 | Validation depth determines whether pandera runs schema-level validations only
17 | (column names and datatypes), data-level validations only (checks on actual values),
18 | or both:
19 |
20 | ```bash
21 | export PANDERA_VALIDATION_ENABLED=False
22 | export PANDERA_VALIDATION_DEPTH=DATA_ONLY # SCHEMA_AND_DATA, SCHEMA_ONLY, DATA_ONLY
23 | ```
24 |
25 | ## Enabling/disabling validation
26 |
27 | Runtime data validation incurs a performance overhead. To mitigate this in the
28 | appropriate contexts, you have the option to disable validation globally.
29 |
30 | This can be achieved by setting the environment variable
31 | `PANDERA_VALIDATION_ENABLED=False`. When validation is disabled, any
32 | `validate` call will not actually run any validation checks.
33 |
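Validation can also be toggled programmatically, e.g. inside a test or a
performance-critical code path (a sketch, assuming the `validation_enabled`
keyword argument of `config_context`):

```python
import pandas as pd
import pandera.pandas as pa
from pandera.config import config_context

schema = pa.DataFrameSchema({"col": pa.Column(int, pa.Check.ge(0))})
bad_df = pd.DataFrame({"col": [-1]})

with config_context(validation_enabled=False):
    # no checks run here, so no SchemaError is raised for the invalid value
    schema.validate(bad_df)
```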
--------------------------------------------------------------------------------
/docs/source/dask.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | ```{currentmodule} pandera
6 | ```
7 |
8 | (scaling-dask)=
9 |
10 | # Data Validation with Dask
11 |
12 | *new in 0.8.0*
13 |
14 | [Dask](https://docs.dask.org/en/latest/dataframe.html) is a distributed
15 | compute framework that offers a pandas-like dataframe API.
16 | You can use pandera to validate {py:func}`~dask.dataframe.DataFrame`
17 | and {py:func}`~dask.dataframe.Series` objects directly. First, install
18 | `pandera` with the `dask` extra:
19 |
20 | ```bash
21 | pip install 'pandera[dask]'
22 | ```
23 |
24 | Then you can use pandera schemas to validate dask dataframes. In the example
25 | below we'll use the {ref}`class-based API ` to define a
26 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation.
27 |
28 | ```{code-cell} python
29 | import dask.dataframe as dd
30 | import pandas as pd
31 | import pandera.pandas as pa
32 |
33 | from pandera.typing.dask import DataFrame, Series
34 |
35 |
36 | class Schema(pa.DataFrameModel):
37 | state: Series[str]
38 | city: Series[str]
39 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})
40 |
41 |
42 | ddf = dd.from_pandas(
43 | pd.DataFrame(
44 | {
45 | 'state': ['FL','FL','FL','CA','CA','CA'],
46 | 'city': [
47 | 'Orlando',
48 | 'Miami',
49 | 'Tampa',
50 | 'San Francisco',
51 | 'Los Angeles',
52 | 'San Diego',
53 | ],
54 | 'price': [8, 12, 10, 16, 20, 18],
55 | }
56 | ),
57 | npartitions=2
58 | )
59 | pandera_ddf = Schema(ddf)
60 | pandera_ddf
61 | ```
62 |
63 | As you can see, passing the dask dataframe into `Schema` will produce
64 | another dask dataframe which hasn't been evaluated yet. What this means is
65 | that pandera will only validate when the dask graph is evaluated.
66 |
67 | ```{code-cell} python
68 | pandera_ddf.compute()
69 | ```
70 |
71 | You can also use the {py:func}`~pandera.check_types` decorator to validate
72 | dask dataframes at runtime:
73 |
74 | ```{code-cell} python
75 | @pa.check_types
76 | def function(ddf: DataFrame[Schema]) -> DataFrame[Schema]:
77 | return ddf[ddf["state"] == "CA"]
78 |
79 | function(ddf).compute()
80 | ```
81 |
82 | And of course, you can use the object-based API to validate dask dataframes:
83 |
84 | ```{code-cell} python
85 | schema = pa.DataFrameSchema({
86 | "state": pa.Column(str),
87 | "city": pa.Column(str),
88 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20))
89 | })
90 | schema(ddf).compute()
91 | ```
92 |
--------------------------------------------------------------------------------
/docs/source/drop_invalid_rows.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | ```{currentmodule} pandera
6 | ```
7 |
8 | (drop-invalid-rows)=
9 |
10 | # Dropping Invalid Rows
11 |
12 | *New in version 0.16.0*
13 |
14 | If you wish to use the validation step to remove invalid data, you can pass the
15 | `drop_invalid_rows=True` argument to the `schema` object on creation. On `schema.validate()`,
16 | if a data-level check fails, the rows that caused the failure will be removed from the
17 | dataframe that is returned.
18 |
19 | `drop_invalid_rows` will prevent data-level schema errors from being raised and will instead
20 | remove the rows that cause the failures.
21 |
22 | This functionality is available on `DataFrameSchema`, `SeriesSchema`, `Column`,
23 | as well as `DataFrameModel` schemas.
24 |
25 | **Note** that this functionality works by identifying the index or multi-index of the failing rows.
26 | If the index is not unique on the dataframe, this could result in incorrect rows being dropped.
27 |
28 | Dropping invalid rows with {class}`~pandera.api.pandas.container.DataFrameSchema`:
29 |
30 | ```{code-cell} python
31 | import pandas as pd
32 | import pandera.pandas as pa
33 |
34 |
35 | df = pd.DataFrame({"counter": [1, 2, 3]})
36 | schema = pa.DataFrameSchema(
37 | {"counter": pa.Column(int, checks=[pa.Check(lambda x: x >= 3)])},
38 | drop_invalid_rows=True,
39 | )
40 |
41 | schema.validate(df, lazy=True)
42 | ```
43 |
44 | Dropping invalid rows with {class}`~pandera.api.pandas.array.SeriesSchema`:
45 |
46 | ```{code-cell} python
47 | import pandas as pd
48 | import pandera.pandas as pa
49 |
50 |
51 | series = pd.Series([1, 2, 3])
52 | schema = pa.SeriesSchema(
53 | int,
54 | checks=[pa.Check(lambda x: x >= 3)],
55 | drop_invalid_rows=True,
56 | )
57 |
58 | schema.validate(series, lazy=True)
59 | ```
60 |
61 | Dropping invalid rows with {class}`~pandera.api.pandas.components.Column`:
62 |
63 | ```{code-cell} python
64 | import pandas as pd
65 | import pandera.pandas as pa
66 |
67 |
68 | df = pd.DataFrame({"counter": [1, 2, 3]})
69 | schema = pa.Column(
70 | int,
71 | name="counter",
72 | drop_invalid_rows=True,
73 | checks=[pa.Check(lambda x: x >= 3)]
74 | )
75 |
76 | schema.validate(df, lazy=True)
77 | ```
78 |
79 | Dropping invalid rows with {class}`~pandera.api.pandas.model.DataFrameModel`:
80 |
81 | ```{code-cell} python
82 | import pandas as pd
83 | import pandera.pandas as pa
84 |
85 |
86 | class MySchema(pa.DataFrameModel):
87 | counter: int = pa.Field(in_range={"min_value": 3, "max_value": 5})
88 |
89 | class Config:
90 | drop_invalid_rows = True
91 |
92 |
93 | MySchema.validate(
94 | pd.DataFrame({"counter": [1, 2, 3, 4, 5, 6]}), lazy=True
95 | )
96 | ```
97 |
98 | ```{note}
99 | In order to use `drop_invalid_rows=True`, `lazy=True` must
100 | be passed to `schema.validate()`. {ref}`lazy-validation` enables all schema
101 | errors to be collected and raised together, meaning all invalid rows can be dropped together.
102 | This provides a clear API for ensuring the validated dataframe contains only valid data.
103 | ```
104 |
--------------------------------------------------------------------------------
/docs/source/error_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | (error-report)=
6 |
7 | # Error Reports
8 |
9 | *new in 0.19.0*
10 |
11 | The pandera error report is a generalised machine-readable summary of failures
12 | which occurred during schema validation. It is available for both `pyspark.sql` and
13 | `pandas` objects.
14 |
15 | By default, error reports are generated for both schema and data level validation,
16 | but more granular control over schema or data only validations is available.
17 |
18 | This is achieved by introducing configurable settings using environment variables
19 | that allow you to control execution at three different levels:
20 |
21 | 1. `SCHEMA_ONLY`: perform schema validations only. It checks that data conforms
22 | to the schema definition, but does not perform any data-level validations on the dataframe.
23 | 2. `DATA_ONLY`: perform data-level validations only. It validates that data
24 | conforms to the defined `checks`, but does not validate the schema.
25 | 3. `SCHEMA_AND_DATA`: (**default**) perform both schema- and data-level
26 | validations. It runs the most exhaustive validation and can be compute-intensive.
27 |
28 | You can override the default behaviour by setting an environment variable from the terminal
29 | before running the `pandera` process:
30 |
31 | ```bash
32 | export PANDERA_VALIDATION_DEPTH=SCHEMA_ONLY
33 | ```
34 |
35 | This will be picked up by `pandera` to enforce only schema-level validations.
36 |
37 | ## Error reports with `pandas`
38 |
39 | To create an error report with pandas, you must specify `lazy=True` to allow all errors
40 | to be aggregated and raised together as a `SchemaErrors`.
41 |
42 | ```{code-cell} python
43 | import pandas as pd
44 | import pandera.pandas as pa
45 | import json
46 |
47 | pandas_schema = pa.DataFrameSchema(
48 | {
49 | "color": pa.Column(str, pa.Check.isin(["red", "green", "blue"])),
50 | "length": pa.Column(int, pa.Check.gt(10)),
51 | }
52 | )
53 | data = [("red", 4), ("blue", 11), ("purple", 15), ("green", 39)]
54 |
55 | df = pd.DataFrame(
56 | {
57 | "color": ["red", "blue", "purple", "green"],
58 | "length": [4, 11, 15, 39],
59 | }
60 | )
61 |
62 | try:
63 | pandas_schema.validate(df, lazy=True)
64 | except pa.errors.SchemaErrors as e:
65 | print(json.dumps(e.message, indent=2))
66 | ```
67 |
68 | ## Error reports with `pyspark.sql`
69 |
70 | Accessing the error report on a validated `pyspark` dataframe can be done via the
71 | `errors` attribute on the `pandera` accessor.
72 |
73 | ```{code-cell} python
74 | import pandera.pyspark as pa
75 | import pyspark.sql.types as T
76 | import json
77 |
78 | from decimal import Decimal
79 | from pyspark.sql import SparkSession
80 | from pandera.pyspark import DataFrameModel
81 |
82 | spark = SparkSession.builder.getOrCreate()
83 |
84 | class PysparkPanderSchema(DataFrameModel):
85 | color: T.StringType() = pa.Field(isin=["red", "green", "blue"])
86 | length: T.IntegerType() = pa.Field(gt=10)
87 |
88 | data = [("red", 4), ("blue", 11), ("purple", 15), ("green", 39)]
89 |
90 | spark_schema = T.StructType(
91 | [
92 | T.StructField("color", T.StringType(), False),
93 | T.StructField("length", T.IntegerType(), False),
94 | ],
95 | )
96 |
97 | df = spark.createDataFrame(data, spark_schema)
98 | df_out = PysparkPanderSchema.validate(check_obj=df)
99 |
100 | print(json.dumps(dict(df_out.pandera.errors), indent=4))
101 | ```
102 |
--------------------------------------------------------------------------------
/docs/source/fastapi.md:
--------------------------------------------------------------------------------
1 | ```{eval-rst}
2 | .. currentmodule:: pandera
3 | ```
4 |
5 | (fastapi-integration)=
6 |
7 | # FastAPI
8 |
9 | *new in 0.9.0*
10 |
11 | Since both FastAPI and Pandera integrate seamlessly with Pydantic, you can
12 | use {py:class}`~pandera.api.pandas.model.DataFrameModel` types to validate the
13 | incoming and outgoing data of your API endpoints.
14 |
15 | ## Using DataFrameModels to Validate Endpoint Inputs and Outputs
16 |
17 | Suppose we want to process transactions, where each transaction has an
18 | `id` and `cost`. We can model this with a pandera dataframe model:
19 |
20 | ```{literalinclude} ../../tests/fastapi/models.py
21 | :language: python
22 | :lines: 1-14
23 | ```
24 |
25 | Also suppose that we expect our endpoint to add a `name` to the transaction
26 | data:
27 |
28 | ```{literalinclude} ../../tests/fastapi/models.py
29 | :language: python
30 | :lines: 22-25
31 | ```
32 |
33 | Let's also assume that the output of the endpoint should be a list of dictionary
34 | records containing the named transactions data. We can do this easily with the
35 | `to_format` option in the dataframe model {py:class}`~pandera.typing.config.BaseConfig`.
36 |
37 | ```{literalinclude} ../../tests/fastapi/models.py
38 | :language: python
39 | :lines: 34-37
40 | ```
41 |
42 | Note that `to_format_kwargs` is a dictionary of keyword arguments
43 | to be passed into the respective pandas `to_{format}` method.
44 |
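For reference, a representative sketch of such a model (the field names and
config values are illustrative, not necessarily those in `tests/fastapi/models.py`):

```python
import pandera.pandas as pa
from pandera.typing import Series


class TransactionsDictOut(pa.DataFrameModel):
    id: Series[int]
    cost: Series[float]
    name: Series[str]

    class Config:
        # serialize the validated dataframe to a list of record dicts
        to_format = "dict"
        to_format_kwargs = {"orient": "records"}
```
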
45 | % TODO: create new page for the to/from_format config option
46 |
47 | Next we'll create a FastAPI app and define a `/transactions/` POST endpoint:
48 |
49 | ```{literalinclude} ../../tests/fastapi/app.py
50 | :language: python
51 | :lines: 2-6,14-21,28-34
52 | ```
53 |
54 | ## Reading File Uploads
55 |
56 | Similar to the `TransactionsDictOut` example to convert dataframes to a
57 | particular format as an endpoint response, pandera also provides a
58 | `from_format` dataframe model configuration option to read a dataframe from
59 | a particular serialization format.
60 |
61 | ```{literalinclude} ../../tests/fastapi/models.py
62 | :language: python
63 | :lines: 17-19
64 | ```
65 |
66 | Let's also define a response model for the `/file/` upload endpoint:
67 |
68 | ```{literalinclude} ../../tests/fastapi/models.py
69 | :language: python
70 | :lines: 28-32,46-48
71 | ```
72 |
73 | In the next example, we use the pandera
74 | {py:class}`~pandera.typing.fastapi.UploadFile` type to upload a parquet file
75 | to the `/file/` POST endpoint and return a response containing the filename
76 | and the modified data in json format.
77 |
78 | ```{literalinclude} ../../tests/fastapi/app.py
79 | :language: python
80 | :lines: 37-44
81 | ```
82 |
83 | Pandera's {py:class}`~pandera.typing.fastapi.UploadFile` type is a subclass of FastAPI's
84 | [UploadFile](https://fastapi.tiangolo.com/tutorial/request-files/?h=uploadfile#uploadfile)
85 | but it exposes a `.data` property containing the pandera-validated dataframe.
86 |
87 | ## Takeaway
88 |
89 | With the FastAPI and Pandera integration, you can use Pandera
90 | {py:class}`~pandera.api.pandas.model.DataFrameModel` types to validate the dataframe inputs
91 | and outputs of your FastAPI endpoints.
92 |
--------------------------------------------------------------------------------
/docs/source/frictionless.md:
--------------------------------------------------------------------------------
1 | ```{eval-rst}
2 | .. currentmodule:: pandera
3 | ```
4 |
5 | (frictionless-integration)=
6 |
7 | # Reading Third-Party Schema
8 |
9 | *new in 0.7.0*
10 |
11 | Pandera now accepts schemas from other data validation frameworks. This requires
12 | a pandera installation with the `io` extension; please see the
13 | {ref}`installation` instructions for more details.
14 |
15 | ## Frictionless Data Schema
16 |
17 | :::{note}
18 | Please see the
19 | [Frictionless schema](https://specs.frictionlessdata.io/table-schema/)
20 | documentation for more information on this standard.
21 | :::
22 |
23 | ```{eval-rst}
24 | .. autofunction:: pandera.io.from_frictionless_schema
25 | ```
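
For example, here is a minimal sketch of converting a frictionless table
schema, supplied as a dictionary, into a pandera schema (the field definition
and constraints are illustrative):

```python
# A minimal sketch, assuming a frictionless table schema given as a dict.
from pandera.io import from_frictionless_schema

frictionless_schema = {
    "fields": [
        {
            "name": "column_1",
            "type": "integer",
            "constraints": {"minimum": 10, "maximum": 99},
        }
    ],
    "primaryKey": "column_1",
}

schema = from_frictionless_schema(frictionless_schema)
print(schema)
```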
26 |
27 | Under the hood, this uses the {class}`~pandera.io.pandas_io.FrictionlessFieldParser` class
28 | to parse each frictionless field (column):
29 |
30 | ```{eval-rst}
31 | .. autoclass:: pandera.io.pandas_io.FrictionlessFieldParser
32 | :members:
33 | ```
34 |
--------------------------------------------------------------------------------
/docs/source/geopandas.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: pandera
7 | ```
8 |
9 | (supported-lib-geopandas)=
10 |
11 | # Data Validation with GeoPandas
12 |
13 | *new in 0.9.0*
14 |
15 | [GeoPandas](https://geopandas.org/en/stable/docs.html) is an extension of Pandas that adds
16 | support for geospatial data. You can use pandera to validate {py:func}`~geopandas.GeoDataFrame`
17 | and {py:func}`~geopandas.GeoSeries` objects directly. First, install
18 | `pandera` with the `geopandas` extra:
19 |
20 | ```bash
21 | pip install 'pandera[geopandas]'
22 | ```
23 |
24 | Then you can use pandera schemas to validate geodataframes. In the example
25 | below we'll use the {ref}`class-based API ` to define a
26 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation.
27 |
28 | ```{code-cell} python
29 | import geopandas as gpd
30 | import pandas as pd
31 | import pandera.pandas as pa
32 | from shapely.geometry import Polygon
33 |
34 | geo_schema = pa.DataFrameSchema({
35 | "geometry": pa.Column("geometry"),
36 | "region": pa.Column(str),
37 | })
38 |
39 | geo_df = gpd.GeoDataFrame({
40 | "geometry": [
41 | Polygon(((0, 0), (0, 1), (1, 1), (1, 0))),
42 | Polygon(((0, 0), (0, -1), (-1, -1), (-1, 0)))
43 | ],
44 | "region": ["NA", "SA"]
45 | })
46 |
47 | geo_schema.validate(geo_df)
48 | ```
49 |
50 | You can also use the `GeometryDtype` data type in either instantiated or
51 | un-instantiated form:
52 |
53 | ```{code-cell} python
54 | geo_schema = pa.DataFrameSchema({
55 | "geometry": pa.Column(gpd.array.GeometryDtype),
56 | # or
57 | "geometry": pa.Column(gpd.array.GeometryDtype()),
58 | })
59 | ```
60 |
61 | If you want to validate-on-instantiation, you can use the
62 | {py:class}`~pandera.typing.geopandas.GeoDataFrame` generic type with the
63 | dataframe model defined above:
64 |
65 | ```{code-cell} python
66 | from pandera.typing import Series
67 | from pandera.typing.geopandas import GeoDataFrame, GeoSeries
68 |
69 |
70 | class Schema(pa.DataFrameModel):
71 | geometry: GeoSeries
72 | region: Series[str]
73 |
74 |
75 | # create a geodataframe that's validated on object initialization
76 | df = GeoDataFrame[Schema](
77 | {
78 | 'geometry': [
79 | Polygon(((0, 0), (0, 1), (1, 1), (1, 0))),
80 | Polygon(((0, 0), (0, -1), (-1, -1), (-1, 0)))
81 | ],
82 | 'region': ['NA','SA']
83 | }
84 | )
85 | df
86 | ```
87 |
--------------------------------------------------------------------------------
/docs/source/integrations.md:
--------------------------------------------------------------------------------
1 | (integrations)=
2 |
3 | # Integrations
4 |
5 | Pandera ships with integrations with other tools in the Python ecosystem, with
6 | the goal of interoperating with libraries that you know and love.
7 |
8 | ```{eval-rst}
9 | .. list-table::
10 | :widths: 25 75
11 |
12 | * - :ref:`FastAPI `
13 | - Use pandera DataFrameModels in your FastAPI app
14 | * - :ref:`Frictionless `
15 | - Convert frictionless schemas to pandera schemas
16 | * - :ref:`Hypothesis `
17 | - Use the hypothesis library to generate valid data under your schema's constraints.
18 | * - :ref:`Mypy `
19 | - Type-lint your pandas and pandera code with mypy for static type safety [experimental 🧪]
20 | * - :ref:`Pydantic `
21 | - Use pandera DataFrameModels when defining your pydantic BaseModels
22 | ```
23 |
24 | ```{toctree}
25 | :caption: Introduction
26 | :hidden: true
27 | :maxdepth: 1
28 |
29 | FastAPI
30 | Frictionless
31 | Hypothesis
32 | Mypy
33 | Pydantic
34 | ```
35 |
36 | :::{note}
37 | Don't see a library that you want supported? Check out the
38 | [github issues](https://github.com/pandera-dev/pandera/issues) to see if
39 | that library is in the roadmap. If it isn't, open up a
40 | [new issue](https://github.com/pandera-dev/pandera/issues/new?assignees=&labels=enhancement&template=feature_request.md&title=)
41 | to add support for it!
42 | :::
43 |
--------------------------------------------------------------------------------
/docs/source/jupyterlite_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "LiteBuildConfig": {
3 | "federated_extensions": [],
4 | "ignore_sys_prefix": true,
5 | "piplite_urls": []
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/docs/source/lazy_validation.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | ```{currentmodule} pandera
6 | ```
7 |
8 | (lazy-validation)=
9 |
10 | # Lazy Validation
11 |
12 | *New in version 0.4.0*
13 |
14 | By default, when you call the `validate` method on schema or schema component
15 | objects, a {class}`~pandera.errors.SchemaError` is raised as soon as one of the
16 | assumptions specified in the schema is falsified. For example, for a
17 | {class}`~pandera.api.pandas.container.DataFrameSchema` object, the following situations will raise an
18 | exception:
19 |
20 | - a column specified in the schema is not present in the dataframe.
21 | - if `strict=True`, a column in the dataframe is not specified in the schema.
22 | - the `data type` does not match.
23 | - if `coerce=True`, the dataframe column cannot be coerced into the specified
24 | `data type`.
25 | - the {class}`~pandera.api.checks.Check` specified in one of the columns returns `False` or
26 | a boolean series containing at least one `False` value.
27 |
28 | For example:
29 |
30 | ```{code-cell} python
31 | import pandas as pd
32 | import pandera.pandas as pa
33 |
34 |
35 | df = pd.DataFrame({"column": ["a", "b", "c"]})
36 |
37 | schema = pa.DataFrameSchema({"column": pa.Column(int)})
38 |
39 | try:
40 | schema.validate(df)
41 | except pa.errors.SchemaError as exc:
42 | print(exc)
43 | ```
44 |
45 | For more complex cases, it is useful to see all of the errors raised during
46 | the `validate` call so that you can debug the causes of errors on different
47 | columns and checks. The `lazy` keyword argument in the `validate` method
48 | of all schemas and schema components gives you the option of doing just this:
49 |
50 | ```{code-cell} python
51 | import json
52 |
53 | import pandas as pd
54 | import pandera.pandas as pa
55 |
56 |
57 | schema = pa.DataFrameSchema(
58 | columns={
59 | "int_column": pa.Column(int),
60 | "float_column": pa.Column(float, pa.Check.greater_than(0)),
61 | "str_column": pa.Column(str, pa.Check.equal_to("a")),
62 | "date_column": pa.Column(pa.DateTime),
63 | },
64 | strict=True
65 | )
66 |
67 | df = pd.DataFrame({
68 | "int_column": ["a", "b", "c"],
69 | "float_column": [0, 1, 2],
70 | "str_column": ["a", "b", "d"],
71 | "unknown_column": None,
72 | })
73 |
74 | try:
75 | schema.validate(df, lazy=True)
76 | except pa.errors.SchemaErrors as exc:
77 | print(json.dumps(exc.message, indent=2))
78 | ```
79 |
80 | As you can see from the output above, a {class}`~pandera.errors.SchemaErrors`
81 | exception is raised with a summary of the error counts and failure cases
82 | caught by the schema. This summary is called an {ref}`error-report`.
83 |
84 | You can also inspect the failure cases in a more granular form:
85 |
86 | ```{code-cell} python
87 | try:
88 | schema.validate(df, lazy=True)
89 | except pa.errors.SchemaErrors as exc:
90 | print("Schema errors and failure cases:")
91 | print(exc.failure_cases)
92 | print("\nDataFrame object that failed validation:")
93 | print(exc.data)
94 | ```
95 |
--------------------------------------------------------------------------------
/docs/source/modin.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | ```{currentmodule} pandera
6 | ```
7 |
8 | (scaling-modin)=
9 |
10 | # Data Validation with Modin
11 |
12 | *new in 0.8.0*
13 |
14 | [Modin](https://modin.readthedocs.io/en/latest/) is a distributed
15 | compute framework that offers a pandas drop-in replacement dataframe
16 | implementation. You can use pandera to validate {py:func}`~modin.pandas.DataFrame`
17 | and {py:func}`~modin.pandas.Series` objects directly. First, install
18 | `pandera` with the `modin` extra:
19 |
20 | ```bash
21 | pip install 'pandera[modin]' # installs both ray and dask backends
22 | pip install 'pandera[modin-ray]' # only ray backend
23 | pip install 'pandera[modin-dask]' # only dask backend
24 | ```
25 |
26 | Then you can use pandera schemas to validate modin dataframes. In the example
27 | below we'll use the {ref}`class-based API ` to define a
28 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation.
29 |
30 | ```python
31 | import modin.pandas as pd
32 | import pandera.pandas as pa
33 |
34 | from pandera.typing.modin import DataFrame, Series
35 |
36 |
37 | class Schema(pa.DataFrameModel):
38 | state: Series[str]
39 | city: Series[str]
40 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})
41 |
42 |
43 | # create a modin dataframe that's validated on object initialization
44 | df = DataFrame[Schema](
45 | {
46 | 'state': ['FL','FL','FL','CA','CA','CA'],
47 | 'city': [
48 | 'Orlando',
49 | 'Miami',
50 | 'Tampa',
51 | 'San Francisco',
52 | 'Los Angeles',
53 | 'San Diego',
54 | ],
55 | 'price': [8, 12, 10, 16, 20, 18],
56 | }
57 | )
58 | print(df)
59 | ```
60 |
61 | ```
62 | state city price
63 | 0 FL Orlando 8
64 | 1 FL Miami 12
65 | 2 FL Tampa 10
66 | 3 CA San Francisco 16
67 | 4 CA Los Angeles 20
68 | 5 CA San Diego 18
69 | ```
70 |
71 | You can also use the {py:func}`~pandera.check_types` decorator to validate
72 | modin dataframes at runtime:
73 |
74 | ```python
75 | @pa.check_types
76 | def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
77 | return df[df["state"] == "CA"]
78 |
79 | function(df)
80 | ```
81 |
82 | ```
83 | state city price
84 | 3 CA San Francisco 16
85 | 4 CA Los Angeles 20
86 | 5 CA San Diego 18
87 | ```
88 |
89 | And of course, you can use the object-based API to validate modin dataframes:
90 |
91 | ```python
92 | schema = pa.DataFrameSchema({
93 | "state": pa.Column(str),
94 | "city": pa.Column(str),
95 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20))
96 | })
97 | schema(df)
98 | ```
99 |
100 | ```
101 | state city price
102 | 0 FL Orlando 8
103 | 1 FL Miami 12
104 | 2 FL Tampa 10
105 | 3 CA San Francisco 16
106 | 4 CA Los Angeles 20
107 | 5 CA San Diego 18
108 | ```
109 |
--------------------------------------------------------------------------------
/docs/source/pyspark.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | ```{currentmodule} pandera
6 | ```
7 |
8 | (scaling-pyspark)=
9 |
10 | # Data Validation with Pyspark Pandas
11 |
12 | *new in 0.10.0*
13 |
14 | [Pyspark](https://spark.apache.org/docs/3.2.0/api/python/index.html) is a
15 | distributed compute framework that offers a pandas drop-in replacement dataframe
16 | implementation via the [pyspark.pandas API](https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/index.html) .
17 | You can use pandera to validate {py:func}`~pyspark.pandas.DataFrame`
18 | and {py:func}`~pyspark.pandas.Series` objects directly. First, install
19 | `pandera` with the `pyspark` extra:
20 |
21 | ```bash
22 | pip install 'pandera[pyspark]'
23 | ```
24 |
25 | Then you can use pandera schemas to validate pyspark dataframes. In the example
26 | below we'll use the {ref}`class-based API ` to define a
27 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation.
28 |
29 | ```{code-cell} python
30 | import pyspark.pandas as ps
31 | import pandas as pd
32 | import pandera.pandas as pa
33 |
34 | from pandera.typing.pyspark import DataFrame, Series
35 |
36 |
37 | class Schema(pa.DataFrameModel):
38 | state: Series[str]
39 | city: Series[str]
40 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})
41 |
42 |
43 | # create a pyspark.pandas dataframe that's validated on object initialization
44 | df = DataFrame[Schema](
45 | {
46 | 'state': ['FL','FL','FL','CA','CA','CA'],
47 | 'city': [
48 | 'Orlando',
49 | 'Miami',
50 | 'Tampa',
51 | 'San Francisco',
52 | 'Los Angeles',
53 | 'San Diego',
54 | ],
55 | 'price': [8, 12, 10, 16, 20, 18],
56 | }
57 | )
58 | print(df)
59 | ```
60 |
61 | You can also use the {py:func}`~pandera.check_types` decorator to validate
62 | pyspark pandas dataframes at runtime:
63 |
64 | ```{code-cell} python
65 | @pa.check_types
66 | def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
67 | return df[df["state"] == "CA"]
68 |
69 | print(function(df))
70 | ```
71 |
72 | And of course, you can use the object-based API to validate pyspark pandas dataframes:
73 |
74 | ```{code-cell} python
75 | schema = pa.DataFrameSchema({
76 | "state": pa.Column(str),
77 | "city": pa.Column(str),
78 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20))
79 | })
80 | schema(df)
81 | ```
82 |
--------------------------------------------------------------------------------
/docs/source/reference/core.rst:
--------------------------------------------------------------------------------
1 | .. _api-core:
2 |
3 | Core
4 | ====
5 |
6 | Schemas
7 | -------
8 |
9 | .. autosummary::
10 | :toctree: generated
11 | :template: class.rst
12 | :nosignatures:
13 |
14 | pandera.api.pandas.container.DataFrameSchema
15 | pandera.api.pandas.array.SeriesSchema
16 | pandera.api.polars.container.DataFrameSchema
17 | pandera.api.pyspark.container.DataFrameSchema
18 | pandera.api.dataframe.container.DataFrameSchema
19 |
20 | Schema Components
21 | -----------------
22 |
23 | .. autosummary::
24 | :toctree: generated
25 | :template: class.rst
26 | :nosignatures:
27 |
28 | pandera.api.pandas.components.Column
29 | pandera.api.pandas.components.Index
30 | pandera.api.pandas.components.MultiIndex
31 | pandera.api.polars.components.Column
32 | pandera.api.pyspark.components.Column
33 | pandera.api.dataframe.components.ComponentSchema
34 |
35 | Checks
36 | ------
37 |
38 | .. autosummary::
39 | :toctree: generated
40 | :template: class.rst
41 | :nosignatures:
42 |
43 | pandera.api.checks.Check
44 | pandera.api.hypotheses.Hypothesis
45 |
46 | Data Objects
47 | ------------
48 |
49 | .. autosummary::
50 | :toctree: generated
51 | :template: class.rst
52 | :nosignatures:
53 |
54 | pandera.api.polars.types.PolarsData
55 | pandera.api.pyspark.types.PysparkDataframeColumnObject
56 |
57 | Configuration
58 | -------------
59 |
60 | .. autosummary::
61 | :toctree: generated
62 | :template: class.rst
63 | :nosignatures:
64 |
65 | pandera.config.PanderaConfig
66 | pandera.config.ValidationDepth
67 | pandera.config.ValidationScope
68 | pandera.config.config_context
69 | pandera.config.get_config_context
70 |
--------------------------------------------------------------------------------
/docs/source/reference/dataframe_models.rst:
--------------------------------------------------------------------------------
1 | .. _api-dataframe-models:
2 |
3 | DataFrame Models
4 | ================
5 |
6 | DataFrame Model
7 | ---------------
8 |
9 | .. autosummary::
10 | :toctree: generated
11 | :template: class.rst
12 |
13 | pandera.api.pandas.model.DataFrameModel
14 | pandera.api.polars.model.DataFrameModel
15 | pandera.api.pyspark.model.DataFrameModel
16 | pandera.api.dataframe.model.DataFrameModel
17 |
18 | Model Components
19 | ----------------
20 |
21 | .. autosummary::
22 | :toctree: generated
23 |
24 | pandera.api.dataframe.model_components.Field
25 | pandera.api.dataframe.model_components.check
26 | pandera.api.dataframe.model_components.dataframe_check
27 | pandera.api.dataframe.model_components.parser
28 | pandera.api.dataframe.model_components.dataframe_parser
29 |
30 |
31 | Config
32 | ------
33 |
34 | .. autosummary::
35 | :toctree: generated
36 | :template: model_component_class.rst
37 | :nosignatures:
38 |
39 | pandera.api.pandas.model_config.BaseConfig
40 | pandera.api.polars.model_config.BaseConfig
41 | pandera.api.pyspark.model_config.BaseConfig
42 |
43 |
44 | Typing
45 | ------
46 |
47 | Pandas
48 | ******
49 |
50 | .. autosummary::
51 | :toctree: generated
52 | :template: class.rst
53 |
54 | pandera.typing.DataFrame
55 | pandera.typing.Series
56 | pandera.typing.Index
57 |
58 | Geopandas
59 | *********
60 |
61 | .. autosummary::
62 | :toctree: generated
63 | :template: class.rst
64 |
65 | pandera.typing.geopandas.GeoDataFrame
66 | pandera.typing.geopandas.GeoSeries
67 |
68 | Dask
69 | ****
70 |
71 | .. autosummary::
72 | :toctree: generated
73 | :template: class.rst
74 |
75 | pandera.typing.dask.DataFrame
76 | pandera.typing.dask.Series
77 | pandera.typing.dask.Index
78 |
79 | Pyspark
80 | *******
81 |
82 | .. autosummary::
83 | :toctree: generated
84 | :template: class.rst
85 |
86 | pandera.typing.pyspark.DataFrame
87 | pandera.typing.pyspark.Series
88 | pandera.typing.pyspark.Index
89 |
90 | Modin
91 | *****
92 |
93 | .. autosummary::
94 | :toctree: generated
95 | :template: class.rst
96 |
97 | pandera.typing.modin.DataFrame
98 | pandera.typing.modin.Series
99 | pandera.typing.modin.Index
100 |
101 | FastAPI
102 | *******
103 |
104 | .. autosummary::
105 | :toctree: generated
106 | :template: class.rst
107 |
108 | pandera.typing.fastapi.UploadFile
109 |
110 |
111 | Serialization Formats
112 | *********************
113 |
114 | .. autosummary::
115 | :toctree: generated
116 | :template: class.rst
117 |
118 | pandera.typing.formats.Formats
119 |
--------------------------------------------------------------------------------
/docs/source/reference/decorators.rst:
--------------------------------------------------------------------------------
1 | .. _api-decorators:
2 |
3 | Decorators
4 | ==========
5 |
6 | .. autosummary::
7 | :toctree: generated
8 | :nosignatures:
9 |
10 | pandera.decorators.check_input
11 | pandera.decorators.check_output
12 | pandera.decorators.check_io
13 | pandera.decorators.check_types
14 |
--------------------------------------------------------------------------------
/docs/source/reference/errors.rst:
--------------------------------------------------------------------------------
1 | .. _api-errors:
2 |
3 | Errors
4 | ======
5 |
6 | .. autosummary::
7 | :toctree: generated
8 | :template: class.rst
9 | :nosignatures:
10 |
11 | pandera.errors.SchemaError
12 | pandera.errors.SchemaErrors
13 | pandera.errors.SchemaInitError
14 | pandera.errors.SchemaDefinitionError
15 |
--------------------------------------------------------------------------------
/docs/source/reference/extensions.rst:
--------------------------------------------------------------------------------
1 | .. _api-extensions:
2 |
3 | Extensions
4 | ==========
5 |
6 | .. autosummary::
7 | :toctree: generated
8 | :template: module.rst
9 | :nosignatures:
10 |
11 | pandera.extensions
12 |
--------------------------------------------------------------------------------
/docs/source/reference/index.md:
--------------------------------------------------------------------------------
1 | % pandera package index documentation toctree
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: pandera
5 | ```
6 |
7 | # API
8 |
9 | ```{eval-rst}
10 | .. list-table::
11 | :widths: 30 70
12 |
13 | * - :ref:`Core `
14 | - The core objects for defining pandera schemas
15 | * - :ref:`Data Types `
16 | - Data types for type checking and coercion.
17 | * - :ref:`DataFrame Models `
18 | - Alternative class-based API for defining types for tabular/array-like data.
19 | * - :ref:`Decorators `
20 | - Decorators for integrating pandera schemas with python functions.
21 | * - :ref:`Schema Inference `
22 | - Bootstrap schemas from real data
23 | * - :ref:`IO Utilities `
24 | - Utility functions for reading/writing schemas
25 | * - :ref:`Data Synthesis Strategies `
26 | - Module of functions for generating data from schemas.
27 | * - :ref:`Extensions `
28 | - Utility functions for extending pandera functionality
29 | * - :ref:`Errors `
30 | - Pandera-specific exceptions
31 | ```
32 |
33 | ```{toctree}
34 | :hidden: true
35 |
36 | core
37 | dtypes
38 | dataframe_models
39 | decorators
40 | schema_inference
41 | io
42 | strategies
43 | extensions
44 | errors
45 | ```
46 |
--------------------------------------------------------------------------------
/docs/source/reference/io.rst:
--------------------------------------------------------------------------------
1 | .. _api-io-utils:
2 |
3 | IO Utilities
4 | ============
5 |
6 | The ``io`` module and built-in ``Hypothesis`` checks require a pandera
7 | installation with the corresponding extension; see the
8 | :ref:`installation` instructions for more details.
9 |
10 | .. autosummary::
11 | :toctree: generated
12 | :nosignatures:
13 |
14 | pandera.io.from_yaml
15 | pandera.io.to_yaml
16 | pandera.io.to_script
17 |
--------------------------------------------------------------------------------
/docs/source/reference/schema_inference.rst:
--------------------------------------------------------------------------------
1 | .. _api-schema-inference:
2 |
3 | Schema Inference
4 | ================
5 |
6 | .. autosummary::
7 | :toctree: generated
8 | :nosignatures:
9 |
10 | pandera.schema_inference.pandas.infer_schema
11 |
--------------------------------------------------------------------------------
/docs/source/reference/strategies.rst:
--------------------------------------------------------------------------------
1 | .. _api-strategies:
2 |
3 | Data Synthesis Strategies
4 | =========================
5 |
6 | .. autosummary::
7 | :toctree: generated
8 | :template: strategies_module.rst
9 | :nosignatures:
10 |
11 | pandera.strategies.pandas_strategies
12 |
--------------------------------------------------------------------------------
/docs/source/series_schemas.md:
--------------------------------------------------------------------------------
1 | ---
2 | file_format: mystnb
3 | ---
4 |
5 | % pandera documentation for seriesschemas
6 |
7 | ```{currentmodule} pandera
8 | ```
9 |
10 | (seriesschemas)=
11 |
12 | # Series Schemas
13 |
14 | The {class}`~pandera.api.pandas.array.SeriesSchema` class allows for the validation of pandas
15 | `Series` objects, and is very similar to {ref}`columns` and
16 | {ref}`indexes` described in {ref}`DataFrameSchemas`.
17 |
18 | ```{code-cell} python
19 | import pandas as pd
20 | import pandera.pandas as pa
21 |
22 | schema = pa.SeriesSchema(
23 | str,
24 | checks=[
25 | pa.Check(lambda s: s.str.startswith("foo")),
26 | pa.Check(lambda s: s.str.endswith("bar")),
27 | pa.Check(lambda x: len(x) > 3, element_wise=True)
28 | ],
29 | nullable=False,
30 | unique=False,
31 | name="my_series")
32 |
33 | validated_series = schema.validate(
34 | pd.Series(["foobar", "foobar", "foobar"], name="my_series")
35 | )
36 |
37 | validated_series
38 | ```
39 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: pandera-dev
2 | channels:
3 | - conda-forge
4 |
5 | dependencies:
6 | # environment management
7 | - pip
8 |
9 | # pandera dependencies
10 | - packaging >= 20.0
11 | - typing_extensions
12 | - hypothesis >= 6.92.7
13 | - pyyaml >= 5.1
14 | - typing_inspect >= 0.6.0
15 | - frictionless <= 4.40.8 # v5.* introduces breaking changes
16 | - pyarrow
17 | - pydantic
18 |
19 | # hypotheses extra
20 | - scipy
21 |
22 | # mypy extra
23 | - pandas-stubs
24 |
25 | # pyspark extra
26 | - pyspark[connect] >= 3.2.0, < 4.0.0
27 |
28 | # polars extra
29 | - polars >= 0.20.0
30 |
31 | # modin extra
32 | - modin
33 | - protobuf
34 |
35 | # geopandas extra
36 | - geopandas
37 | - shapely
38 |
39 | # fastapi extra
40 | - fastapi
41 |
42 | # testing and dependencies
43 | - black >= 24.0
44 |
45 | # testing
46 | - numpy >= 1.24.4
47 | - pandas >= 2.1.1
48 | - isort >= 5.7.0
49 | - joblib
50 | - mypy = 1.10.0
51 | - pylint < 3.3
52 | - pytest
53 | - pytest-cov
54 | - pytest-xdist
55 | - pytest-asyncio
56 | - pytz
57 | - xdoctest
58 | - nox
59 | - uv
60 | - setuptools # required in noxfile and not automatically provided by python >= 3.12
61 |
62 | # fastapi testing
63 | - uvicorn
64 | - python-multipart
65 |
66 | # documentation
67 | - sphinx
68 | - sphinx-design
69 | - sphinx-autodoc-typehints <= 1.14.1
70 | - sphinx-copybutton
71 | - recommonmark
72 | - myst-nb
73 |
74 | # packaging
75 | - twine
76 |
77 | # performance testing
78 | - asv >= 0.5.1
79 |
80 | # optional
81 | - pre_commit
82 |
83 | - pip:
84 | # dask extra
85 | - dask[dataframe]
86 | - distributed
87 |
88 | # docs
89 | - furo
90 | - sphinx-docsearch
91 | - grpcio
92 | - ray
93 | - typeguard
94 | - types-click
95 | - types-pytz
96 | - types-pyyaml
97 | - types-requests
98 | - types-setuptools
99 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | disable_error_code = annotation-unchecked
3 | ignore_missing_imports = True
4 | follow_imports = normal
5 | allow_redefinition = True
6 | warn_return_any = False
7 | warn_unused_configs = True
8 | show_error_codes = True
9 | exclude=(?x)(
10 | ^tests/mypy/pandas_modules
11 | | ^pandera/engines/pyspark_engine
12 | | ^pandera/api/pyspark
13 | | ^pandera/backends/pyspark
14 | | ^tests/pyspark
15 | )
16 | [mypy-pandera.api.pyspark.*]
17 | follow_imports = skip
18 |
19 | [mypy-docs.*]
20 | follow_imports = skip
21 |
--------------------------------------------------------------------------------
/pandera/__init__.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=wrong-import-position
2 | """A flexible and expressive dataframe validation library."""
3 |
4 | from pandera._version import __version__
5 |
6 |
7 | _warning_msg = """Pandas and numpy have been removed from the base pandera
8 | dependencies. Please install pandas as part of your environment's
9 | dependencies or install the pandas extra with:
10 |
11 | ```bash
12 | pip install pandas pandera
13 |
14 | # or
15 | pip install 'pandera[pandas]'
16 | ```
17 | """
18 |
19 |
20 | try:
21 | # Only add pandas to the top-level pandera namespace
22 | # if pandas and numpy are installed
23 | import pandas as pd
24 | import numpy as np
25 |
26 | from pandera._pandas_deprecated import *
27 | from pandera._pandas_deprecated import __all__ as _pandas_deprecated_all
28 | from pandera import dtypes
29 | from pandera import typing
30 |
31 | __all__ = [
32 | "__version__",
33 | *_pandas_deprecated_all,
34 | ]
35 |
36 | except ImportError as err:
37 | import warnings
38 |
39 | if "pandas" in str(err) or "numpy" in str(err):
40 | warnings.warn(_warning_msg, UserWarning)
41 | else:
42 | raise # Re-raise any other `ImportError` exceptions
43 |
44 | from pandera import dtypes
45 | from pandera import typing
46 | from pandera.api.checks import Check
47 | from pandera.api.dataframe.model_components import (
48 | Field,
49 | check,
50 | dataframe_check,
51 | dataframe_parser,
52 | parser,
53 | )
54 |
55 | __all__ = [
56 | "__version__",
57 | "Check",
58 | "Field",
59 | "check",
60 | "dataframe_check",
61 | "dataframe_parser",
62 | "parser",
63 | "dtypes",
64 | "typing",
65 | ]
66 |
--------------------------------------------------------------------------------
/pandera/_patch_numpy2.py:
--------------------------------------------------------------------------------
1 | """Patch numpy 2 to prevent errors."""
2 |
3 | from functools import lru_cache
4 |
5 |
6 | @lru_cache
7 | def _patch_numpy2():
8 | """This is a temporary fix for numpy 2.
9 |
10 | pyspark uses np.NaN, which is deprecated in numpy 2.
11 | """
12 | import numpy as np
13 |
14 | expired_attrs = getattr(np, "_expired_attrs_2_0", None)
15 |
16 | if expired_attrs:
17 | attrs_replacement = {
18 | "NaN": np.nan,
19 | "string_": np.bytes_,
20 | "float_": np.float64,
21 | "unicode_": np.str_,
22 | }
23 | for attr, replacement in attrs_replacement.items():
24 | has_attr = expired_attrs.__expired_attributes__.pop(attr, None)
25 | if has_attr:
26 | setattr(np, attr, replacement)
27 |
--------------------------------------------------------------------------------
/pandera/accessors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/accessors/__init__.py
--------------------------------------------------------------------------------
/pandera/accessors/dask_accessor.py:
--------------------------------------------------------------------------------
1 | """Register dask accessor for pandera schema metadata."""
2 |
3 | from dask.dataframe.extensions import (
4 | register_dataframe_accessor,
5 | register_series_accessor,
6 | )
7 |
8 | from pandera.accessors.pandas_accessor import (
9 | PanderaDataFrameAccessor,
10 | PanderaSeriesAccessor,
11 | )
12 |
13 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor)
14 | register_series_accessor("pandera")(PanderaSeriesAccessor)
15 |
--------------------------------------------------------------------------------
/pandera/accessors/modin_accessor.py:
--------------------------------------------------------------------------------
1 | """Custom accessor functionality for modin.
2 |
3 | Source code adapted from pyspark.pandas implementation:
4 | https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/api/pyspark.pandas.extensions.register_dataframe_accessor.html?highlight=register_dataframe_accessor#pyspark.pandas.extensions.register_dataframe_accessor
5 | """
6 |
7 | import warnings
8 |
9 | from pandera.accessors.pandas_accessor import (
10 | PanderaDataFrameAccessor,
11 | PanderaSeriesAccessor,
12 | )
13 |
14 |
15 | # pylint: disable=too-few-public-methods
16 | class CachedAccessor:
17 | """
18 | Custom property-like object.
19 |
20 | A descriptor for caching accessors:
21 |
22 | :param name: Namespace that accessor's methods, properties, etc will be
23 | accessed under, e.g. "foo" for a dataframe accessor yields the accessor
24 | ``df.foo``
25 | :param cls: Class with the extension methods.
26 |
27 | For accessor, the class's __init__ method assumes that you are registering
28 | an accessor for one of ``Series``, ``DataFrame``, or ``Index``.
29 | """
30 |
31 | def __init__(self, name, accessor):
32 | self._name = name
33 | self._accessor = accessor
34 |
35 | def __get__(self, obj, cls):
36 | if obj is None: # pragma: no cover
37 | return self._accessor
38 | accessor_obj = self._accessor(obj)
39 | object.__setattr__(obj, self._name, accessor_obj)
40 | return accessor_obj
41 |
42 |
43 | def _register_accessor(name, cls):
44 | """
45 | Register a custom accessor on {class} objects.
46 |
47 | :param name: Name under which the accessor should be registered. A warning
48 | is issued if this name conflicts with a preexisting attribute.
49 | :returns: A class decorator callable.
50 | """
51 |
52 | def decorator(accessor):
53 | if hasattr(cls, name):
54 | msg = (
55 | f"registration of accessor {accessor} under name '{name}' for "
56 | "type {cls.__name__} is overriding a preexisting attribute "
57 | "with the same name."
58 | )
59 |
60 | warnings.warn(
61 | msg,
62 | UserWarning,
63 | stacklevel=2,
64 | )
65 | setattr(cls, name, CachedAccessor(name, accessor))
66 | return accessor
67 |
68 | return decorator
69 |
70 |
71 | def register_dataframe_accessor(name):
72 | """
73 | Register a custom accessor with a DataFrame
74 |
75 | :param name: name used when calling the accessor after it's registered
76 | :returns: a class decorator callable.
77 | """
78 | # pylint: disable=import-outside-toplevel
79 | from modin.pandas import DataFrame
80 |
81 | return _register_accessor(name, DataFrame)
82 |
83 |
84 | def register_series_accessor(name):
85 | """
86 | Register a custom accessor with a Series object
87 |
88 | :param name: name used when calling the accessor after it's registered
89 | :returns: a callable class decorator
90 | """
91 | # pylint: disable=import-outside-toplevel
92 | from modin.pandas import Series
93 |
94 | return _register_accessor(name, Series)
95 |
96 |
97 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor)
98 | register_series_accessor("pandera")(PanderaSeriesAccessor)
99 |
--------------------------------------------------------------------------------
/pandera/accessors/pandas_accessor.py:
--------------------------------------------------------------------------------
1 | """Register pandas accessor for pandera schema metadata."""
2 |
3 | from typing import Optional, Union
4 |
5 | import pandas as pd
6 |
7 | from pandera.api.pandas.array import SeriesSchema
8 | from pandera.api.pandas.container import DataFrameSchema
9 |
10 | Schemas = Union[DataFrameSchema, SeriesSchema]
11 |
12 |
13 | class PanderaAccessor:
14 | """Pandera accessor for pandas object."""
15 |
16 | def __init__(self, pandas_obj):
17 | """Initialize the pandera accessor."""
18 | self._pandas_obj = pandas_obj
19 | self._schema: Optional[Schemas] = None
20 |
21 | @staticmethod
22 | def check_schema_type(schema: Schemas):
23 | """Abstract method for checking the schema type."""
24 | raise NotImplementedError
25 |
26 | def add_schema(self, schema):
27 | """Add a schema to the pandas object."""
28 | self.check_schema_type(schema)
29 | self._schema = schema
30 | return self._pandas_obj
31 |
32 | @property
33 | def schema(self) -> Optional[Schemas]:
34 | """Access schema metadata."""
35 | return self._schema
36 |
37 |
38 | @pd.api.extensions.register_dataframe_accessor("pandera")
39 | class PanderaDataFrameAccessor(PanderaAccessor):
40 | """Pandera accessor for pandas DataFrame."""
41 |
42 | @staticmethod
43 | def check_schema_type(schema):
44 | if not isinstance(schema, DataFrameSchema):
45 | raise TypeError(
46 | f"schema arg must be a {DataFrameSchema}, found {type(schema)}"
47 | )
48 |
49 |
50 | @pd.api.extensions.register_series_accessor("pandera")
51 | class PanderaSeriesAccessor(PanderaAccessor):
52 | """Pandera accessor for pandas Series."""
53 |
54 | @staticmethod
55 | def check_schema_type(schema):
56 | if not isinstance(schema, SeriesSchema):
57 | raise TypeError(
58 | f"schema arg must be a {SeriesSchema}, found {type(schema)}"
59 | )
60 |
--------------------------------------------------------------------------------
/pandera/accessors/polars_accessor.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/accessors/polars_accessor.py
--------------------------------------------------------------------------------
/pandera/accessors/pyspark_accessor.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | # NOTE: skip file since py=3.10 yields these errors:
3 | # https://github.com/pandera-dev/pandera/runs/4998710717?check_suite_focus=true
4 | """Register pyspark accessor for pandera schema metadata."""
5 |
6 | from pyspark.pandas.extensions import (
7 | register_dataframe_accessor,
8 | register_series_accessor,
9 | )
10 |
11 | from pandera.accessors.pandas_accessor import (
12 | PanderaDataFrameAccessor,
13 | PanderaSeriesAccessor,
14 | )
15 |
16 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor)
17 | register_series_accessor("pandera")(PanderaSeriesAccessor)
18 |
--------------------------------------------------------------------------------
/pandera/api/__init__.py:
--------------------------------------------------------------------------------
1 | """Pandera api package.
2 |
3 | This package contains the public-facing api schema specifications for all
4 | supported data objects.
5 | """
6 |
--------------------------------------------------------------------------------
/pandera/api/base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/base/__init__.py
--------------------------------------------------------------------------------
/pandera/api/base/model_config.py:
--------------------------------------------------------------------------------
1 | """Class-based dataframe model API configuration."""
2 |
3 | from typing import Any, Optional
4 |
5 |
6 | class BaseModelConfig: # pylint:disable=R0903
7 | """Model configuration base class."""
8 |
9 | #: datatype of the data container. This overrides the data types specified
10 | #: in any of the fields.
11 | dtype: Optional[Any] = None
12 |
13 | name: Optional[str] = None #: name of schema
14 | title: Optional[str] = None #: human-readable label for schema
15 | description: Optional[str] = None #: arbitrary textual description
16 | coerce: bool = False #: coerce types of all schema components
17 |
--------------------------------------------------------------------------------
/pandera/api/base/parsers.py:
--------------------------------------------------------------------------------
1 | """Data validation base parse."""
2 |
3 | import inspect
4 | from typing import Any, Dict, NamedTuple, Optional, Tuple, Type
5 |
6 | from pandera.backends.base import BaseParserBackend
7 |
8 |
9 | class ParserResult(NamedTuple):
10 | """Parser result for user-defined parsers."""
11 |
12 | parser_output: Any
13 | parsed_object: Any
14 |
15 |
16 | class MetaParser(type):
17 | """Parser metaclass."""
18 |
19 | BACKEND_REGISTRY: Dict[Tuple[Type, Type], Type[BaseParserBackend]] = {}
20 | """Registry of parser backends implemented for specific data objects."""
21 |
22 |
23 | class BaseParser(metaclass=MetaParser):
24 | """Parser base class."""
25 |
26 | def __init__(self, name: Optional[str] = None):
27 | self.name = name
28 |
29 | @classmethod
30 | def register_backend(cls, type_: Type, backend: Type[BaseParserBackend]):
31 | """Register a backend for the specified type."""
32 | cls.BACKEND_REGISTRY[(cls, type_)] = backend
33 |
34 | @classmethod
35 | def get_backend(cls, parse_obj: Any) -> Type[BaseParserBackend]:
36 | """Get the backend associated with the type of ``parse_obj`` ."""
37 |
38 | parse_obj_cls = type(parse_obj)
39 | classes = inspect.getmro(parse_obj_cls)
40 | for _class in classes:
41 | try:
42 | return cls.BACKEND_REGISTRY[(cls, _class)]
43 | except KeyError:
44 | pass
45 | raise KeyError(
46 | f"Backend not found for class: {parse_obj_cls}. Looked up the "
47 | f"following base classes: {classes}"
48 | )
49 |
50 | def __eq__(self, other: object) -> bool:
51 | if not isinstance(other, type(self)):
52 | return NotImplemented
53 |
54 | are_parser_fn_objects_equal = (
55 | self._get_parser_fn_code() == other._get_parser_fn_code()
56 | )
57 |
58 | are_all_other_parser_attributes_equal = {
59 | k: v for k, v in self.__dict__.items() if k != "_parser_fn"
60 | } == {k: v for k, v in other.__dict__.items() if k != "_parser_fn"}
61 |
62 | return (
63 | are_parser_fn_objects_equal
64 | and are_all_other_parser_attributes_equal
65 | )
66 |
67 | def _get_parser_fn_code(self):
68 | parser_fn = self.__dict__["_parser_fn"]
69 | code = parser_fn.__code__.co_code
70 |
71 | return code
72 |
73 | def __repr__(self) -> str:
74 | return f""
75 |
--------------------------------------------------------------------------------
/pandera/api/base/types.py:
--------------------------------------------------------------------------------
1 | """Base type definitions for pandera."""
2 |
3 | from typing import List, Union
4 |
5 | from pandera.api.checks import Check
6 | from pandera.api.hypotheses import Hypothesis
7 | from pandera.api.parsers import Parser
8 |
9 | try:
10 | # python 3.8+
11 | from typing import Literal # type: ignore[attr-defined]
12 | except ImportError: # pragma: no cover
13 | from typing_extensions import Literal # type: ignore[assignment]
14 |
15 |
16 | StrictType = Union[bool, Literal["filter"]]
17 | CheckList = Union[Check, List[Union[Check, Hypothesis]]]
18 | ParserList = Union[Parser, List[Parser]]
19 |
--------------------------------------------------------------------------------
/pandera/api/dataframe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/dataframe/__init__.py
--------------------------------------------------------------------------------
/pandera/api/function_dispatch.py:
--------------------------------------------------------------------------------
1 | """Multidispatcher implementation."""
2 |
3 | from inspect import signature
4 | from typing import Callable, Dict, Tuple, Type, Union
5 | import typing_inspect
6 |
7 |
8 | class Dispatcher:
9 | """Dispatch implementation."""
10 |
11 | def __init__(self):
12 | self._function_registry: Dict[Type, Callable] = {}
13 | self._name = None
14 |
15 | def register(self, fn):
16 | # Get function signature
17 | self._name = fn.__name__
18 | data_types = get_first_arg_type(fn)
19 | for data_type in data_types:
20 | self._function_registry[data_type] = fn
21 |
22 | def __call__(self, *args, **kwargs):
23 | input_data_type = type(args[0])
24 | fn = self._function_registry[input_data_type]
25 | return fn(*args, **kwargs)
26 |
27 | @property
28 | def co_code(self):
29 | """Method for getting bytecode of all the registered functions."""
30 | _code = b""
31 | for fn in self._function_registry.values():
32 | _code += fn.__code__.co_code
33 | return _code
34 |
35 | @property
36 | def __name__(self):
37 | return f"{self._name}"
38 |
39 | def __str__(self):
40 | return f"{self._name}"
41 |
42 | def __repr__(self):
43 | return f"{self._name}"
44 |
45 |
46 | def get_first_arg_type(fn):
47 | fn_sig = signature(fn)
48 |
49 | # register the check strategy for this particular check, identified
50 | # by the check `name`, and the data type of the check function. This
51 | # supports Union types. Also assume that the data type of the data
52 | # object to validate is the first argument.
53 | data_type = [*fn_sig.parameters.values()][0].annotation
54 |
55 | if typing_inspect.get_origin(data_type) in (tuple, Tuple):
56 | data_type, *_ = typing_inspect.get_args(data_type)
57 |
58 | if typing_inspect.get_origin(data_type) is Union:
59 | data_types = typing_inspect.get_args(data_type)
60 | else:
61 | data_types = (data_type,)
62 |
63 | return data_types
64 |
--------------------------------------------------------------------------------
/pandera/api/pandas/__init__.py:
--------------------------------------------------------------------------------
1 | """Pandas core."""
2 |
--------------------------------------------------------------------------------
/pandera/api/pandas/model_config.py:
--------------------------------------------------------------------------------
1 | """Class-based dataframe model API configuration for pandas."""
2 |
3 | from typing import Optional
4 |
5 | from pandera.api.dataframe.model_config import BaseConfig as _BaseConfig
6 | from pandera.api.pandas.types import PandasDtypeInputTypes
7 |
8 |
9 | class BaseConfig(_BaseConfig): # pylint:disable=R0903
10 | """Define pandas DataFrameSchema-wide options."""
11 |
12 | #: datatype of the dataframe. This overrides the data types specified in
13 | #: any of the fields.
14 | dtype: Optional[PandasDtypeInputTypes] = None
15 |
--------------------------------------------------------------------------------
/pandera/api/polars/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/polars/__init__.py
--------------------------------------------------------------------------------
/pandera/api/polars/model_config.py:
--------------------------------------------------------------------------------
1 | """Class-based dataframe model API configuration for pandas."""
2 |
3 | from typing import Optional
4 |
5 | from pandera.api.dataframe.model_config import BaseConfig as _BaseConfig
6 | from pandera.api.polars.types import PolarsDtypeInputTypes
7 |
8 |
9 | class BaseConfig(_BaseConfig): # pylint:disable=R0903
10 | """Define polars DataFrameSchema-wide options."""
11 |
12 | #: datatype of the dataframe. This overrides the data types specified in
13 | #: any of the fields.
14 | dtype: Optional[PolarsDtypeInputTypes] = None
15 |
--------------------------------------------------------------------------------
/pandera/api/polars/types.py:
--------------------------------------------------------------------------------
1 | """Polars types."""
2 |
3 | from typing import NamedTuple, Union, TypeVar
4 |
5 | import polars as pl
6 |
7 |
8 | class PolarsData(NamedTuple):
9 | lazyframe: pl.LazyFrame
10 | key: str = "*"
11 |
12 |
13 | class CheckResult(NamedTuple):
14 | """Check result for user-defined checks."""
15 |
16 | check_output: pl.LazyFrame
17 | check_passed: pl.LazyFrame
18 | checked_object: pl.LazyFrame
19 | failure_cases: pl.LazyFrame
20 |
21 |
22 | PolarsCheckObjects = Union[pl.LazyFrame, pl.DataFrame]
23 | PolarsFrame = TypeVar("PolarsFrame", pl.LazyFrame, pl.DataFrame)
24 |
25 | PolarsDtypeInputTypes = Union[
26 | str,
27 | type,
28 | pl.datatypes.classes.DataTypeClass,
29 | ]
30 |
--------------------------------------------------------------------------------
/pandera/api/polars/utils.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=cyclic-import
2 | """Polars validation engine utilities."""
3 |
4 | from typing import Dict, List
5 |
6 | import polars as pl
7 |
8 | from pandera.api.polars.types import PolarsCheckObjects
9 | from pandera.engines.polars_engine import polars_version
10 | from pandera.config import (
11 | ValidationDepth,
12 | get_config_context,
13 | get_config_global,
14 | )
15 |
16 |
17 | def get_lazyframe_schema(lf: pl.LazyFrame) -> Dict[str, pl.DataType]:
18 | """Get a dict of column names and dtypes from a polars LazyFrame."""
19 | if polars_version().release >= (1, 0, 0):
20 | return lf.collect_schema()
21 | return lf.schema
22 |
23 |
24 | def get_lazyframe_column_dtypes(lf: pl.LazyFrame) -> List[pl.DataType]:
25 | """Get a list of column dtypes from a polars LazyFrame."""
26 | if polars_version().release >= (1, 0, 0):
27 | return lf.collect_schema().dtypes()
28 | return [*lf.schema.values()]
29 |
30 |
31 | def get_lazyframe_column_names(lf: pl.LazyFrame) -> List[str]:
32 | """Get a list of column names from a polars LazyFrame."""
33 | if polars_version().release >= (1, 0, 0):
34 | return lf.collect_schema().names()
35 | return lf.columns
36 |
37 |
38 | def get_validation_depth(check_obj: PolarsCheckObjects) -> ValidationDepth:
39 | """Get validation depth for a given polars check object."""
40 | is_dataframe = isinstance(check_obj, pl.DataFrame)
41 |
42 | config_global = get_config_global()
43 | config_ctx = get_config_context(validation_depth_default=None)
44 |
45 | if config_ctx.validation_depth is not None:
46 | # use context configuration if specified
47 | return config_ctx.validation_depth
48 |
49 | if config_global.validation_depth is not None:
50 | # use global configuration if specified
51 | return config_global.validation_depth
52 |
53 | if (
54 | isinstance(check_obj, pl.LazyFrame)
55 | and config_global.validation_depth is None
56 | ):
57 | # if global validation depth is not set, use schema only validation
58 | # when validating LazyFrames
59 | validation_depth = ValidationDepth.SCHEMA_ONLY
60 | elif is_dataframe and (
61 | config_ctx.validation_depth is None
62 | or config_ctx.validation_depth is None
63 | ):
64 | # if context validation depth is not set, use schema and data validation
65 | # when validating DataFrames
66 | validation_depth = ValidationDepth.SCHEMA_AND_DATA
67 | else:
68 | validation_depth = ValidationDepth.SCHEMA_ONLY
69 |
70 | return validation_depth
71 |
--------------------------------------------------------------------------------
/pandera/api/pyspark/__init__.py:
--------------------------------------------------------------------------------
1 | """PySpark native core."""
2 |
3 | from pandera.api.pyspark.components import Column
4 | from pandera.api.pyspark.container import DataFrameSchema
5 |
--------------------------------------------------------------------------------
/pandera/api/pyspark/model_config.py:
--------------------------------------------------------------------------------
1 | """Class-based dataframe model API configuration for pyspark."""
2 |
3 | from typing import Any, Callable, Dict, List, Optional, Union
4 |
5 | from pandera.api.base.model_config import BaseModelConfig
6 | from pandera.api.base.types import StrictType
7 | from pandera.api.pyspark.types import PySparkDtypeInputTypes
8 | from pandera.typing.formats import Format
9 |
10 |
11 | class BaseConfig(BaseModelConfig): # pylint:disable=R0903
12 | """Define DataFrameSchema-wide options.
13 |
14 | *new in 0.16.0*
15 | """
16 |
17 | #: datatype of the dataframe. This overrides the data types specified in
18 | #: any of the fields.
19 | dtype: Optional[PySparkDtypeInputTypes] = None
20 |
21 | name: Optional[str] = None #: name of schema
22 | title: Optional[str] = None #: human-readable label for schema
23 | description: Optional[str] = None #: arbitrary textual description
24 | coerce: bool = False #: coerce types of all schema components
25 |
26 | #: make sure certain column combinations are unique
27 | unique: Optional[Union[str, List[str]]] = None
28 |
29 | #: make sure all specified columns are in the validated dataframe -
30 | #: if ``"filter"``, removes columns not specified in the schema
31 | strict: StrictType = False
32 |
33 | ordered: bool = False #: validate columns order
34 |
35 | #: make sure dataframe column names are unique
36 | unique_column_names: bool = False
37 |
38 | #: data format before validation. This option only applies to
39 | #: schemas used in the context of the pandera type constructor
40 | #: ``pa.typing.DataFrame[Schema](data)``. If None, assumes a data structure
41 | #: compatible with the ``pyspark.sql.DataFrame`` constructor.
42 | from_format: Optional[Union[Format, Callable]] = None
43 |
44 | #: a dictionary of keyword arguments to pass into the reader function that
45 | #: converts the object of type ``from_format`` to a pandera-validate-able
46 | #: data structure. The reader function is implemented in the pandera.typing
47 | #: generic types via the ``from_format`` and ``to_format`` methods.
48 | from_format_kwargs: Optional[Dict[str, Any]] = None
49 |
50 | #: data format to serialize into after validation. This option only applies
51 | #: to schemas used in the context of the pandera type constructor
52 | #: ``pa.typing.DataFrame[Schema](data)``. If None, returns a dataframe.
53 | to_format: Optional[Union[Format, Callable]] = None
54 |
55 | #: Buffer to be provided when to_format is a custom callable. See docs for
56 | #: an example of how to implement a to_format function.
57 | to_format_buffer: Optional[Union[str, Callable]] = None
58 |
59 | #: a dictionary of keyword arguments to pass into the writer function that
60 | #: converts the pandera-validate-able object to type ``to_format``.
61 | #: The writer function is implemented in the pandera.typing
62 | #: generic types via the ``from_format`` and ``to_format`` methods.
63 | to_format_kwargs: Optional[Dict[str, Any]] = None
64 |
65 | #: a dictionary object to store key-value data at schema level
66 | metadata: Optional[dict] = None
67 |
--------------------------------------------------------------------------------
/pandera/api/pyspark/types.py:
--------------------------------------------------------------------------------
1 | """Utility functions for pyspark validation."""
2 |
3 | from functools import lru_cache
4 | from typing import List, NamedTuple, Tuple, Type, Union
5 | from numpy import bool_ as np_bool
6 | from packaging import version
7 |
8 | import pyspark.sql.types as pst
9 | from pyspark.sql import DataFrame
10 |
11 | import pyspark
12 | from pandera.api.checks import Check
13 | from pandera.dtypes import DataType
14 |
15 | # pylint: disable=reimported
16 | # Handles optional Spark Connect imports for pyspark>=3.4 (if available)
17 | if version.parse(pyspark.__version__) >= version.parse("3.4"):
18 | from pyspark.sql.connect.dataframe import DataFrame as psc_DataFrame
19 | from pyspark.sql.connect.group import GroupedData
20 | else:
21 | from pyspark.sql import (
22 | DataFrame as psc_DataFrame,
23 | )
24 | from pyspark.sql.group import GroupedData
25 |
26 | DataFrameTypes = Union[DataFrame, psc_DataFrame]
27 | GroupbyObject = GroupedData
28 |
29 | CheckList = Union[Check, List[Check]]
30 |
31 | PysparkDefaultTypes = Union[
32 | pst.BooleanType,
33 | pst.StringType,
34 | pst.IntegerType,
35 | pst.DecimalType,
36 | pst.FloatType,
37 | pst.DateType,
38 | pst.TimestampType,
39 | pst.DoubleType,
40 | pst.ShortType,
41 | pst.ByteType,
42 | pst.LongType,
43 | pst.BinaryType,
44 | ]
45 |
46 | PySparkDtypeInputTypes = Union[
47 | str,
48 | int,
49 | float,
50 | bool,
51 | type,
52 | DataType,
53 | Type,
54 | pst.BooleanType,
55 | pst.StringType,
56 | pst.IntegerType,
57 | pst.DecimalType,
58 | pst.FloatType,
59 | pst.DateType,
60 | pst.TimestampType,
61 | pst.DoubleType,
62 | pst.ShortType,
63 | pst.ByteType,
64 | pst.LongType,
65 | pst.BinaryType,
66 | ]
67 |
68 |
69 | class SupportedTypes(NamedTuple):
70 | table_types: Tuple[type, ...]
71 |
72 |
73 | class PysparkDataframeColumnObject(NamedTuple):
74 | """Pyspark Object which holds dataframe and column value in a named tuble"""
75 |
76 | dataframe: DataFrameTypes
77 | column_name: str
78 |
79 |
80 | @lru_cache(maxsize=None)
81 | def supported_types() -> SupportedTypes:
82 | """Get the types supported by pandera schemas."""
83 | # pylint: disable=import-outside-toplevel
84 | table_types = [DataFrame]
85 |
86 | try:
87 | table_types.append(DataFrame)
88 | table_types.append(psc_DataFrame)
89 |
90 | except ImportError: # pragma: no cover
91 | pass
92 |
93 | return SupportedTypes(
94 | tuple(table_types),
95 | )
96 |
97 |
98 | def is_table(obj):
99 | """Verifies whether an object is table-like.
100 |
101 | Where a table is a 2-dimensional data matrix of rows and columns, which
102 | can be indexed in multiple different ways.
103 | """
104 | return isinstance(obj, supported_types().table_types)
105 |
106 |
107 | def is_bool(x):
108 | """Verifies whether an object is a boolean type."""
109 | return isinstance(x, (bool, type(pst.BooleanType()), np_bool))
110 |
--------------------------------------------------------------------------------
/pandera/backends/__init__.py:
--------------------------------------------------------------------------------
1 | """Pandera backends."""
2 |
--------------------------------------------------------------------------------
/pandera/backends/base/builtin_checks.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-function-docstring
2 | """Built-in check functions base implementation.
3 |
4 | This module contains check function abstract definitions that correspond to
5 | the pandera.api.base.checks.Check methods. These functions do not actually
6 | implement any validation logic and serve as the entrypoint for dispatching
7 | specific implementations based on the data object type, e.g.
8 | `pandas.DataFrame`s.
9 | """
10 |
11 | import re
12 | from typing import Any, Iterable, Optional, TypeVar, Union
13 |
14 | from pandera.api.checks import Check
15 |
16 | T = TypeVar("T")
17 |
18 |
19 | @Check.register_builtin_check_fn
20 | def equal_to(data: Any, value: Any) -> Any:
21 | raise NotImplementedError
22 |
23 |
24 | @Check.register_builtin_check_fn
25 | def not_equal_to(data: Any, value: Any) -> Any:
26 | raise NotImplementedError
27 |
28 |
29 | @Check.register_builtin_check_fn
30 | def greater_than(data: Any, min_value: Any) -> Any:
31 | raise NotImplementedError
32 |
33 |
34 | @Check.register_builtin_check_fn
35 | def greater_than_or_equal_to(data: Any, min_value: Any) -> Any:
36 | raise NotImplementedError
37 |
38 |
39 | @Check.register_builtin_check_fn
40 | def less_than(data: Any, max_value: Any) -> Any:
41 | raise NotImplementedError
42 |
43 |
44 | @Check.register_builtin_check_fn
45 | def less_than_or_equal_to(data: Any, max_value: Any) -> Any:
46 | raise NotImplementedError
47 |
48 |
49 | @Check.register_builtin_check_fn
50 | def in_range(
51 | data: Any,
52 | min_value: T,
53 | max_value: T,
54 | include_min: bool = True,
55 | include_max: bool = True,
56 | ) -> Any:
57 | raise NotImplementedError
58 |
59 |
60 | @Check.register_builtin_check_fn
61 | def isin(data: Any, allowed_values: Iterable) -> Any:
62 | raise NotImplementedError
63 |
64 |
65 | @Check.register_builtin_check_fn
66 | def notin(data: Any, forbidden_values: Iterable) -> Any:
67 | raise NotImplementedError
68 |
69 |
70 | @Check.register_builtin_check_fn
71 | def str_matches(data: Any, pattern: Union[str, re.Pattern]) -> Any:
72 | raise NotImplementedError
73 |
74 |
75 | @Check.register_builtin_check_fn
76 | def str_contains(data: Any, pattern: Union[str, re.Pattern]) -> Any:
77 | raise NotImplementedError
78 |
79 |
80 | @Check.register_builtin_check_fn
81 | def str_startswith(data: Any, string: str) -> Any:
82 | raise NotImplementedError
83 |
84 |
85 | @Check.register_builtin_check_fn
86 | def str_endswith(data: Any, string: str) -> Any:
87 | raise NotImplementedError
88 |
89 |
90 | @Check.register_builtin_check_fn
91 | def str_length(
92 | data: Any,
93 | min_value: Optional[int] = None,
94 | max_value: Optional[int] = None,
95 | ) -> Any:
96 | raise NotImplementedError
97 |
98 |
99 | @Check.register_builtin_check_fn
100 | def unique_values_eq(data: Any, values: Iterable) -> Any:
101 | raise NotImplementedError
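To make the dispatch described in the module docstring concrete, a hedged sketch of the user-facing path: the abstract entrypoints above are never called directly; a built-in check factory produces a Check whose validation is routed to the implementation registered for the validated object's type (pandas in this sketch).

import pandas as pd
import pandera as pa

# greater_than_or_equal_to above only defines the dispatch entrypoint; the
# pandas backend supplies the concrete implementation at validation time.
schema = pa.SeriesSchema(int, checks=pa.Check.greater_than_or_equal_to(0))
schema.validate(pd.Series([0, 1, 2]))  # passes; a negative value would raise SchemaError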
102 |
--------------------------------------------------------------------------------
/pandera/backends/base/builtin_hypotheses.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-function-docstring
2 | """Built-in hypothesis functions base implementation.
3 |
4 | This module contains hypothesis function abstract definitions that
5 | correspond to the pandera.api.base.checks.Check methods. These functions do not
6 | actually implement any validation logic and serve as the entrypoint for
7 | dispatching specific implementations based on the data object type, e.g.
8 | `pandas.DataFrame`s.
9 | """
10 |
11 | from typing import Any, Tuple
12 |
13 | from pandera.api.hypotheses import Hypothesis
14 |
15 |
16 | @Hypothesis.register_builtin_check_fn
17 | def two_sample_ttest(
18 | *samples: Tuple[Any, ...],
19 | equal_var: bool = True,
20 | nan_policy: str = "propagate",
21 | ):
22 | raise NotImplementedError
23 |
24 |
25 | @Hypothesis.register_builtin_check_fn
26 | def one_sample_ttest(
27 | *samples: Tuple[Any, ...],
28 | popmean: float,
29 | nan_policy: str = "propagate",
30 | ):
31 | raise NotImplementedError
32 |
--------------------------------------------------------------------------------
/pandera/backends/pandas/__init__.py:
--------------------------------------------------------------------------------
1 | """Pandas backend implementation for schemas and checks."""
2 |
--------------------------------------------------------------------------------
/pandera/backends/pandas/builtin_hypotheses.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-function-docstring
2 | """Pandas implementation of built-in hypotheses."""
3 |
4 | from typing import Tuple
5 |
6 | from pandera.api.extensions import register_builtin_hypothesis
7 | from pandera.backends.pandas.builtin_checks import PandasData
8 |
9 |
10 | @register_builtin_hypothesis(
11 | error="failed two sample ttest between '{sample1}' and '{sample2}'",
12 | samples_kwtypes={"sample1": str, "sample2": str},
13 | )
14 | def two_sample_ttest(
15 | *samples: Tuple[PandasData, ...],
16 | equal_var: bool = True,
17 | nan_policy: str = "propagate",
18 | ) -> Tuple[float, float]:
19 | from scipy import stats # pylint: disable=import-outside-toplevel
20 |
21 | assert (
22 | len(samples) == 2
23 | ), "Expected two sample ttest data to contain exactly two samples"
24 | return stats.ttest_ind(
25 | samples[0],
26 | samples[1],
27 | equal_var=equal_var,
28 | nan_policy=nan_policy,
29 | )
30 |
31 |
32 | @register_builtin_hypothesis(
33 | error="failed one sample ttest for column '{sample}'",
34 | samples_kwtypes={"sample": str},
35 | )
36 | def one_sample_ttest(
37 | *samples: Tuple[PandasData, ...],
38 | popmean: float,
39 | nan_policy: str = "propagate",
40 | ) -> Tuple[float, float]:
41 | from scipy import stats # pylint: disable=import-outside-toplevel
42 |
43 | assert (
44 | len(samples) == 1
45 | ), "Expected one sample ttest data to contain only one sample"
46 | return stats.ttest_1samp(
47 | samples[0], popmean=popmean, nan_policy=nan_policy
48 | )
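A hedged schema-level sketch of how these hypotheses are typically invoked (scipy must be installed; the column names and data are illustrative):

import pandas as pd
import pandera as pa

schema = pa.DataFrameSchema({
    "height": pa.Column(
        float,
        checks=pa.Hypothesis.two_sample_ttest(
            sample1="M",
            sample2="F",
            groupby="sex",
            relationship="equal",
            alpha=0.05,
        ),
    ),
    "sex": pa.Column(str),
})

schema.validate(
    pd.DataFrame({"height": [5.6, 6.0, 5.5, 5.9], "sex": ["M", "M", "F", "F"]})
)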
49 |
--------------------------------------------------------------------------------
/pandera/backends/pandas/parsers.py:
--------------------------------------------------------------------------------
1 | """Parser backend for pandas"""
2 |
3 | from functools import partial
4 | from typing import Dict, Optional, Union
5 |
6 | import pandas as pd
7 |
8 | from pandera.api.base.parsers import ParserResult
9 | from pandera.api.pandas.types import is_field, is_table
10 | from pandera.api.parsers import Parser
11 | from pandera.backends.base import BaseParserBackend
12 |
13 |
14 | class PandasParserBackend(BaseParserBackend):
15 | """Parser backend of pandas."""
16 |
17 | def __init__(self, parser: Parser):
18 | """Initializes a parser backend object."""
19 | super().__init__(parser)
20 | assert parser._parser_fn is not None, "Parser._parser_fn must be set."
21 | self.parser = parser
22 | self.parser_fn = partial(parser._parser_fn, **parser._parser_kwargs)
23 |
24 | def preprocess(
25 | self, parse_obj, key
26 | ) -> pd.Series: # pylint:disable=unused-argument
27 | """Preprocesses a parser object before applying the parse function."""
28 | if is_table(parse_obj) and key is not None:
29 | return self.preprocess_table_with_key(parse_obj, key)
30 | elif is_table(parse_obj) and key is None:
31 | return self.preprocess_table(parse_obj)
32 | else:
33 | return parse_obj
34 |
35 | def preprocess_table_with_key(
36 | self,
37 | parse_obj,
38 | key,
39 | ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
40 | return parse_obj[key]
41 |
42 | def preprocess_table(
43 | self, parse_obj
44 | ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
45 | return parse_obj
46 |
47 | def apply(self, parse_obj):
48 | """Apply the parse function to a parser object."""
49 | if is_field(parse_obj):
50 | return self.apply_field(parse_obj)
51 | elif is_table(parse_obj):
52 | return self.apply_table(parse_obj)
53 | else:
54 | raise NotImplementedError
55 |
56 | def apply_field(self, parse_obj):
57 | if self.parser.element_wise:
58 | return parse_obj.map(self.parser_fn)
59 | return self.parser_fn(parse_obj)
60 |
61 | def apply_table(self, parse_obj):
62 | if self.parser.element_wise:
63 | return getattr(parse_obj, "map", parse_obj.applymap)(
64 | self.parser_fn
65 | )
66 | return self.parser_fn(parse_obj)
67 |
68 | def postprocess(
69 | self,
70 | parse_obj,
71 | parser_output,
72 | ) -> ParserResult:
73 | """Postprocesses the result of applying the parser function."""
74 | return ParserResult(
75 | parser_output=parser_output, parsed_object=parse_obj
76 | )
77 |
78 | def __call__(
79 | self,
80 | parse_obj: Union[pd.Series, pd.DataFrame],
81 | key: Optional[str] = None,
82 | ):
83 | parse_obj = self.preprocess(parse_obj, key)
84 | parser_output = self.apply(parse_obj)
85 | return self.postprocess(parse_obj, parser_output)
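For context, a small sketch of the user-facing Parser API that this backend executes (assuming a pandera version that exposes pa.Parser; names and data are illustrative):

import pandas as pd
import pandera as pa

# Parsers transform the data before checks run; this backend applies them to
# whole tables, single fields, or element-wise when element_wise=True is set.
schema = pa.DataFrameSchema({
    "price": pa.Column(float, parsers=pa.Parser(lambda s: s.clip(lower=0.0))),
})
schema.validate(pd.DataFrame({"price": [-1.0, 2.5]}))  # -1.0 is clipped to 0.0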
86 |
--------------------------------------------------------------------------------
/pandera/backends/pandas/register.py:
--------------------------------------------------------------------------------
1 | """Register pandas backends."""
2 |
3 | from functools import lru_cache
4 | from typing import Optional
5 |
6 | from pandera.backends.pandas.array import SeriesSchemaBackend
7 | from pandera.backends.pandas.checks import PandasCheckBackend
8 | from pandera.backends.pandas.components import (
9 | ColumnBackend,
10 | IndexBackend,
11 | MultiIndexBackend,
12 | )
13 | from pandera.backends.pandas.container import DataFrameSchemaBackend
14 | from pandera.backends.pandas.hypotheses import PandasHypothesisBackend
15 | from pandera.backends.pandas.parsers import PandasParserBackend
16 |
17 |
18 | @lru_cache
19 | def register_pandas_backends(
20 | check_cls_fqn: Optional[str] = None,
21 | ): # pylint: disable=unused-argument
22 | """Register pandas backends.
23 |
24 | This function is called at schema initialization in the _register_*_backends
25 | method.
26 |
27 | :param check_cls_fqn: fully qualified name of the check class to register
28 | backends for. This covers the pandas-compatible frameworks "pandas",
29 | "dask", "modin", "pyspark", and "geopandas".
30 | """
31 |
32 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import
33 | from pandera._patch_numpy2 import _patch_numpy2
34 |
35 | _patch_numpy2()
36 |
37 | from pandera.api.checks import Check
38 | from pandera.api.hypotheses import Hypothesis
39 | from pandera.api.pandas.array import SeriesSchema
40 | from pandera.api.pandas.components import Column, Index, MultiIndex
41 | from pandera.api.pandas.container import DataFrameSchema
42 | from pandera.api.parsers import Parser
43 | from pandera.api.pandas.types import get_backend_types
44 |
45 | # NOTE: This registers the deprecated DataFrameSchema class. Remove this
46 | # once the deprecated class is removed.
47 | from pandera._pandas_deprecated import (
48 | DataFrameSchema as _DataFrameSchemaDeprecated,
49 | )
50 |
51 | assert check_cls_fqn is not None, (
52 | "pandas backend registration requires passing in the fully qualified "
53 | "check class name"
54 | )
55 | backend_types = get_backend_types(check_cls_fqn)
56 |
57 | from pandera.backends.pandas import builtin_checks, builtin_hypotheses
58 |
59 | for t in backend_types.check_backend_types:
60 | Check.register_backend(t, PandasCheckBackend)
61 | Hypothesis.register_backend(t, PandasHypothesisBackend)
62 | Parser.register_backend(t, PandasParserBackend)
63 |
64 | for t in backend_types.dataframe_datatypes:
65 | DataFrameSchema.register_backend(t, DataFrameSchemaBackend)
66 | _DataFrameSchemaDeprecated.register_backend(t, DataFrameSchemaBackend)
67 | Column.register_backend(t, ColumnBackend)
68 | MultiIndex.register_backend(t, MultiIndexBackend)
69 | Index.register_backend(t, IndexBackend)
70 |
71 | for t in backend_types.series_datatypes:
72 | SeriesSchema.register_backend(t, SeriesSchemaBackend)
73 | Column.register_backend(t, ColumnBackend)
74 | MultiIndex.register_backend(t, MultiIndexBackend)
75 | Index.register_backend(t, IndexBackend)
76 |
77 | for t in backend_types.index_datatypes:
78 | Index.register_backend(t, IndexBackend)
79 |
80 | for t in backend_types.multiindex_datatypes:
81 | MultiIndex.register_backend(t, MultiIndexBackend)
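As a hedged illustration of what each loop above does: registration associates a container type with the backend class that implements validation for it, so the same schema API can serve pandas, dask, modin, pyspark.pandas, and geopandas objects. The call below mirrors a single iteration of the dataframe loop; it is a sketch, not an additional registration this module performs.

import pandas as pd
from pandera.api.pandas.container import DataFrameSchema
from pandera.backends.pandas.container import DataFrameSchemaBackend

# After this, validating a pd.DataFrame with a DataFrameSchema dispatches to
# DataFrameSchemaBackend.
DataFrameSchema.register_backend(pd.DataFrame, DataFrameSchemaBackend)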
82 |
--------------------------------------------------------------------------------
/pandera/backends/polars/__init__.py:
--------------------------------------------------------------------------------
1 | """Polars backend implementation for schemas and checks."""
2 |
--------------------------------------------------------------------------------
/pandera/backends/polars/error_formatters.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/backends/polars/error_formatters.py
--------------------------------------------------------------------------------
/pandera/backends/polars/register.py:
--------------------------------------------------------------------------------
1 | """Register polars backends."""
2 |
3 | from functools import lru_cache
4 | from typing import Optional
5 |
6 | import polars as pl
7 |
8 |
9 | @lru_cache
10 | def register_polars_backends(
11 | check_cls_fqn: Optional[str] = None,
12 | ): # pylint: disable=unused-argument
13 | """Register polars backends.
14 |
15 | This function is called at schema initialization in the _register_*_backends
16 | method.
17 | """
18 |
19 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import
20 | from pandera.api.checks import Check
21 | from pandera.api.polars.components import Column
22 | from pandera.api.polars.container import DataFrameSchema
23 | from pandera.backends.polars import builtin_checks
24 | from pandera.backends.polars.checks import PolarsCheckBackend
25 | from pandera.backends.polars.components import ColumnBackend
26 | from pandera.backends.polars.container import DataFrameSchemaBackend
27 |
28 | DataFrameSchema.register_backend(pl.LazyFrame, DataFrameSchemaBackend)
29 | DataFrameSchema.register_backend(pl.DataFrame, DataFrameSchemaBackend)
30 | Column.register_backend(pl.LazyFrame, ColumnBackend)
31 | Check.register_backend(pl.LazyFrame, PolarsCheckBackend)
32 |
--------------------------------------------------------------------------------
/pandera/backends/pyspark/__init__.py:
--------------------------------------------------------------------------------
1 | """PySpark native backend implementation for schemas and checks."""
2 |
--------------------------------------------------------------------------------
/pandera/backends/pyspark/checks.py:
--------------------------------------------------------------------------------
1 | """Check backend for pyspark."""
2 |
3 | from functools import partial
4 | from typing import Dict, List, Optional, Union
5 |
6 | from pandera.api.base.checks import CheckResult
7 | from pandera.api.checks import Check
8 | from pandera.api.pyspark.types import (
9 | PysparkDataframeColumnObject,
10 | is_bool,
11 | is_table,
12 | GroupbyObject,
13 | )
14 | from pandera.backends.base import BaseCheckBackend
15 | from pandera.api.pyspark.types import DataFrameTypes
16 |
17 |
18 | class PySparkCheckBackend(BaseCheckBackend):
19 | """Check backend for PySpark."""
20 |
21 | def __init__(self, check: Check):
22 | """Initializes a check backend object."""
23 | super().__init__(check)
24 | assert check._check_fn is not None, "Check._check_fn must be set."
25 | self.check = check
26 | self.check_fn = partial(check._check_fn, **check._check_kwargs)
27 |
28 | def groupby(self, check_obj: DataFrameTypes): # pragma: no cover
29 | """Implements groupby behavior for check object."""
30 | assert self.check.groupby is not None, "Check.groupby must be set."
31 | if isinstance(self.check.groupby, (str, list)):
32 | return check_obj.groupby(self.check.groupby)
33 | return self.check.groupby(check_obj)
34 |
35 | def query(self, check_obj):
36 | """Implements querying behavior to produce subset of check object."""
37 | raise NotImplementedError
38 |
39 | def aggregate(self, check_obj):
40 | """Implements aggregation behavior for check object."""
41 | raise NotImplementedError
42 |
43 | @staticmethod
44 | def _format_groupby_input(
45 | groupby_obj: GroupbyObject,
46 | groups: Optional[List[str]],
47 | ) -> Dict[str, DataFrameTypes]: # pragma: no cover
48 | raise NotImplementedError
49 |
50 | def preprocess(
51 | self,
52 | check_obj: DataFrameTypes,
53 | key: str, # type: ignore [valid-type]
54 | ) -> DataFrameTypes:
55 | return check_obj
56 |
57 | def apply(
58 | self,
59 | check_obj: Union[DataFrameTypes, is_table],
60 | column_name: str = None,
61 | kwargs: dict = None,
62 | ):
63 | if column_name and kwargs:
64 | check_obj_and_col_name = PysparkDataframeColumnObject(
65 | check_obj, column_name
66 | )
67 | return self.check._check_fn(check_obj_and_col_name, **kwargs)
68 |
69 | else:
70 | return self.check_fn(check_obj) # pragma: no cover
71 |
72 | def postprocess(
73 | self,
74 | check_obj: DataFrameTypes,
75 | check_output: is_bool, # type: ignore [valid-type]
76 | ) -> CheckResult:
77 | """Postprocesses the result of applying the check function."""
78 | return CheckResult(
79 | check_output=check_output,
80 | check_passed=check_output,
81 | checked_object=check_obj,
82 | failure_cases=None,
83 | )
84 |
85 | def __call__(
86 | self,
87 | check_obj: DataFrameTypes,
88 | key: Optional[str] = None,
89 | ) -> CheckResult:
90 | check_obj = self.preprocess(check_obj, key)
91 |
92 | check_output = self.apply( # pylint:disable=too-many-function-args
93 | check_obj, key, self.check._check_kwargs
94 | )
95 |
96 | return self.postprocess(check_obj, check_output)
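A hedged end-to-end sketch of the path this backend serves, assuming a local SparkSession can be created (schema, column names, and data are illustrative):

import pyspark.sql.types as T
from pyspark.sql import SparkSession
import pandera.pyspark as pa

spark = SparkSession.builder.master("local[1]").getOrCreate()

schema = pa.DataFrameSchema({"price": pa.Column(T.LongType(), pa.Check.gt(0))})
df = spark.createDataFrame([(5,), (7,)], ["price"])

# pyspark validation collects failures instead of raising; inspect the report:
df_out = schema.validate(df)
print(df_out.pandera.errors)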
97 |
--------------------------------------------------------------------------------
/pandera/backends/pyspark/error_formatters.py:
--------------------------------------------------------------------------------
1 | """Make schema error messages human-friendly."""
2 |
3 |
4 | def format_generic_error_message(
5 | parent_schema,
6 | check,
7 | ) -> str:
8 | """Construct an error message when a check validator fails.
9 |
10 | :param parent_schema: class of schema being validated.
11 | :param check: check that generated error.
12 | """
13 | return f"{parent_schema} failed validation " f"{check.error}"
14 |
15 |
16 | def scalar_failure_case(x) -> dict:
17 | """Construct failure case from a scalar value.
18 |
19 | :param x: a scalar value representing failure case.
20 | :returns: Dictionary used for error reporting with ``SchemaErrors``.
21 | """
22 | return {
23 | "index": [None],
24 | "failure_case": [x],
25 | }
26 |
--------------------------------------------------------------------------------
/pandera/backends/pyspark/register.py:
--------------------------------------------------------------------------------
1 | """Register pyspark backends."""
2 |
3 | from functools import lru_cache
4 | from typing import Optional
5 | from packaging import version
6 |
7 | import pyspark
8 | import pyspark.sql as ps
9 |
10 | # Handles optional Spark Connect imports for pyspark>=3.4 (if available)
11 | CURRENT_PYSPARK_VERSION = version.parse(pyspark.__version__)
12 | if CURRENT_PYSPARK_VERSION >= version.parse("3.4"):
13 | from pyspark.sql.connect import dataframe as psc
14 |
15 |
16 | @lru_cache
17 | def register_pyspark_backends(
18 | check_cls_fqn: Optional[str] = None,
19 | ): # pylint: disable=unused-argument
20 | """Register pyspark backends.
21 |
22 | This function is called at schema initialization in the _register_*_backends
23 | method.
24 | """
25 |
26 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import
27 | from pandera._patch_numpy2 import _patch_numpy2
28 |
29 | _patch_numpy2()
30 |
31 | from pandera.api.checks import Check
32 | from pandera.api.pyspark.column_schema import ColumnSchema
33 | from pandera.api.pyspark.components import Column
34 | from pandera.api.pyspark.container import DataFrameSchema
35 | from pandera.backends.pyspark import builtin_checks
36 | from pandera.backends.pyspark.checks import PySparkCheckBackend
37 | from pandera.backends.pyspark.column import ColumnSchemaBackend
38 | from pandera.backends.pyspark.components import ColumnBackend
39 | from pandera.backends.pyspark.container import DataFrameSchemaBackend
40 |
41 | # Register the classic (non-Connect) pyspark DataFrame
42 | Check.register_backend(ps.DataFrame, PySparkCheckBackend)
43 | ColumnSchema.register_backend(ps.DataFrame, ColumnSchemaBackend)
44 | Column.register_backend(ps.DataFrame, ColumnBackend)
45 | DataFrameSchema.register_backend(ps.DataFrame, DataFrameSchemaBackend)
46 | # Register Spark Connect DataFrame, if available
47 | if CURRENT_PYSPARK_VERSION >= version.parse("3.4"):
48 | Check.register_backend(psc.DataFrame, PySparkCheckBackend)
49 | ColumnSchema.register_backend(psc.DataFrame, ColumnSchemaBackend)
50 | Column.register_backend(psc.DataFrame, ColumnBackend)
51 | DataFrameSchema.register_backend(psc.DataFrame, DataFrameSchemaBackend)
52 |
--------------------------------------------------------------------------------
/pandera/backends/pyspark/utils.py:
--------------------------------------------------------------------------------
1 | """pyspark backend utilities."""
2 |
3 |
4 | def convert_to_list(*args):
5 | """Converts arguments to a list"""
6 | converted_list = []
7 | for arg in args:
8 | if isinstance(arg, list):
9 | converted_list.extend(arg)
10 | else:
11 | converted_list.append(arg)
12 |
13 | return converted_list
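A quick sketch of the flattening behavior (one level only):

assert convert_to_list("a", ["b", "c"], "d") == ["a", "b", "c", "d"]
assert convert_to_list([["x"]]) == [["x"]]  # nested lists are not flattened further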
14 |
--------------------------------------------------------------------------------
/pandera/backends/utils.py:
--------------------------------------------------------------------------------
1 | """Pandas backend utilities."""
2 |
3 | from typing import Union
4 |
5 | from pandera.dtypes import UniqueSettings
6 |
7 |
8 | def convert_uniquesettings(unique: UniqueSettings) -> Union[bool, str]:
9 | """
10 | Converts a UniqueSettings value to the ``keep`` argument accepted by pandas ``.duplicated()``
11 | """
12 | # Default `keep` argument for pandas .duplicated() function
13 | keep_argument: Union[bool, str]
14 | if unique == "exclude_first":
15 | keep_argument = "first"
16 | elif unique == "exclude_last":
17 | keep_argument = "last"
18 | elif unique == "all":
19 | keep_argument = False
20 | else:
21 | raise ValueError(
22 | str(unique) + " is not a recognized report_duplicates value"
23 | )
24 | return keep_argument
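A short sketch of how the returned value feeds into pandas (illustrative data):

import pandas as pd

s = pd.Series([1, 1, 2])
# "exclude_first" -> keep="first": only later occurrences are flagged
assert s.duplicated(keep=convert_uniquesettings("exclude_first")).tolist() == [False, True, False]
# "all" -> keep=False: every member of a duplicate group is flagged
assert s.duplicated(keep=convert_uniquesettings("all")).tolist() == [True, True, False]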
25 |
--------------------------------------------------------------------------------
/pandera/constants.py:
--------------------------------------------------------------------------------
1 | """Pandera constants."""
2 |
3 | CHECK_OUTPUT_KEY = "check_output"
4 | FAILURE_CASE_KEY = "failure_case"
5 |
--------------------------------------------------------------------------------
/pandera/engines/__init__.py:
--------------------------------------------------------------------------------
1 | """Pandera type engines."""
2 |
3 | import pydantic
4 | from packaging import version
5 |
6 |
7 | def pydantic_version():
8 | """Return the pydantic version."""
9 |
10 | return version.parse(pydantic.__version__)
11 |
12 |
13 | PYDANTIC_V2 = pydantic_version().release >= (2, 0, 0)
14 |
--------------------------------------------------------------------------------
/pandera/engines/type_aliases.py:
--------------------------------------------------------------------------------
1 | """Custom type aliases."""
2 |
3 | from typing import Union
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | PandasObject = Union[pd.Series, pd.DataFrame]
10 | PandasExtensionType = pd.core.dtypes.base.ExtensionDtype
11 | PandasDataType = Union[pd.core.dtypes.base.ExtensionDtype, np.dtype, type]
12 |
--------------------------------------------------------------------------------
/pandera/extensions.py:
--------------------------------------------------------------------------------
1 | """Extensions module, for backwards compatibility."""
2 |
3 | # pylint: disable=unused-import
4 | from pandera.api.extensions import (
5 | CheckType,
6 | register_builtin_check,
7 | register_builtin_hypothesis,
8 | register_check_method,
9 | register_check_statistics,
10 | )
11 |
--------------------------------------------------------------------------------
/pandera/external_config.py:
--------------------------------------------------------------------------------
1 | """Configuration for external packages."""
2 |
3 | import os
4 |
5 |
6 | def _set_pyspark_environment_variables():
7 | """Sets environment variables for pyspark."""
8 |
9 | is_spark_local_ip_dirty = False
10 | is_pyarrow_ignore_timezone_dirty = False
11 |
12 | try:
13 | # try importing pyspark to see if it exists. This is important because the
14 | # pandera.typing module defines a Series type that inherits from
15 | # pandas.Series, and pyspark v1+ injects a __getitem__ method to pandas
16 | # Series and DataFrames to support type hinting:
17 | # https://spark.apache.org/docs/3.2.0/api/python/user_guide/pandas_on_spark/typehints.html#type-hinting-with-names
18 | # pylint: disable=unused-import
19 | if os.getenv("SPARK_LOCAL_IP") is None:
20 | is_spark_local_ip_dirty = True
21 | os.environ["SPARK_LOCAL_IP"] = "127.0.0.1"
22 | if os.getenv("PYARROW_IGNORE_TIMEZONE") is None:
23 | is_pyarrow_ignore_timezone_dirty = True
24 | # This can be overridden by the user
25 | os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
26 |
27 | import pyspark.pandas
28 | except (ImportError, ModuleNotFoundError):
29 | pass
30 | finally:
31 | if is_spark_local_ip_dirty:
32 | os.environ.pop("SPARK_LOCAL_IP")
33 | if is_pyarrow_ignore_timezone_dirty:
34 | os.environ.pop("PYARROW_IGNORE_TIMEZONE")
35 |
--------------------------------------------------------------------------------
/pandera/import_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions for importing optional dependencies."""
2 |
3 | from functools import wraps
4 | from typing import Callable, TypeVar, cast
5 |
6 |
7 | F = TypeVar("F", bound=Callable)
8 |
9 |
10 | def strategy_import_error(fn: F) -> F:
11 | """Decorator to generate input error if dependency is missing."""
12 |
13 | @wraps(fn)
14 | def _wrapper(*args, **kwargs):
15 |
16 | try:
17 | # pylint: disable=unused-import
18 | import hypothesis
19 | except ImportError as exc:
20 | raise ImportError(
21 | 'Strategies for generating data requires "hypothesis" to be \n'
22 | "installed. You can install pandera together with the strategies \n"
23 | "dependencies with:\n"
24 | "pip install pandera[strategies]"
25 | ) from exc
26 |
27 | return fn(*args, **kwargs)
28 |
29 | return cast(F, _wrapper)
30 |
--------------------------------------------------------------------------------
/pandera/inspection_utils.py:
--------------------------------------------------------------------------------
1 | """Decorators for integrating pandera into existing data pipelines."""
2 |
3 | from inspect import ismethod
4 | from typing import Callable
5 |
6 |
7 | def _is_like_classmethod(fn: Callable) -> bool:
8 | """A regular method defined on a metaclass behaves the same way as
9 | a method decorated with @classmethod defined on a regular class.
10 |
11 | This function covers both use cases.
12 | """
13 | is_method = ismethod(fn)
14 | return is_method and isinstance(fn.__self__, type) # type: ignore[attr-defined]
15 |
16 |
17 | def is_decorated_classmethod(fn: Callable) -> bool:
18 | """Check if fn is a classmethod declared with the @classmethod decorator.
19 |
20 | Adapted from:
21 | https://stackoverflow.com/questions/19227724/check-if-a-function-uses-classmethod
22 | """
23 | if not _is_like_classmethod(fn):
24 | return False
25 | bound_to = fn.__self__ # type: ignore[attr-defined]
26 | assert isinstance(bound_to, type)
27 | name = fn.__name__
28 | for cls in bound_to.__mro__:
29 | descriptor = vars(cls).get(name)
30 | if descriptor is not None:
31 | return isinstance(descriptor, classmethod)
32 | return False
33 |
34 |
35 | def is_classmethod_from_meta(fn: Callable) -> bool:
36 | """Check if fn is a regular method defined on a metaclass
37 | (which behaves like an @classmethod method defined on a regular class)."""
38 | return not is_decorated_classmethod(fn) and _is_like_classmethod(fn)
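A compact sketch of the distinction these helpers draw (class names are illustrative):

class Meta(type):
    def from_meta(cls):
        """Behaves like a classmethod for instances of this metaclass."""


class MyClass(metaclass=Meta):
    @classmethod
    def decorated(cls):
        """An explicitly decorated classmethod."""


assert is_decorated_classmethod(MyClass.decorated)
assert not is_decorated_classmethod(MyClass.from_meta)
assert is_classmethod_from_meta(MyClass.from_meta)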
39 |
--------------------------------------------------------------------------------
/pandera/io/__init__.py:
--------------------------------------------------------------------------------
1 | """Subpackage for serializing/deserializing pandera schemas to other formats."""
2 |
3 | from pandera.io.pandas_io import (
4 | _deserialize_check_stats,
5 | _deserialize_component_stats,
6 | _format_checks,
7 | _format_index,
8 | _format_script,
9 | _get_dtype_string_alias,
10 | _serialize_check_stats,
11 | _serialize_component_stats,
12 | _serialize_dataframe_stats,
13 | deserialize_schema,
14 | from_frictionless_schema,
15 | from_json,
16 | from_yaml,
17 | serialize_schema,
18 | to_json,
19 | to_script,
20 | to_yaml,
21 | )
22 |
--------------------------------------------------------------------------------
/pandera/polars.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=unused-import
2 | """A flexible and expressive polars validation library for Python."""
3 |
4 | from pandera import errors
5 | from pandera.api.checks import Check
6 | from pandera.api.dataframe.model_components import (
7 | Field,
8 | check,
9 | dataframe_check,
10 | )
11 | from pandera.api.polars.components import Column
12 | from pandera.api.polars.container import DataFrameSchema
13 | from pandera.api.polars.model import DataFrameModel
14 | from pandera.api.polars.types import PolarsData
15 | from pandera.backends.polars.register import register_polars_backends
16 | from pandera.decorators import check_input, check_io, check_output, check_types
17 | from pandera.typing import polars as typing
18 |
19 | register_polars_backends()
20 |
21 |
22 | __all__ = [
23 | "check_input",
24 | "check_io",
25 | "check_output",
26 | "check_types",
27 | "check",
28 | "Check",
29 | "Column",
30 | "dataframe_check",
31 | "DataFrameModel",
32 | "DataFrameSchema",
33 | "errors",
34 | "Field",
35 | "PolarsData",
36 | ]
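A minimal usage sketch of the API re-exported here (assuming polars is installed; the column name and data are illustrative):

import polars as pl
import pandera.polars as pa

schema = pa.DataFrameSchema({"a": pa.Column(int, pa.Check.ge(0))})

# Both eager DataFrames and LazyFrames are supported; LazyFrame validation
# stays lazy and can be collected as usual.
validated = schema.validate(pl.LazyFrame({"a": [1, 2, 3]})).collect()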
37 |
--------------------------------------------------------------------------------
/pandera/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/py.typed
--------------------------------------------------------------------------------
/pandera/pyspark.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=unused-import,wrong-import-position,shadowed-import,reimported
2 | """A flexible and expressive pyspark validation library."""
3 |
4 | from pandera._patch_numpy2 import _patch_numpy2
5 |
6 | _patch_numpy2()
7 |
8 | import pandera.backends.pyspark
9 | from pandera import errors, external_config
10 | from pandera.accessors import pyspark_sql_accessor
11 | from pandera.api.checks import Check
12 | from pandera.api.pyspark import Column, DataFrameSchema
13 | from pandera.api.pyspark.model import DataFrameModel
14 | from pandera.api.pyspark.model_components import Field, check, dataframe_check
15 | from pandera.decorators import check_input, check_io, check_output, check_types
16 | from pandera.dtypes import (
17 | Bool,
18 | Category,
19 | Complex,
20 | Complex64,
21 | Complex128,
22 | Complex256,
23 | DataType,
24 | Date,
25 | DateTime,
26 | Decimal,
27 | Float,
28 | Float16,
29 | Float32,
30 | Float64,
31 | Float128,
32 | Int,
33 | Int8,
34 | Int16,
35 | Int32,
36 | Int64,
37 | String,
38 | Timedelta,
39 | Timestamp,
40 | UInt,
41 | UInt8,
42 | UInt16,
43 | UInt32,
44 | UInt64,
45 | )
46 | from pandera.errors import PysparkSchemaError, SchemaInitError
47 | from pandera.schema_inference.pandas import infer_schema
48 | from pandera.typing import pyspark_sql
49 | from pandera._version import __version__
50 | from pandera.typing import pyspark_sql as typing
51 |
52 |
53 | external_config._set_pyspark_environment_variables()
54 |
55 | __all__ = [
56 | # dtypes
57 | "Bool",
58 | "Category",
59 | "Complex",
60 | "Complex64",
61 | "Complex128",
62 | "Complex256",
63 | "DataType",
64 | "DateTime",
65 | "Float",
66 | "Float16",
67 | "Float32",
68 | "Float64",
69 | "Float128",
70 | "Int",
71 | "Int8",
72 | "Int16",
73 | "Int32",
74 | "Int64",
75 | "String",
76 | "Timedelta",
77 | "Timestamp",
78 | "UInt",
79 | "UInt8",
80 | "UInt16",
81 | "UInt32",
82 | "UInt64",
83 | # checks
84 | "Check",
85 | # decorators
86 | "check_input",
87 | "check_io",
88 | "check_output",
89 | "check_types",
90 | # model
91 | "DataFrameModel",
92 | # model_components
93 | "Field",
94 | "check",
95 | "dataframe_check",
96 | # schema_components
97 | "Column",
98 | # schema_inference
99 | "infer_schema",
100 | # schemas
101 | "DataFrameSchema",
102 | # version
103 | "__version__",
104 | ]
105 |
--------------------------------------------------------------------------------
/pandera/schema_inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/schema_inference/__init__.py
--------------------------------------------------------------------------------
/pandera/schema_inference/pandas.py:
--------------------------------------------------------------------------------
1 | """Module for inferring dataframe/series schema."""
2 |
3 | from typing import overload
4 |
5 | import pandas as pd
6 |
7 | from pandera.api.pandas.array import SeriesSchema
8 | from pandera.api.pandas.components import Column, Index, MultiIndex
9 | from pandera.api.pandas.container import DataFrameSchema
10 | from pandera.schema_statistics.pandas import (
11 | infer_dataframe_statistics,
12 | infer_series_statistics,
13 | parse_check_statistics,
14 | )
15 |
16 |
17 | @overload
18 | def infer_schema(
19 | pandas_obj: pd.Series,
20 | ) -> SeriesSchema: # pragma: no cover
21 | ...
22 |
23 |
24 | @overload
25 | def infer_schema( # type: ignore[misc]
26 | pandas_obj: pd.DataFrame,
27 | ) -> DataFrameSchema: # pragma: no cover
28 | ...
29 |
30 |
31 | def infer_schema(pandas_obj):
32 | """Infer schema for pandas DataFrame or Series object.
33 |
34 | :param pandas_obj: DataFrame or Series object to infer.
35 | :returns: DataFrameSchema or SeriesSchema
36 | :raises: TypeError if pandas_obj is not expected type.
37 | """
38 | if isinstance(pandas_obj, pd.DataFrame):
39 | return infer_dataframe_schema(pandas_obj)
40 | elif isinstance(pandas_obj, pd.Series):
41 | return infer_series_schema(pandas_obj)
42 | else:
43 | raise TypeError(
44 | "pandas_obj type not recognized. Expected a pandas DataFrame or "
45 | f"Series, found {type(pandas_obj)}"
46 | )
47 |
48 |
49 | def _create_index(index_statistics):
50 | index = [
51 | Index(
52 | properties["dtype"],
53 | checks=parse_check_statistics(properties["checks"]),
54 | nullable=properties["nullable"],
55 | name=properties["name"],
56 | )
57 | for properties in index_statistics
58 | ]
59 | if len(index) == 1:
60 | index = index[0] # type: ignore
61 | else:
62 | index = MultiIndex(index) # type: ignore
63 |
64 | return index
65 |
66 |
67 | def infer_dataframe_schema(df: pd.DataFrame) -> DataFrameSchema:
68 | """Infer a DataFrameSchema from a pandas DataFrame.
69 |
70 | :param df: DataFrame object to infer.
71 | :returns: DataFrameSchema
72 | """
73 | df_statistics = infer_dataframe_statistics(df)
74 | schema = DataFrameSchema(
75 | columns={
76 | colname: Column(
77 | properties["dtype"],
78 | checks=parse_check_statistics(properties["checks"]),
79 | nullable=properties["nullable"],
80 | )
81 | for colname, properties in df_statistics["columns"].items()
82 | },
83 | index=_create_index(df_statistics["index"]),
84 | coerce=True,
85 | )
86 | return schema
87 |
88 |
89 | def infer_series_schema(series) -> SeriesSchema:
90 | """Infer a SeriesSchema from a pandas DataFrame.
91 |
92 | :param series: Series object to infer.
93 | :returns: SeriesSchema
94 | """
95 | series_statistics = infer_series_statistics(series)
96 | schema = SeriesSchema(
97 | dtype=series_statistics["dtype"],
98 | checks=parse_check_statistics(series_statistics["checks"]),
99 | nullable=series_statistics["nullable"],
100 | name=series_statistics["name"],
101 | coerce=True,
102 | )
103 | return schema
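Typical usage through the public entrypoint, as a minimal sketch (the inferred schema is meant as a starting draft, hence coerce=True):

import pandas as pd
import pandera as pa

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
schema = pa.infer_schema(df)
print(schema)  # a coercing DataFrameSchema with inferred dtypes and checks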
104 |
--------------------------------------------------------------------------------
/pandera/schema_statistics/__init__.py:
--------------------------------------------------------------------------------
1 | """Module to extract schema statsitics from schema objects."""
2 |
3 | from pandera.schema_statistics.pandas import (
4 | get_dataframe_schema_statistics,
5 | get_index_schema_statistics,
6 | get_series_schema_statistics,
7 | infer_dataframe_statistics,
8 | infer_index_statistics,
9 | infer_series_statistics,
10 | parse_check_statistics,
11 | parse_checks,
12 | )
13 |
--------------------------------------------------------------------------------
/pandera/strategies/__init__.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=unused-import
2 | """Data synthesis strategies for pandera, powered by the hypothesis package."""
3 |
4 | import warnings
5 |
6 | try:
7 | import pandas
8 | from pandera.strategies.pandas_strategies import *
9 | except ImportError:
10 | pass
11 |
--------------------------------------------------------------------------------
/pandera/strategies/base_strategies.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=unused-import
2 | """Base module for `hypothesis`-based strategies for data synthesis."""
3 |
4 | from functools import wraps
5 | from typing import Callable, Dict, Generic, Tuple, Type, TypeVar, cast
6 |
7 | import pandera.backends.base.builtin_checks
8 |
9 |
10 | F = TypeVar("F", bound=Callable)
11 |
12 |
13 | try:
14 | # pylint: disable=unused-import
15 | from hypothesis.strategies import SearchStrategy, composite
16 | except ImportError: # pragma: no cover
17 | T = TypeVar("T")
18 |
19 | # pylint: disable=too-few-public-methods
20 | class SearchStrategy(Generic[T]): # type: ignore
21 | """placeholder type."""
22 |
23 | def composite(fn): # type: ignore
24 | """placeholder composite strategy."""
25 | return fn
26 |
27 | HAS_HYPOTHESIS = False
28 | else:
29 | HAS_HYPOTHESIS = True
30 |
31 |
32 | # This strategy registry maps (check_name, data_type) -> strategy_function
33 | # For example: ("greater_than", pd.DataFrame) -> the greater_than strategy for DataFrames
34 | STRATEGY_DISPATCHER: Dict[Tuple[str, Type], Callable] = {}
35 |
36 |
37 | def strategy_import_error(fn: F) -> F:
38 | """Decorator to generate input error if dependency is missing."""
39 |
40 | @wraps(fn)
41 | def _wrapper(*args, **kwargs):
42 | if not HAS_HYPOTHESIS: # pragma: no cover
43 | raise ImportError(
44 | 'Strategies for generating data requires "hypothesis" to be \n'
45 | "installed. You can install pandera together with the strategies \n"
46 | "dependencies with:\n"
47 | "pip install pandera[strategies]"
48 | )
49 | return fn(*args, **kwargs)
50 |
51 | return cast(F, _wrapper)
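When hypothesis is available, the registered strategies let schema objects synthesize data; a hedged sketch (requires the strategies extra):

import pandera as pa

schema = pa.DataFrameSchema({"a": pa.Column(int, pa.Check.in_range(0, 10))})
print(schema.example(size=3))  # draws a DataFrame satisfying the schema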
52 |
--------------------------------------------------------------------------------
/pandera/system.py:
--------------------------------------------------------------------------------
1 | """Global variables relating to OS."""
2 |
3 | import numpy as np
4 |
5 | # Windows and Mac M1 don't support floats of this precision:
6 | # https://github.com/pandera-dev/pandera/issues/623
7 | FLOAT_128_AVAILABLE = hasattr(np, "float128")
8 |
--------------------------------------------------------------------------------
/pandera/typing/__init__.py:
--------------------------------------------------------------------------------
1 | """Typing module.
2 |
3 | For backwards compatibility, pandas types are exposed to the top-level scope of
4 | the typing module.
5 | """
6 |
7 | from functools import lru_cache
8 | from typing import Set, Type
9 | from pandera.typing.common import AnnotationInfo
10 |
11 | try:
12 | from pandera.typing.pandas import (
13 | DataFrame,
14 | Index,
15 | Series,
16 | Bool,
17 | Category,
18 | Date,
19 | DateTime,
20 | Decimal,
21 | Float,
22 | Float16,
23 | Float32,
24 | Float64,
25 | Int,
26 | Int8,
27 | Int16,
28 | Int32,
29 | Int64,
30 | Object,
31 | String,
32 | Timedelta,
33 | UInt8,
34 | UInt16,
35 | UInt32,
36 | UInt64,
37 | INT8,
38 | INT16,
39 | INT32,
40 | INT64,
41 | UINT8,
42 | UINT16,
43 | UINT32,
44 | UINT64,
45 | STRING,
46 | )
47 | except ImportError:
48 | pass
49 |
50 |
51 | @lru_cache
52 | def get_dataframe_types():
53 | from pandera.typing import (
54 | dask,
55 | geopandas,
56 | modin,
57 | pyspark,
58 | pyspark_sql,
59 | )
60 |
61 | dataframe_types: Set[Type] = {DataFrame}
62 | if dask.DASK_INSTALLED:
63 | dataframe_types.update({dask.DataFrame})
64 |
65 | if modin.MODIN_INSTALLED:
66 | dataframe_types.update({modin.DataFrame})
67 |
68 | if pyspark.PYSPARK_INSTALLED:
69 | dataframe_types.update({pyspark.DataFrame})
70 |
71 | if pyspark_sql.PYSPARK_SQL_INSTALLED:
72 | dataframe_types.update({pyspark_sql.DataFrame})
73 |
74 | if geopandas.GEOPANDAS_INSTALLED:
75 | dataframe_types.update({geopandas.GeoDataFrame})
76 |
77 | return dataframe_types
78 |
79 |
80 | @lru_cache
81 | def get_series_types():
82 | from pandera.typing import (
83 | dask,
84 | geopandas,
85 | modin,
86 | pyspark,
87 | )
88 |
89 | series_types: Set[Type] = {Series}
90 | if dask.DASK_INSTALLED:
91 | series_types.update({dask.Series})
92 |
93 | if modin.MODIN_INSTALLED:
94 | series_types.update({modin.Series})
95 |
96 | if pyspark.PYSPARK_INSTALLED:
97 | series_types.update({pyspark.Series})
98 |
99 | if geopandas.GEOPANDAS_INSTALLED:
100 | series_types.update({geopandas.GeoSeries})
101 |
102 | return series_types
103 |
104 |
105 | @lru_cache
106 | def get_index_types():
107 | from pandera.typing import dask, modin, pyspark
108 |
109 | index_types: Set[Type] = {Index}
110 | if dask.DASK_INSTALLED:
111 | index_types.update({dask.Index})
112 |
113 | if modin.MODIN_INSTALLED:
114 | index_types.update({modin.Index})
115 |
116 | if pyspark.PYSPARK_INSTALLED:
117 | index_types.update({pyspark.Index}) # type: ignore [arg-type]
118 |
119 | return index_types
120 |
121 |
122 | __all__ = [
123 | "AnnotationInfo",
124 | "DataFrame",
125 | "Series",
126 | "Index",
127 | "get_dataframe_types",
128 | "get_index_types",
129 | "get_series_types",
130 | ]
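A brief sketch of how these exported annotations combine with runtime validation (model and function names are illustrative):

import pandas as pd
import pandera as pa
from pandera.typing import DataFrame, Series


class Model(pa.DataFrameModel):
    a: Series[int]


@pa.check_types
def transform(df: DataFrame[Model]) -> DataFrame[Model]:
    return df


transform(pd.DataFrame({"a": [1, 2]}))  # validated on the way in and out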
131 |
--------------------------------------------------------------------------------
/pandera/typing/dask.py:
--------------------------------------------------------------------------------
1 | """Pandera type annotations for Dask."""
2 |
3 | from typing import TYPE_CHECKING, Generic, TypeVar
4 |
5 | from pandera.typing.common import DataFrameBase, IndexBase, SeriesBase
6 | from pandera.typing.pandas import DataFrameModel, GenericDtype
7 |
8 | try:
9 | import dask.dataframe as dd
10 |
11 | DASK_INSTALLED = True
12 | except ImportError:
13 | DASK_INSTALLED = False
14 |
15 |
16 | # pylint:disable=invalid-name
17 | if TYPE_CHECKING:
18 | T = TypeVar("T") # pragma: no cover
19 | else:
20 | T = DataFrameModel
21 |
22 |
23 | if DASK_INSTALLED:
24 | # pylint: disable=too-few-public-methods,abstract-method
25 | class DataFrame(DataFrameBase, dd.DataFrame, Generic[T]):
26 | """
27 | Representation of dask.dataframe.DataFrame, only used for type
28 | annotation.
29 |
30 | *new in 0.8.0*
31 | """
32 |
33 | # pylint:disable=too-few-public-methods
34 | class Series(SeriesBase, dd.Series, Generic[GenericDtype]): # type: ignore
35 | """Representation of pandas.Series, only used for type annotation.
36 |
37 | *new in 0.8.0*
38 | """
39 |
40 | # pylint:disable=too-few-public-methods
41 | class Index(IndexBase, dd.Index, Generic[GenericDtype]):
42 | """Representation of pandas.Index, only used for type annotation.
43 |
44 | *new in 0.8.0*
45 | """
46 |
--------------------------------------------------------------------------------
/pandera/typing/formats.py:
--------------------------------------------------------------------------------
1 | """Serialization formats for dataframes."""
2 |
3 | from enum import Enum
4 | from typing import Union
5 |
6 | try:
7 | # python 3.8+
8 | from typing import Literal # type: ignore[attr-defined]
9 | except ImportError: # pragma: no cover
10 | from typing_extensions import Literal # type: ignore[assignment]
11 |
12 |
13 | class Formats(Enum):
14 | """Data container serialization formats.
15 |
16 | The values of this enum specify the valid values taken by the ``to_format``
17 | and ``from_format`` attributes in
18 | :py:class:`~pandera.typing.config.BaseConfig` when specifying a
19 | :py:class:`~pandera.api.pandas.model.DataFrameModel`.
20 | """
21 |
22 | # pylint: disable=invalid-name
23 |
24 | #: comma-separated values file
25 | csv = "csv"
26 |
27 | #: python dictionary
28 | dict = "dict"
29 |
30 | #: json file
31 | json = "json"
32 |
33 | #: feather file format. See
34 | #: `here `__ for more
35 | #: details
36 | feather = "feather"
37 |
38 | #: parquet file format. See `here `__ for more
39 | #: details
40 | parquet = "parquet"
41 |
42 | #: python pickle file format
43 | pickle = "pickle"
44 |
45 | #: python json_normalize
46 | json_normalize = "json_normalize"
47 |
48 |
49 | Format = Union[
50 | Literal[Formats.csv],
51 | Literal[Formats.dict],
52 | Literal[Formats.json],
53 | Literal[Formats.feather],
54 | Literal[Formats.parquet],
55 | Literal[Formats.pickle],
56 | Literal[Formats.json_normalize],
57 | ]
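A hedged sketch of where these values are used: the from_format/to_format options of a DataFrameModel config accept the formats enumerated above, letting check_types-decorated functions deserialize inputs and serialize outputs automatically (model and function names are illustrative).

import pandera as pa
from pandera.typing import DataFrame, Series


class InSchema(pa.DataFrameModel):
    a: Series[int]

    class Config:
        from_format = "dict"  # accept a dict and convert it before validating


@pa.check_types
def ingest(data: DataFrame[InSchema]) -> DataFrame[InSchema]:
    return data


ingest({"a": [1, 2, 3]})  # converted from a dict, then validated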
58 |
--------------------------------------------------------------------------------
/pandera/typing/modin.py:
--------------------------------------------------------------------------------
1 | """Pandera type annotations for Modin."""
2 |
3 | from typing import TYPE_CHECKING, Generic, TypeVar
4 |
5 | from packaging import version
6 |
7 | from pandera.typing.common import DataFrameBase, IndexBase, SeriesBase
8 | from pandera.typing.pandas import DataFrameModel, GenericDtype
9 |
10 | try:
11 | import modin
12 | import modin.pandas as mpd
13 |
14 | MODIN_INSTALLED = True
15 | except ImportError:
16 | MODIN_INSTALLED = False
17 |
18 |
19 | def modin_version():
20 | """Return the modin version."""
21 | return version.parse(modin.__version__)
22 |
23 |
24 | # pylint:disable=invalid-name
25 | if TYPE_CHECKING:
26 | T = TypeVar("T") # pragma: no cover
27 | else:
28 | T = DataFrameModel
29 |
30 |
31 | if MODIN_INSTALLED:
32 | # pylint: disable=too-few-public-methods
33 | class DataFrame(DataFrameBase, mpd.DataFrame, Generic[T]):
34 | """
35 | Representation of modin.pandas.DataFrame, only used for type
36 | annotation.
37 |
38 | *new in 0.8.0*
39 | """
40 |
41 | # pylint:disable=too-few-public-methods,abstract-method
42 | class Series(SeriesBase, mpd.Series, Generic[GenericDtype]):
43 | """Representation of pandas.Series, only used for type annotation.
44 |
45 | *new in 0.8.0*
46 | """
47 |
48 | # pylint:disable=too-few-public-methods,abstract-method
49 | class Index(IndexBase, mpd.Index, Generic[GenericDtype]):
50 | """Representation of pandas.Index, only used for type annotation.
51 |
52 | *new in 0.8.0*
53 | """
54 |
--------------------------------------------------------------------------------
/pandera/typing/pyspark.py:
--------------------------------------------------------------------------------
1 | """Pandera type annotations for Pyspark Pandas."""
2 |
3 | from typing import TYPE_CHECKING, Generic, TypeVar
4 |
5 | from pandera.typing.common import (
6 | DataFrameBase,
7 | GenericDtype,
8 | IndexBase,
9 | SeriesBase,
10 | )
11 | from pandera.typing.pandas import DataFrameModel, _GenericAlias
12 |
13 | try:
14 | import pyspark.pandas as ps
15 |
16 | PYSPARK_INSTALLED = True
17 | except ImportError: # pragma: no cover
18 | PYSPARK_INSTALLED = False
19 |
20 |
21 | # pylint:disable=invalid-name
22 | if TYPE_CHECKING:
23 | T = TypeVar("T") # pragma: no cover
24 | else:
25 | T = DataFrameModel
26 |
27 |
28 | if PYSPARK_INSTALLED:
29 | # pylint: disable=too-few-public-methods,arguments-renamed
30 | class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]):
31 | """
32 | Representation of pyspark.pandas.DataFrame, only used for type
33 | annotation.
34 |
35 | *new in 0.8.0*
36 | """
37 |
38 | def __class_getitem__(cls, item):
39 | """Define this to override's pyspark.pandas generic type."""
40 | return _GenericAlias(cls, item)
41 |
42 | # pylint:disable=too-few-public-methods,arguments-renamed
43 | class Series(SeriesBase, ps.Series, Generic[GenericDtype]): # type: ignore [misc] # noqa
44 | """Representation of pandas.Series, only used for type annotation.
45 |
46 | *new in 0.8.0*
47 | """
48 |
49 | def __class_getitem__(cls, item):
50 | """Define this to override pyspark.pandas generic type"""
51 | return _GenericAlias(cls, item)
52 |
53 | # pylint:disable=too-few-public-methods
54 | class Index(IndexBase, ps.Index, Generic[GenericDtype]):
55 | """Representation of pandas.Index, only used for type annotation.
56 |
57 | *new in 0.8.0*
58 | """
59 |
--------------------------------------------------------------------------------
/pandera/typing/pyspark_sql.py:
--------------------------------------------------------------------------------
1 | """Pandera type annotations for Pyspark."""
2 |
3 | from typing import TypeVar, Union
4 |
5 | from pandera.typing.common import DataFrameBase
6 | from pandera.typing.pandas import DataFrameModel, _GenericAlias
7 |
8 | try:
9 | import pyspark.sql as ps
10 |
11 | PYSPARK_SQL_INSTALLED = True
12 | except ImportError: # pragma: no cover
13 | PYSPARK_SQL_INSTALLED = False
14 |
15 | if PYSPARK_SQL_INSTALLED:
16 | from pandera.engines import pyspark_engine
17 |
18 | PysparkString = pyspark_engine.String
19 | PysparkInt = pyspark_engine.Int
20 | PysparkLongInt = pyspark_engine.BigInt
21 | PysparkShortInt = pyspark_engine.ShortInt
22 | PysparkByteInt = pyspark_engine.ByteInt
23 | PysparkDouble = pyspark_engine.Double
24 | PysparkFloat = pyspark_engine.Float
25 | PysparkDecimal = pyspark_engine.Decimal
26 | PysparkDate = pyspark_engine.Date
27 | PysparkTimestamp = pyspark_engine.Timestamp
28 | PysparkBinary = pyspark_engine.Binary
29 |
30 | PysparkDType = TypeVar( # type: ignore
31 | "PysparkDType",
32 | bound=Union[
33 | PysparkString, # type: ignore
34 | PysparkInt, # type: ignore
35 | PysparkLongInt, # type: ignore
36 | PysparkShortInt, # type: ignore
37 | PysparkByteInt, # type: ignore
38 | PysparkDouble, # type: ignore
39 | PysparkFloat, # type: ignore
40 | PysparkDecimal, # type: ignore
41 | PysparkDate, # type: ignore
42 | PysparkTimestamp, # type: ignore
43 | PysparkBinary, # type: ignore
44 | ],
45 | )
46 | from typing import TYPE_CHECKING, Generic
47 |
48 | # pylint:disable=invalid-name
49 | if TYPE_CHECKING:
50 | T = TypeVar("T") # pragma: no cover
51 | else:
52 | T = DataFrameModel
53 |
54 | if PYSPARK_SQL_INSTALLED:
55 | # pylint: disable=too-few-public-methods,arguments-renamed
56 | class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]):
57 | """
58 | Representation of pyspark.sql.DataFrame, only used for type
59 | annotation.
60 |
61 | *new in 0.8.0*
62 | """
63 |
64 | def __class_getitem__(cls, item):
65 | """Define this to override's pyspark.pandas generic type."""
66 | return _GenericAlias(cls, item) # pragma: no cover
67 |
--------------------------------------------------------------------------------
/pandera/utils.py:
--------------------------------------------------------------------------------
1 | """General utility functions"""
2 |
3 | from typing import Any, Callable, TypeVar
4 |
5 | F = TypeVar("F", bound=Callable)
6 |
7 |
8 | def docstring_substitution(*args: Any, **kwargs: Any) -> Callable[[F], F]:
9 | """Typed wrapper around pandas.util.Substitution."""
10 |
11 | def decorator(func: F) -> F:
12 | # handle case when pandera is run in optimized mode:
13 | # https://docs.python.org/3/using/cmdline.html#cmdoption-OO
14 | if func.__doc__ is None:
15 | return func
16 |
17 | if args:
18 | _doc = func.__doc__ % tuple(args) # type: ignore[operator]
19 | elif kwargs:
20 | _doc = func.__doc__ % kwargs # type: ignore[operator]
21 | func.__doc__ = _doc # pylint:disable=possibly-used-before-assignment
22 | return func
23 |
24 | return decorator
25 |
26 |
27 | def is_regex(name: str):
28 | """
29 | Checks whether a string is a regex pattern, as defined as starting with
30 | '^' and ending with '$'.
31 | """
32 | return name.startswith("^") and name.endswith("$")
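Two tiny sketches of these helpers (the decorated function is illustrative):

@docstring_substitution(project="pandera")
def helper():
    """Utilities for %(project)s."""


assert helper.__doc__ == "Utilities for pandera."
assert is_regex(r"^col_\d+$")
assert not is_regex("col_1")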
33 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file is auto-generated from environment.yml, do not modify.
2 | # See that file for comments about the need/usage of each dependency.
3 |
4 | pip
5 | packaging >= 20.0
6 | typing_extensions
7 | hypothesis >= 6.92.7
8 | pyyaml >= 5.1
9 | typing_inspect >= 0.6.0
10 | frictionless <= 4.40.8
11 | pyarrow
12 | pydantic
13 | scipy
14 | pandas-stubs
15 | pyspark[connect] >= 3.2.0, < 4.0.0
16 | polars >= 0.20.0
17 | modin
18 | protobuf
19 | geopandas
20 | shapely
21 | fastapi
22 | black >= 24.0
23 | numpy >= 1.24.4
24 | pandas >= 2.1.1
25 | isort >= 5.7.0
26 | joblib
27 | mypy == 1.10.0
28 | pylint < 3.3
29 | pytest
30 | pytest-cov
31 | pytest-xdist
32 | pytest-asyncio
33 | pytz
34 | xdoctest
35 | nox
36 | uv
37 | setuptools
38 | uvicorn
39 | python-multipart
40 | sphinx
41 | sphinx-design
42 | sphinx-autodoc-typehints <= 1.14.1
43 | sphinx-copybutton
44 | recommonmark
45 | myst-nb
46 | twine
47 | asv >= 0.5.1
48 | pre_commit
49 | dask[dataframe]
50 | distributed
51 | furo
52 | sphinx-docsearch
53 | grpcio
54 | ray
55 | typeguard
56 | types-click
57 | types-pytz
58 | types-pyyaml
59 | types-requests
60 | types-setuptools
61 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | float_to_top = true
3 | profile = black
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup()
4 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/__init__.py
--------------------------------------------------------------------------------
/tests/base/test_base_schema.py:
--------------------------------------------------------------------------------
1 | """Base schema unit tests."""
2 |
3 | import pytest
4 |
5 | from pandera.api.base.schema import BaseSchema
6 | from pandera.backends.base import BaseSchemaBackend
7 |
8 |
9 | class MockSchema(BaseSchema):
10 | """Mock schema"""
11 |
12 |
13 | class MockSchemaBackend(BaseSchemaBackend):
14 | """Mock schema backend"""
15 |
16 |
17 | def test_get_backend_error():
18 | """Raise value error when no arguments are passed."""
19 |
20 | schema = MockSchema()
21 | with pytest.raises(ValueError):
22 | schema.get_backend()
23 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Pytest configuration."""
2 |
3 | import os
4 |
5 | try:
6 | # pylint: disable=unused-import
7 | import hypothesis # noqa F401
8 | from hypothesis import settings
9 | except ImportError:
10 | HAS_HYPOTHESIS = False
11 | else:
12 | HAS_HYPOTHESIS = True
13 |
14 | # ignore test files associated with hypothesis strategies
15 | collect_ignore = []
16 |
17 | if not HAS_HYPOTHESIS:
18 | collect_ignore.append("test_strategies.py")
19 | else:
20 | suppressed_health_checks = [
21 | hypothesis.HealthCheck.data_too_large,
22 | hypothesis.HealthCheck.too_slow,
23 | hypothesis.HealthCheck.filter_too_much,
24 | ]
25 |
26 | settings.register_profile(
27 | "ci",
28 | max_examples=10,
29 | deadline=None,
30 | suppress_health_check=suppressed_health_checks,
31 | )
32 | settings.register_profile(
33 | "dev",
34 | max_examples=30,
35 | deadline=None,
36 | suppress_health_check=suppressed_health_checks,
37 | )
38 | settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "dev"))
39 |
--------------------------------------------------------------------------------
/tests/dask/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/dask/__init__.py
--------------------------------------------------------------------------------
/tests/dask/test_dask_accessor.py:
--------------------------------------------------------------------------------
1 | """Unit tests for dask_accessor module."""
2 |
3 | from typing import Union
4 |
5 | import dask.dataframe as dd
6 | import pandas as pd
7 | import pytest
8 |
9 | import pandera.pandas as pa
10 |
11 |
12 | @pytest.mark.parametrize(
13 | "schema1, schema2, data, invalid_data",
14 | [
15 | [
16 | pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True),
17 | pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True),
18 | dd.from_pandas(pd.DataFrame({"col": [1, 2, 3]}), npartitions=1),
19 | dd.from_pandas(pd.Series([1, 2, 3]), npartitions=1),
20 | ],
21 | [
22 | pa.SeriesSchema(int, coerce=True),
23 | pa.SeriesSchema(float, coerce=True),
24 | dd.from_pandas(pd.Series([1, 2, 3]), npartitions=1),
25 | dd.from_pandas(pd.DataFrame({"col": [1, 2, 3]}), npartitions=1),
26 | ],
27 | ],
28 | )
29 | @pytest.mark.parametrize("inplace", [False, True])
30 | def test_dataframe_series_add_schema(
31 | schema1: Union[pa.DataFrameSchema, pa.SeriesSchema],
32 | schema2: Union[pa.DataFrameSchema, pa.SeriesSchema],
33 | data: Union[pd.DataFrame, pd.Series],
34 | invalid_data: Union[pd.DataFrame, pd.Series],
35 | inplace: bool,
36 | ) -> None:
37 | """
38 | Test that pandas object contains schema metadata after pandera validation.
39 | """
40 | validated_data_1 = schema1(data, inplace=inplace) # type: ignore[arg-type]
41 | if inplace:
42 | assert data.pandera.schema == schema1
43 | else:
44 | assert data.pandera.schema is None
45 | assert validated_data_1.pandera.schema == schema1
46 |
47 | validated_data_2 = schema2(validated_data_1, inplace=inplace) # type: ignore[arg-type]
48 | if inplace:
49 | assert validated_data_1.pandera.schema == schema2
50 | else:
51 | assert validated_data_1.pandera.schema == schema1
52 | assert validated_data_2.pandera.schema == schema2
53 |
54 | with pytest.raises(TypeError):
55 | schema1(invalid_data) # type: ignore[arg-type]
56 |
57 | with pytest.raises(TypeError):
58 | schema2(invalid_data) # type: ignore[arg-type]
59 |
--------------------------------------------------------------------------------
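What the parametrized cases above exercise, reduced to a standalone sketch (not part of the repository; assumes dask and pandera are installed):

    import dask.dataframe as dd
    import pandas as pd

    import pandera.pandas as pa

    schema = pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True)
    ddf = dd.from_pandas(pd.DataFrame({"col": [1, 2, 3]}), npartitions=1)

    validated = schema(ddf)  # inplace defaults to False, so the input is untouched
    assert ddf.pandera.schema is None
    assert validated.pandera.schema == schema
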
/tests/dask/test_dask_not_installed.py:
--------------------------------------------------------------------------------
1 | """Tests behavior when dask is not installed."""
2 |
3 | import sys
4 | from unittest import mock
5 |
6 | import pandas as pd
7 | import pytest
8 |
9 |
10 | def test_dask_not_installed() -> None:
11 | """
12 | Test that Pandera and its modules can be imported and continue to work
13 | without dask.
14 | """
15 | with mock.patch.dict("sys.modules", {"dask": None}):
16 | with pytest.raises(ImportError):
17 | # pylint: disable=import-outside-toplevel,unused-import
18 | import dask.dataframe
19 |
20 | for module in ["pandera", "pandera.accessors.dask_accessor"]:
21 | try:
22 | del sys.modules[module]
23 | except KeyError:
24 | ...
25 |
26 | # pylint: disable=import-outside-toplevel,unused-import
27 | import pandera
28 |
29 | assert "pandera.accessors.dask_accessor" not in sys.modules
30 |
31 | del sys.modules["pandera"]
32 | del sys.modules["pandera.api.pandas.types"]
33 | # pylint: disable=import-outside-toplevel
34 | from pandera.api.pandas.types import is_table
35 |
36 | assert not is_table(pd.Series([1]))
37 |
38 | for module in ["pandera", "pandera.typing"]:
39 | try:
40 | del sys.modules[module]
41 | except KeyError:
42 | ...
43 |
44 | # pylint: disable=import-outside-toplevel
45 | import pandera.typing
46 |
47 | annotation = pandera.typing.DataFrame[int]
48 | assert pandera.typing.AnnotationInfo(annotation).is_generic_df
49 |
--------------------------------------------------------------------------------
/tests/fastapi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/fastapi/__init__.py
--------------------------------------------------------------------------------
/tests/fastapi/app.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | from fastapi import Body, FastAPI, File
3 | from fastapi.responses import HTMLResponse
4 |
5 | from pandera.typing import DataFrame
6 | from pandera.typing.fastapi import UploadFile
7 | from tests.fastapi.models import (
8 | Item,
9 | ResponseModel,
10 | Transactions,
11 | TransactionsDictOut,
12 | TransactionsParquet,
13 | )
14 |
15 | try:
16 | from typing import Annotated # type: ignore[attr-defined]
17 | except ImportError:
18 | from typing_extensions import Annotated # type: ignore[assignment]
19 |
20 | app = FastAPI()
21 |
22 |
23 | @app.post("/items/", response_model=Item)
24 | def create_item(item: Item):
25 | return item
26 |
27 |
28 | @app.post("/transactions/", response_model=DataFrame[TransactionsDictOut])
29 | def create_transactions(
30 | transactions: Annotated[DataFrame[Transactions], Body()],
31 | ):
32 | output = transactions.assign(name="foo")
33 | ... # do other stuff, e.g. update backend database with transactions
34 | return output
35 |
36 |
37 | @app.post("/file/", response_model=ResponseModel)
38 | def create_upload_file(
39 | file: Annotated[UploadFile[DataFrame[TransactionsParquet]], File()],
40 | ):
41 | return {
42 | "filename": file.filename,
43 | "df": file.data.assign(name="foo"),
44 | }
45 |
46 |
47 | @app.get("/")
48 | def main():
49 | content = """
50 | <body>
51 | <form action="/file/" enctype="multipart/form-data" method="post">
52 | <input name="file" type="file">
53 | <input type="submit">
54 | </form>
55 | </body>
56 | """
57 | return HTMLResponse(content=content)
58 |
--------------------------------------------------------------------------------
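tests/fastapi/test_app.py below drives these endpoints through a subprocess-launched uvicorn server; a lighter-weight way to exercise the same app in-process is sketched here purely as an illustration (assumes fastapi's TestClient and the pandera models import cleanly):

    from fastapi.testclient import TestClient

    from tests.fastapi.app import app

    client = TestClient(app)
    # /transactions/ validates the body against Transactions and serializes the
    # response via TransactionsDictOut (to_format="dict", orient="records")
    resp = client.post("/transactions/", json={"id": [1], "cost": [10.99]})
    assert resp.json() == [{"id": 1, "cost": 10.99, "name": "foo"}]
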
/tests/fastapi/models.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | from typing import Optional
3 |
4 | from pydantic import BaseModel, Field
5 |
6 | import pandera.pandas as pa
7 |
8 |
9 | class Transactions(pa.DataFrameModel):
10 | id: pa.typing.Series[int]
11 | cost: pa.typing.Series[float] = pa.Field(ge=0, le=1000)
12 |
13 | class Config:
14 | coerce = True
15 |
16 |
17 | class TransactionsParquet(Transactions):
18 | class Config:
19 | from_format = "parquet"
20 |
21 |
22 | class TransactionsOut(Transactions):
23 | id: pa.typing.Series[int]
24 | cost: pa.typing.Series[float]
25 | name: pa.typing.Series[str]
26 |
27 |
28 | class TransactionsJsonOut(TransactionsOut):
29 | class Config:
30 | to_format = "json"
31 | to_format_kwargs = {"orient": "records"}
32 |
33 |
34 | class TransactionsDictOut(TransactionsOut):
35 | class Config:
36 | to_format = "dict"
37 | to_format_kwargs = {"orient": "records"}
38 |
39 |
40 | class Item(BaseModel):
41 | name: str
42 | value: int = Field(ge=0)
43 | description: Optional[str] = None
44 |
45 |
46 | class ResponseModel(BaseModel):
47 | filename: str
48 | df: pa.typing.DataFrame[TransactionsJsonOut]
49 |
--------------------------------------------------------------------------------
/tests/fastapi/test_app.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=redefined-outer-name,unused-argument
2 | """Unit tests for using pandera types in fastapi endpoints."""
3 |
4 | import io
5 | import subprocess
6 | import time
7 | from copy import deepcopy
8 |
9 | import pandas as pd
10 | import pytest
11 | import requests
12 | from hypothesis import given
13 |
14 | from tests.fastapi.models import Transactions, TransactionsOut
15 |
16 |
17 | @pytest.fixture(scope="module")
18 | def app():
19 | """Transient app server for testing."""
20 | # pylint: disable=consider-using-with
21 | process = subprocess.Popen(
22 | ["uvicorn", "tests.fastapi.app:app", "--port", "8000"],
23 | stdout=subprocess.PIPE,
24 | )
25 | _wait_to_exist()
26 | yield process
27 | process.terminate()
28 |
29 |
30 | def _wait_to_exist():
31 | for _ in range(20):
32 | try:
33 | requests.post("http://127.0.0.1:8000/")
34 | break
35 | except Exception: # pylint: disable=broad-except
36 | time.sleep(3.0)
37 |
38 |
39 | def test_items_endpoint(app):
40 | """Happy path test with pydantic type annotations."""
41 | data = {"name": "Book", "value": 10, "description": "Hello"}
42 | for _ in range(10):
43 | response = requests.post("http://127.0.0.1:8000/items/", json=data)
44 | if response.status_code != 200:
45 | time.sleep(3.0)
46 | assert response.json() == data
47 |
48 |
49 | def test_transactions_endpoint(app):
50 | """Happy path test with pandera type endpoint type annotation."""
51 | data = {"id": [1], "cost": [10.99]}
52 | response = requests.post(
53 | "http://127.0.0.1:8000/transactions/",
54 | json=data,
55 | )
56 | expected_output = deepcopy(data)
57 | expected_output = [{"id": 1, "cost": 10.99, "name": "foo"}]
58 | assert response.json() == expected_output
59 |
60 |
61 | @given(Transactions.strategy(size=10))
62 | def test_upload_file_endpoint(app, sample):
63 | """
64 | Test upload file endpoint with Upload[DataFrame[DataFrameModel]] input.
65 | """
66 | buf = io.BytesIO()
67 | sample.to_parquet(buf)
68 | buf.seek(0)
69 |
70 | expected_result = pd.read_parquet(buf).assign(name="foo")
71 | buf.seek(0)
72 |
73 | response = requests.post(
74 | "http://127.0.0.1:8000/file/", files={"file": buf}
75 | )
76 | output = response.json()
77 | assert output["filename"] == "file"
78 | output_df = pd.read_json(output["df"])
79 | cost_notna = ~output_df["cost"].isna()
80 | pd.testing.assert_frame_equal(
81 | TransactionsOut.validate(output_df[cost_notna]),
82 | TransactionsOut.validate(expected_result[cost_notna]),
83 | )
84 |
--------------------------------------------------------------------------------
/tests/hypotheses/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/hypotheses/__init__.py
--------------------------------------------------------------------------------
/tests/io/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/io/__init__.py
--------------------------------------------------------------------------------
/tests/modin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/modin/__init__.py
--------------------------------------------------------------------------------
/tests/modin/conftest.py:
--------------------------------------------------------------------------------
1 | """Registers fixtures for core"""
2 |
3 | import os
4 | from typing import Generator
5 |
6 | import pytest
7 | from pandera.api.checks import Check
8 |
9 | # pylint: disable=unused-import
10 | ENGINES = os.getenv("CI_MODIN_ENGINES", "").split(",")
11 | if ENGINES == [""]:
12 | ENGINES = ["dask"]
13 |
14 |
15 | @pytest.fixture(scope="function")
16 | def custom_check_teardown() -> Generator[None, None, None]:
17 | """Remove all custom checks after execution of each pytest function."""
18 | yield
19 | for check_name in list(Check.REGISTERED_CUSTOM_CHECKS):
20 | del Check.REGISTERED_CUSTOM_CHECKS[check_name]
21 |
22 |
23 | @pytest.fixture(scope="session", params=ENGINES, autouse=True)
24 | def setup_modin_engine(request):
25 | """Set up the modin engine.
26 |
27 | Supports both the ray and dask execution backends.
28 | """
29 | engine = request.param
30 | os.environ["MODIN_ENGINE"] = engine
31 | os.environ["MODIN_STORAGE_FORMAT"] = "pandas"
32 | os.environ["MODIN_MEMORY"] = "100000000"
33 | os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
34 |
35 | if engine == "ray":
36 | # pylint: disable=import-outside-toplevel
37 | import ray
38 |
39 | ray.init(
40 | runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}}
41 | )
42 | yield
43 | ray.shutdown()
44 |
45 | elif engine == "dask":
46 | # pylint: disable=import-outside-toplevel
47 | from distributed import Client
48 |
49 | client = Client()
50 | yield
51 | client.close()
52 | else:
53 | raise ValueError(f"Not supported engine: {engine}")
54 |
--------------------------------------------------------------------------------
/tests/modin/test_modin_accessor.py:
--------------------------------------------------------------------------------
1 | """Unit tests of modin accessor functionality.
2 |
3 | Since modin doesn't currently support the pandas accessor extension API,
4 | pandera implements it.
5 | """
6 |
7 | import pytest
8 |
9 | from pandera.accessors import modin_accessor
10 |
11 |
12 | # pylint: disable=too-few-public-methods
13 | class CustomAccessor:
14 | """Mock accessor class"""
15 |
16 | def __init__(self, obj):
17 | self._obj = obj
18 |
19 |
20 | def test_modin_accessor_warning():
21 | """Test that modin accessor raises warning when name already exists."""
22 | modin_accessor.register_dataframe_accessor("foo")(CustomAccessor)
23 | with pytest.warns(UserWarning):
24 | modin_accessor.register_dataframe_accessor("foo")(CustomAccessor)
25 |
--------------------------------------------------------------------------------
/tests/mypy/config/no_plugin.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
3 | follow_imports = silent
4 | show_error_codes = True
5 | allow_redefinition = True
6 | warn_return_any = False
7 | warn_unused_configs = True
8 |
--------------------------------------------------------------------------------
/tests/mypy/config/plugin_mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins = pandera.mypy
3 | ignore_missing_imports = True
4 | follow_imports = skip
5 | show_error_codes = True
6 | allow_redefinition = True
7 | warn_return_any = False
8 | warn_unused_configs = True
9 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/pandas_concat.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import pandas as pd
3 |
4 | df = pd.DataFrame([[1]])
5 | sr = pd.Series([1])
6 |
7 |
8 | df_concat = pd.concat([df, df])
9 | sr_concat = pd.concat([sr, sr])
10 | sr_axis1_concat = pd.concat([sr, sr], axis=1)
11 |
12 | # mypy error without pandera plugin
13 | df_generator_concat: pd.DataFrame = pd.concat(df for _ in range(3))
14 |
15 | # mypy error without pandera plugin
16 | sr_generator_concat: pd.Series = pd.concat(sr for _ in range(3))
17 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/pandas_dataframe.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | """Unit tests for static type checking of dataframes.
3 |
4 | This test module uses https://github.com/davidfritzsche/pytest-mypy-testing to
5 | statically type-check the functions marked with pytest.mark.mypy_testing.
6 | """
7 |
8 | from typing import Optional, cast
9 |
10 | import pandas as pd
11 |
12 | import pandera.pandas as pa
13 | from pandera.typing import DataFrame, Series
14 |
15 |
16 | class Schema(pa.DataFrameModel):
17 | id: Series[int]
18 | name: Series[str]
19 |
20 |
21 | class SchemaOut(pa.DataFrameModel):
22 | age: Series[int]
23 |
24 |
25 | class AnotherSchema(pa.DataFrameModel):
26 | id: Series[int]
27 | first_name: Optional[Series[str]]
28 |
29 |
30 | def fn(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
31 | return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay
32 |
33 |
34 | def fn_pipe_incorrect_type(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
35 | return df.assign(age=30).pipe(DataFrame[AnotherSchema]) # mypy error
36 | # error: Argument 1 to "pipe" of "NDFrame" has incompatible type "Type[DataFrame[Any]]"; # noqa
37 | # expected "Union[Callable[..., DataFrame[SchemaOut]], Tuple[Callable[..., DataFrame[SchemaOut]], str]]" [arg-type] # noqa
38 |
39 |
40 | def fn_assign_copy(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
41 | return df.assign(age=30) # mypy error
42 | # error: Incompatible return value type (got "pandas.core.frame.DataFrame",
43 | # expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value]
44 |
45 |
46 | # Define a few dataframe objects
47 | schema_df = DataFrame[Schema]({"id": [1], "name": ["foo"]})
48 | pandas_df = pd.DataFrame({"id": [1], "name": ["foo"]})
49 | another_df = DataFrame[AnotherSchema]({"id": [1], "first_name": ["foo"]})
50 |
51 |
52 | fn(schema_df) # mypy okay
53 |
54 | fn(pandas_df) # mypy error
55 | # error: Argument 1 to "fn" has incompatible type "pandas.core.frame.DataFrame"; # noqa
56 | # expected "pandera.typing.pandas.DataFrame[Schema]" [arg-type]
57 |
58 | fn(another_df) # mypy error
59 | # error: Argument 1 to "fn" has incompatible type "DataFrame[AnotherSchema]";
60 | # expected "DataFrame[Schema]" [arg-type]
61 |
62 |
63 | def fn_pipe_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
64 | return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay
65 |
66 |
67 | def fn_cast_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
68 | return cast(DataFrame[SchemaOut], df.assign(age=30)) # mypy okay
69 |
70 |
71 | @pa.check_types
72 | def fn_mutate_inplace(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
73 | out = df.assign(age=30).pipe(DataFrame[SchemaOut])
74 | out.drop(columns="age", inplace=True)
75 | return out # okay for mypy, pandera raises error
76 |
77 |
78 | @pa.check_types
79 | def fn_assign_and_get_index(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
80 | return df.assign(foo=30).iloc[:3] # mypy error
81 | # error: Incompatible return value type (got "pandas.core.frame.DataFrame",
82 | # expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value]
83 |
84 |
85 | @pa.check_types
86 | def fn_cast_dataframe_invalid(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
87 | return cast(
88 | DataFrame[SchemaOut], df
89 | ) # okay for mypy, pandera raises error
90 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/pandas_index.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import pandas as pd
3 |
4 | df = pd.DataFrame({"a": [1, 2, 3]})
5 | sr = pd.Series([1, 2, 3])
6 | idx = pd.Index([1, 2, 3])
7 |
8 | df_index_unique: bool = df.index.is_unique
9 | sr_index_unique: bool = df["a"].index.is_unique
10 | idx_unique: bool = idx.is_unique
11 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/pandas_series.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import pandas as pd
3 |
4 |
5 | def fn(s: pd.Series[str]) -> bool:
6 | return True
7 |
8 |
9 | fn(s=pd.Series([1.0, 1.0, 1.0], dtype=float)) # mypy okay
10 |
11 | series = pd.Series([1.0, 1.0, 1.0], dtype=float)
12 | fn(series) # mypy able to determine `series` type, raises error
13 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/pandas_time.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import pandas as pd
3 |
4 | pd.Timestamp.now() + pd.tseries.offsets.YearEnd(1)
5 |
6 | pd.Timedelta(minutes=2)
7 | pd.Timedelta(2, unit="minutes")
8 |
9 | pd.Timedelta(minutes=2, seconds=30)
10 | pd.Timedelta(2.5, unit="minutes") # mypy error
11 | pd.Timedelta(2, unit="minutes") + pd.Timedelta(30, unit="seconds")
12 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/pandera_inheritance.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | """With the pandera.mypy plugin, mypy ignores type overrides."""
3 |
4 | import pandera.pandas as pa
5 |
6 |
7 | class Schema(pa.DataFrameModel):
8 | a: pa.typing.Series[int]
9 | b: pa.typing.Series[str]
10 | c: pa.typing.Series[bool]
11 |
12 |
13 | class Schema2(Schema):
14 | a: pa.typing.Series[str]
15 | b: pa.typing.Series[float]
16 | c: pa.typing.Series[int]
17 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/pandera_types.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import pandas as pd
3 |
4 | import pandera.pandas as pa
5 |
6 |
7 | def fn(series: pa.typing.Series[int]) -> None:
8 | pass
9 |
10 |
11 | df = pd.DataFrame({"a": [1, 2, 3]})
12 | sr = pd.Series([1, 2, 3])
13 |
14 | fn(sr)
15 | fn(df["a"])
16 |
--------------------------------------------------------------------------------
/tests/mypy/pandas_modules/python_slice.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import pandas as pd
3 |
4 | df = pd.DataFrame({"a": [1, 2, 3]}, index=[*"abc"])
5 | df.loc["a":"c"]
6 |
--------------------------------------------------------------------------------
/tests/pandas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/pandas/__init__.py
--------------------------------------------------------------------------------
/tests/pandas/checks_fixtures.py:
--------------------------------------------------------------------------------
1 | """Pytest fixtures for testing custom checks."""
2 |
3 | from typing import Generator
4 | from unittest import mock
5 |
6 | import pandas as pd
7 | import pytest
8 |
9 | import pandera.pandas as pa
10 | import pandera.api.extensions as pa_ext
11 |
12 | __all__ = "custom_check_teardown", "extra_registered_checks"
13 |
14 |
15 | @pytest.fixture(scope="function")
16 | def custom_check_teardown() -> Generator[None, None, None]:
17 | """Remove all custom checks after execution of each pytest function."""
18 | yield
19 | for check_name in list(pa.Check.REGISTERED_CUSTOM_CHECKS):
20 | del pa.Check.REGISTERED_CUSTOM_CHECKS[check_name]
21 |
22 |
23 | @pytest.fixture(scope="function")
24 | def extra_registered_checks() -> Generator[None, None, None]:
25 | """temporarily registers custom checks onto the Check class"""
26 | # pylint: disable=unused-variable
27 | with mock.patch(
28 | "pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict
29 | ):
30 | # register custom checks here
31 | @pa_ext.register_check_method()
32 | def no_param_check(_: pd.DataFrame) -> bool:
33 | return True
34 |
35 | @pa_ext.register_check_method()
36 | def no_param_check_ellipsis(_: pd.DataFrame) -> bool:
37 | return True
38 |
39 | @pa_ext.register_check_method()
40 | def raise_an_error_check(_: pd.DataFrame) -> bool:
41 | raise TypeError("Test error in custom check")
42 |
43 | yield
44 |
--------------------------------------------------------------------------------
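For orientation, a check registered through pandera's extensions API becomes addressable as a method on the Check namespace, which is what the extra_registered_checks fixture relies on; a rough sketch of how a test using the fixture might call one of the registered checks (illustrative only, not a file from the repository):

    import pandas as pd

    import pandera.pandas as pa

    # inside a test that uses the `extra_registered_checks` fixture, the
    # temporarily registered checks are addressable by name on the Check namespace
    schema = pa.DataFrameSchema(checks=pa.Check.no_param_check())
    schema.validate(pd.DataFrame({"a": [1, 2, 3]}))
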
/tests/pandas/conftest.py:
--------------------------------------------------------------------------------
1 | """Registers fixtures for core"""
2 |
3 | # pylint: disable=unused-import
4 | from tests.pandas.checks_fixtures import (
5 | custom_check_teardown,
6 | extra_registered_checks,
7 | )
8 |
--------------------------------------------------------------------------------
/tests/pandas/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/pandas/modules/__init__.py
--------------------------------------------------------------------------------
/tests/pandas/modules/validate_on_init.py:
--------------------------------------------------------------------------------
1 | """Module for unit testing validation on initialization."""
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pandera.pandas as pa
6 | from pandera.typing import DataFrame
7 |
8 |
9 | class ExampleSchema(pa.DataFrameModel):
10 | class Config:
11 | coerce = True
12 |
13 | a: np.int64
14 |
15 |
16 | ExampleDataFrame = DataFrame[ExampleSchema]
17 | validated_dataframe = ExampleDataFrame(pd.DataFrame([], columns=["a"]))
18 |
--------------------------------------------------------------------------------
/tests/pandas/test__pandas_deprecated__test_model.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=wrong-import-position,wildcard-import,unused-wildcard-import
2 | """Unit tests for the deprecated top-level pandera DataFrameModel class.
3 |
4 | Delete this file once the top-level pandera._pandas_deprecated module is
5 | removed.
6 | """
7 |
8 | import pytest
9 | from pandera._pandas_deprecated import DataFrameModel as _DataFrameModel
10 |
11 |
12 | @pytest.fixture(autouse=True)
13 | def monkeypatch_dataframe_model(monkeypatch):
14 | """Monkeypatch DataFrameModel before importing test_schemas"""
15 | monkeypatch.setattr(
16 | "tests.pandas.test_schemas.DataFrameModel", _DataFrameModel
17 | )
18 |
19 |
20 | from tests.pandas.test_schemas import *
21 |
--------------------------------------------------------------------------------
/tests/pandas/test__pandas_deprecated__test_schemas.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=wrong-import-position,wildcard-import,unused-wildcard-import
2 | """Unit tests for the deprecated top-level pandera DataFrameSchema class.
3 |
4 | Delete this file once the top-level pandera._pandas_deprecated module is
5 | removed.
6 | """
7 |
8 | import pytest
9 | from pandera._pandas_deprecated import DataFrameSchema as _DataFrameSchema
10 |
11 |
12 | @pytest.fixture(autouse=True)
13 | def monkeypatch_dataframe_schema(monkeypatch):
14 | """Monkeypatch DataFrameSchema before importing test_schemas"""
15 | monkeypatch.setattr(
16 | "tests.pandas.test_schemas.DataFrameSchema", _DataFrameSchema
17 | )
18 |
19 |
20 | from tests.pandas.test_schemas import *
21 |
--------------------------------------------------------------------------------
/tests/pandas/test_docs_setting_column_widths.py:
--------------------------------------------------------------------------------
1 | """Some of the doctest examples only work if the terminal is the correct width
2 | because of the way __str__/__repr__ works in pandas. This checks that
3 | conditions necessary for the doctests to pass properly exist on the host
4 | system."""
5 |
6 | import pandas as pd
7 |
8 | from docs.source import conf
9 |
10 |
11 | def test_sphinx_doctest_setting_global_pandas_conditions() -> None:
12 | """Checks that no limit is set on the height/width of the __repr__/__str__
13 | print of a pd.DataFrame to ensure doctests perform consistently across
14 | different operating systems."""
15 | # pylint: disable=W0122
16 | exec(conf.doctest_global_setup)
17 |
18 | max_cols_after_being_set = pd.options.display.max_columns
19 | max_rows_after_being_set = pd.options.display.max_rows
20 | assert max_cols_after_being_set is None
21 | assert max_rows_after_being_set is None
22 |
--------------------------------------------------------------------------------
/tests/pandas/test_engine_utils.py:
--------------------------------------------------------------------------------
1 | """Unit tests for engine module utility functions."""
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 |
7 | from pandera.engines import utils
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "data_container, data_type, expected_failure_cases",
12 | [
13 | [pd.Series(list("ab1cd3")), int, [False, False, True] * 2],
14 | [pd.Series(list("12345")), int, [True] * 5],
15 | [pd.Series([1, 2, "foo", "bar"]), float, [True, True, False, False]],
16 | ],
17 | )
18 | def test_numpy_pandas_coercible(
19 | data_container, data_type, expected_failure_cases
20 | ):
21 | """Test that the correct boolean Series outputs are returned."""
22 | assert (
23 | expected_failure_cases
24 | == utils.numpy_pandas_coercible(data_container, data_type).tolist()
25 | )
26 |
27 |
28 | @pytest.mark.parametrize(
29 | "data_container",
30 | [
31 | pd.Series([1, 2, 3, 4]),
32 | np.array([1, 2, 3, 4]),
33 | pd.DataFrame({0: [1, 2, 3, 4]}),
34 | np.array([[1], [2], [3], [4]]),
35 | ],
36 | )
37 | def test_numpy_pandas_coerce_failure_cases(data_container):
38 | """
39 | Test that different data container types can be checked for coerce failure
40 | cases.
41 | """
42 | failure_cases = utils.numpy_pandas_coerce_failure_cases(
43 | data_container, int
44 | )
45 | assert failure_cases is None
46 |
47 |
48 | @pytest.mark.parametrize(
49 | "invalid_data_container, exception_type",
50 | [
51 | [1, TypeError],
52 | [5.1, TypeError],
53 | ["foobar", TypeError],
54 | [[1, 2, 3], TypeError],
55 | [{0: 1}, TypeError],
56 | # pylint: disable=too-many-function-args
57 | [np.array([1]).reshape(1, 1, 1), ValueError],
58 | ],
59 | )
60 | def test_numpy_pandas_coerce_failure_cases_exceptions(
61 | invalid_data_container, exception_type
62 | ):
63 | """
64 | Test exceptions of trying to get failure cases for invalid input types.
65 | """
66 | error_msg = {
67 | TypeError: "type of data_container .+ not understood",
68 | ValueError: "only numpy arrays of 1 or 2 dimensions are supported",
69 | }[exception_type]
70 | with pytest.raises(exception_type, match=error_msg):
71 | utils.numpy_pandas_coerce_failure_cases(invalid_data_container, int)
72 |
--------------------------------------------------------------------------------
/tests/pandas/test_extension_modules.py:
--------------------------------------------------------------------------------
1 | """Tests for extension module imports."""
2 |
3 | import pandas as pd
4 | import pytest
5 |
6 | from pandera.api.hypotheses import Hypothesis
7 |
8 |
9 | try:
10 | from scipy import stats # pylint: disable=unused-import
11 | except ImportError: # pragma: no cover
12 | SCIPY_INSTALLED = False
13 | else:
14 | SCIPY_INSTALLED = True
15 |
16 |
17 | def test_hypotheses_module_import() -> None:
18 | """Test that Hypothesis built-in methods raise import error."""
19 | data = pd.Series([1, 2, 3])
20 | if not SCIPY_INSTALLED:
21 | for fn, check_args in [
22 | (
23 | lambda: Hypothesis.two_sample_ttest("sample1", "sample2"),
24 | pd.DataFrame({"sample1": data, "sample2": data}),
25 | ),
26 | (lambda: Hypothesis.one_sample_ttest(popmean=10), data),
27 | ]:
28 | with pytest.raises(ImportError):
29 | check = fn()
30 | check(check_args)
31 |
--------------------------------------------------------------------------------
/tests/pandas/test_model_components.py:
--------------------------------------------------------------------------------
1 | """Tests individual model components."""
2 |
3 | from typing import Any
4 |
5 | import pytest
6 |
7 | import pandera.pandas as pa
8 | from pandera.engines.pandas_engine import Engine
9 |
10 |
11 | def test_field_to_column() -> None:
12 | """Test that Field outputs the correct column options."""
13 | for flag in ["nullable", "unique", "coerce", "regex"]:
14 | for value in [True, False]:
15 | col_kwargs = pa.Field(**{flag: value}).column_properties( # type: ignore[arg-type]
16 | pa.DateTime, required=value
17 | )
18 | col = pa.Column(**col_kwargs)
19 | assert col.dtype == Engine.dtype(pa.DateTime)
20 | assert col.properties[flag] == value
21 | assert col.required == value
22 |
23 |
24 | def test_field_to_index() -> None:
25 | """Test that Field outputs the correct index options."""
26 | for flag in ["nullable", "unique"]:
27 | for value in [True, False]:
28 | index_kwargs = pa.Field(**{flag: value}).index_properties( # type: ignore[arg-type]
29 | pa.DateTime
30 | )
31 | index = pa.Index(**index_kwargs)
32 | assert index.dtype == Engine.dtype(pa.DateTime)
33 | assert getattr(index, flag) == value
34 |
35 |
36 | def test_field_no_checks() -> None:
37 | """Test Field without checks."""
38 | assert not pa.Field().column_properties(str)["checks"]
39 |
40 |
41 | @pytest.mark.parametrize(
42 | "arg,value,expected",
43 | [
44 | ("eq", 9, pa.Check.equal_to(9)),
45 | ("ne", 9, pa.Check.not_equal_to(9)),
46 | ("gt", 9, pa.Check.greater_than(9)),
47 | ("ge", 9, pa.Check.greater_than_or_equal_to(9)),
48 | ("lt", 9, pa.Check.less_than(9)),
49 | ("le", 9, pa.Check.less_than_or_equal_to(9)),
50 | (
51 | "in_range",
52 | {"min_value": 1, "max_value": 9},
53 | pa.Check.in_range(1, 9),
54 | ),
55 | ("isin", [9, "a"], pa.Check.isin([9, "a"])),
56 | ("notin", [9, "a"], pa.Check.notin([9, "a"])),
57 | ("str_contains", "a", pa.Check.str_contains("a")),
58 | ("str_endswith", "a", pa.Check.str_endswith("a")),
59 | ("str_matches", "a", pa.Check.str_matches("a")),
60 | (
61 | "str_length",
62 | {"min_value": 1, "max_value": 9},
63 | pa.Check.str_length(1, 9),
64 | ),
65 | ("str_startswith", "a", pa.Check.str_startswith("a")),
66 | ],
67 | )
68 | def test_field_checks(arg: str, value: Any, expected: pa.Check) -> None:
69 | """Test that all built-in checks are available in a Field."""
70 | checks = pa.Field(**{arg: value}).column_properties(str)["checks"]
71 | assert len(checks) == 1
72 | assert checks[0] == expected
73 |
--------------------------------------------------------------------------------
/tests/pandas/test_multithreaded.py:
--------------------------------------------------------------------------------
1 | """Test that pandera schemas are thread safe."""
2 |
3 | import pandas as pd
4 | import numpy as np
5 | from joblib import Parallel, delayed
6 |
7 | import pandera.pandas as pa
8 |
9 |
10 | class Model(pa.DataFrameModel):
11 | time: pa.typing.Series[np.float32] = pa.Field(coerce=True)
12 |
13 |
14 | def validate_df(df):
15 | validated_df = Model.to_schema().validate(df)
16 | assert validated_df.dtypes["time"] == np.float32
17 | return validated_df
18 |
19 |
20 | def test_single_thread():
21 | df = pd.DataFrame({"time": np.array([1.0, 2.0, 3.0], dtype=np.float64)})
22 | validate_df(df)
23 |
24 |
25 | def test_multithreading():
26 | df = pd.DataFrame({"time": np.array([1.0, 2.0, 3.0], dtype=np.float64)})
27 | n_tries = 10
28 | total = 8
29 | n_jobs = 4
30 |
31 | for _ in range(n_tries):
32 | results = Parallel(n_jobs=n_jobs, prefer="threads")(
33 | delayed(validate_df)(df) for _ in range(total)
34 | )
35 | for res in results:
36 | assert res.dtypes["time"] == np.float32
37 |
--------------------------------------------------------------------------------
/tests/pandas/test_numpy_engine.py:
--------------------------------------------------------------------------------
1 | """Test numpy engine."""
2 |
3 | import numpy as np
4 | import pytest
5 |
6 | from pandera.engines import numpy_engine
7 |
8 |
9 | @pytest.mark.parametrize(
10 | "data_type", list(numpy_engine.Engine.get_registered_dtypes())
11 | )
12 | def test_numpy_data_type(data_type):
13 | """Test base numpy engine DataType."""
14 | numpy_engine.Engine.dtype(data_type)
15 | numpy_engine.Engine.dtype(data_type.type)
16 | numpy_engine.Engine.dtype(str(data_type.type))
17 | with pytest.warns(UserWarning):
18 | np_dtype = numpy_engine.DataType(data_type.type)
19 | with pytest.warns(UserWarning):
20 | np_dtype_from_str = numpy_engine.DataType(str(data_type.type))
21 | assert np_dtype == np_dtype_from_str
22 |
23 |
24 | @pytest.mark.parametrize("data_type", ["foo", "bar", 1, 2, 3.14, np.void])
25 | def test_numpy_engine_dtype_exceptions(data_type):
26 | """Test invalid inputs to numpy data-types."""
27 | if data_type != np.void:
28 | with pytest.raises(
29 | TypeError, match="data type '.+' not understood by"
30 | ):
31 | numpy_engine.Engine.dtype(data_type)
32 | else:
33 | numpy_engine.Engine._registered_dtypes = set()
34 | numpy_engine.Engine.dtype(data_type)
35 |
36 |
37 | def test_numpy_string():
38 | """Test numpy engine String data type coercion."""
39 | # pylint: disable=no-value-for-parameter
40 | string_type = numpy_engine.String()
41 | assert (
42 | string_type.coerce(np.array([1, 2, 3, 4, 5], dtype=int))
43 | == np.array(list("12345"))
44 | ).all()
45 | assert string_type.check(numpy_engine.String())
46 |
--------------------------------------------------------------------------------
/tests/pandas/test_pandas_accessor.py:
--------------------------------------------------------------------------------
1 | """Unit tests for pandas_accessor module."""
2 |
3 | from typing import Union
4 | from unittest.mock import patch
5 |
6 | import pandas as pd
7 | import pytest
8 |
9 | import pandera.pandas as pa
10 | import pandera.api.pandas.container
11 | from pandera.errors import BackendNotFoundError
12 |
13 |
14 | @pytest.mark.parametrize(
15 | "schema1, schema2, data, invalid_data",
16 | [
17 | [
18 | pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True),
19 | pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True),
20 | pd.DataFrame({"col": [1, 2, 3]}),
21 | pd.Series([1, 2, 3]),
22 | ],
23 | [
24 | pa.SeriesSchema(int, coerce=True),
25 | pa.SeriesSchema(float, coerce=True),
26 | pd.Series([1, 2, 3]),
27 | pd.DataFrame({"col": [1, 2, 3]}),
28 | ],
29 | ],
30 | )
31 | @pytest.mark.parametrize("inplace", [False, True])
32 | def test_dataframe_series_add_schema(
33 | schema1: Union[pa.DataFrameSchema, pa.SeriesSchema],
34 | schema2: Union[pa.DataFrameSchema, pa.SeriesSchema],
35 | data: Union[pd.DataFrame, pd.Series],
36 | invalid_data: Union[pd.DataFrame, pd.Series],
37 | inplace: bool,
38 | ) -> None:
39 | """
40 | Test that pandas object contains schema metadata after pandera validation.
41 | """
42 | validated_data_1 = schema1(data, inplace=inplace) # type: ignore
43 | if inplace:
44 | assert data.pandera.schema == schema1
45 | else:
46 | assert data.pandera.schema is None
47 | assert validated_data_1.pandera.schema == schema1
48 |
49 | validated_data_2 = schema2(validated_data_1, inplace=inplace) # type: ignore
50 | if inplace:
51 | assert validated_data_1.pandera.schema == schema2
52 | else:
53 | assert validated_data_1.pandera.schema == schema1
54 | assert validated_data_2.pandera.schema == schema2
55 |
56 | with pytest.raises((BackendNotFoundError, TypeError)):
57 | schema1(invalid_data) # type: ignore
58 |
59 | with pytest.raises((BackendNotFoundError, TypeError)):
60 | schema2(invalid_data) # type: ignore
61 |
62 | with patch.object(
63 | pandera.backends.pandas.container,
64 | "is_table",
65 | return_value=True,
66 | ):
67 | with patch.object(
68 | pandera.api.pandas.array,
69 | "is_field",
70 | return_value=True,
71 | ):
72 | with pytest.raises(BackendNotFoundError):
73 | schema1(invalid_data) # type: ignore
74 |
75 | with pytest.raises(BackendNotFoundError):
76 | schema2(invalid_data) # type: ignore
77 |
--------------------------------------------------------------------------------
/tests/pandas/test_pandas_config.py:
--------------------------------------------------------------------------------
1 | """This module is to test the behaviour change based on defined config in pandera"""
2 |
3 | # pylint:disable=import-outside-toplevel,abstract-method,redefined-outer-name
4 |
5 | from dataclasses import asdict
6 |
7 | import pandas as pd
8 | import pytest
9 |
10 | import pandera.pandas as pa
11 | from pandera.pandas import DataFrameModel, DataFrameSchema, SeriesSchema
12 | from pandera.config import ValidationDepth, config_context, get_config_context
13 |
14 |
15 | @pytest.fixture(autouse=True, scope="function")
16 | def disable_validation():
17 | """Fixture to disable validation and clean up after the test is finished"""
18 | with config_context(validation_enabled=False):
19 | yield
20 |
21 |
22 | class TestPandasDataFrameConfig:
23 | """Class to test all the different configs types"""
24 |
25 | sample_data = pd.DataFrame(
26 | (("Bread", 9), ("Cutter", 15)), columns=["product", "price_val"]
27 | )
28 |
29 | # pylint: disable=unused-argument
30 | def test_disable_validation(self):
31 | """This function validates that a none object is loaded if validation is disabled"""
32 |
33 | pandera_schema = DataFrameSchema(
34 | {
35 | "product": pa.Column(
36 | str, pa.Check(lambda s: s.str.startswith("B"))
37 | ),
38 | "price_val": pa.Column(int),
39 | }
40 | )
41 |
42 | class TestSchema(DataFrameModel):
43 | """Test Schema class"""
44 |
45 | product: str = pa.Field(str_startswith="B")
46 | price_val: int = pa.Field()
47 |
48 | expected = {
49 | "cache_dataframe": False,
50 | "keep_cached_dataframe": False,
51 | "validation_enabled": False,
52 | "validation_depth": ValidationDepth.SCHEMA_AND_DATA,
53 | }
54 |
55 | assert asdict(get_config_context()) == expected
56 | assert pandera_schema.validate(self.sample_data) is self.sample_data
57 | assert TestSchema.validate(self.sample_data) is self.sample_data
58 |
59 |
60 | class TestPandasSeriesConfig:
61 | """Class to test all the different configs types"""
62 |
63 | sample_data = pd.Series([1, 1, 2, 2, 3, 3])
64 |
65 | # pylint: disable=unused-argument
66 | def test_disable_validation(self):
67 | """This function validates that a none object is loaded if validation is disabled"""
68 | expected = {
69 | "cache_dataframe": False,
70 | "keep_cached_dataframe": False,
71 | "validation_enabled": False,
72 | "validation_depth": ValidationDepth.SCHEMA_AND_DATA,
73 | }
74 | pandera_schema = SeriesSchema(
75 | int, pa.Check(lambda s: s.value_counts() == 2, element_wise=False)
76 | )
77 | assert asdict(get_config_context()) == expected
78 | assert pandera_schema.validate(self.sample_data) is self.sample_data
79 |
--------------------------------------------------------------------------------
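The behaviour asserted by both test classes, boiled down to a minimal sketch (not from the repository):

    import pandas as pd

    import pandera.pandas as pa
    from pandera.config import config_context

    schema = pa.DataFrameSchema({"a": pa.Column(int)})
    bad_df = pd.DataFrame({"a": ["not", "an", "int"]})

    with config_context(validation_enabled=False):
        # validation becomes a no-op: the input object is returned as-is
        assert schema.validate(bad_df) is bad_df
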
/tests/pandas/test_pandas_parallel.py:
--------------------------------------------------------------------------------
1 | """Test parallelization with pandas using joblib."""
2 |
3 | import pandas as pd
4 | from joblib import Parallel, delayed
5 |
6 | from pandera.pandas import Column, DataFrameSchema
7 |
8 | schema = DataFrameSchema({"a": Column("int64")}, coerce=True)
9 |
10 |
11 | def test_pandas_parallel():
12 | def fn():
13 | return schema.validate(pd.DataFrame({"a": [1]}))
14 |
15 | results = Parallel(2)([delayed(fn)() for _ in range(10)])
16 | assert len(results) == 10
17 | for result in results:
18 | assert result.dtypes["a"] == "int64"
19 |
--------------------------------------------------------------------------------
/tests/pandas/test_pydantic_dtype.py:
--------------------------------------------------------------------------------
1 | """Unit tests for pydantic datatype."""
2 |
3 | from typing import Type
4 |
5 | import pandas as pd
6 | import pytest
7 | from pydantic import BaseModel
8 |
9 | import pandera.pandas as pa
10 | from pandera.api.pandas.array import ArraySchema
11 | from pandera.engines.pandas_engine import PydanticModel
12 |
13 |
14 | class Record(BaseModel):
15 | """Pydantic record model."""
16 |
17 | name: str
18 | xcoord: int
19 | ycoord: int
20 |
21 |
22 | class PydanticSchema(pa.DataFrameModel):
23 | """Pandera schema using the pydantic model."""
24 |
25 | class Config:
26 | """Config with dataframe-level data type."""
27 |
28 | dtype = PydanticModel(Record)
29 |
30 |
31 | class PanderaSchema(pa.DataFrameModel):
32 | """Pandera schema that's equivalent to PydanticSchema."""
33 |
34 | name: pa.typing.Series[str]
35 | xcoord: pa.typing.Series[int]
36 | ycoord: pa.typing.Series[int]
37 |
38 |
39 | def test_pydantic_model():
40 | """Test that pydantic model correctly validates data."""
41 |
42 | @pa.check_types
43 | def func(df: pa.typing.DataFrame[PydanticSchema]):
44 | return df
45 |
46 | valid_df = pd.DataFrame(
47 | {
48 | "name": ["foo", "bar", "baz"],
49 | "xcoord": [1.0, 2, 3],
50 | "ycoord": [4, 5.0, 6],
51 | }
52 | )
53 |
54 | invalid_df = pd.DataFrame(
55 | {
56 | "name": ["foo", "bar", "baz"],
57 | "xcoord": [1, 2, "c"],
58 | "ycoord": [4, 5, "d"],
59 | }
60 | )
61 |
62 | validated = func(valid_df)
63 | PanderaSchema.validate(validated)
64 |
65 | expected_failure_cases = pd.DataFrame(
66 | {"index": [2], "failure_case": ["{'xcoord': 'c', 'ycoord': 'd'}"]}
67 | )
68 |
69 | try:
70 | func(invalid_df)
71 | except pa.errors.SchemaError as exc:
72 | pd.testing.assert_frame_equal(
73 | exc.failure_cases, expected_failure_cases
74 | )
75 |
76 |
77 | @pytest.mark.parametrize("series_type", [pa.SeriesSchema, pa.Column, pa.Index])
78 | def test_pydantic_model_init_errors(series_type: Type[ArraySchema]):
79 | """
80 | Should raise SchemaInitError with PydanticModel as `SeriesSchemaBase.dtype`
81 | """
82 | with pytest.raises(pa.errors.SchemaInitError):
83 | series_type(dtype=PydanticModel(Record))
84 |
85 |
86 | @pytest.mark.parametrize("coerce", [True, False])
87 | def test_pydantic_model_coerce(coerce: bool):
88 | """Test that DataFrameSchema.coerce is always True with pydantic model"""
89 |
90 | dataframe_schema = pa.DataFrameSchema(
91 | dtype=PydanticModel(Record), coerce=coerce
92 | )
93 | assert dataframe_schema.coerce is True
94 |
--------------------------------------------------------------------------------
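The dataframe-level PydanticModel dtype validated above can be used directly on a DataFrameSchema as well; a hedged sketch (Point is an illustrative stand-in for the Record model, not part of the repository):

    import pandas as pd
    from pydantic import BaseModel

    import pandera.pandas as pa
    from pandera.engines.pandas_engine import PydanticModel

    class Point(BaseModel):
        name: str
        xcoord: int
        ycoord: int

    # coerce is forced to True whenever a PydanticModel dtype is used
    schema = pa.DataFrameSchema(dtype=PydanticModel(Point))
    schema.validate(pd.DataFrame({"name": ["foo"], "xcoord": [1.0], "ycoord": [4]}))
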
/tests/pandas/test_validation_depth.py:
--------------------------------------------------------------------------------
1 | """Unit tests for granular control based on validation depth."""
2 |
3 | import pytest
4 |
5 | from pandera.backends.base import CoreCheckResult
6 | from pandera.config import ValidationDepth, ValidationScope, config_context
7 | from pandera.validation_depth import validate_scope
8 |
9 |
10 | def custom_backend():
11 | class CustomBackend:
12 |
13 | # pylint: disable=unused-argument
14 | @validate_scope(ValidationScope.SCHEMA)
15 | def check_schema(self, check_obj):
16 | # core check result is passed as True when validation scope doesn't
17 | # include schema checks
18 | return CoreCheckResult(passed=False)
19 |
20 | # pylint: disable=unused-argument
21 | @validate_scope(ValidationScope.DATA)
22 | def check_data(self, check_obj):
23 | # core check result is passed as True when validation scope doesn't
24 | # include data checks
25 | return CoreCheckResult(passed=False)
26 |
27 | return CustomBackend()
28 |
29 |
30 | @pytest.mark.parametrize(
31 | "validation_depth,expected",
32 | [
33 | [ValidationDepth.SCHEMA_ONLY, [False, True]],
34 | [ValidationDepth.DATA_ONLY, [True, False]],
35 | [ValidationDepth.SCHEMA_AND_DATA, [False, False]],
36 | [None, [False, False]],
37 | ],
38 | )
39 | def test_validate_scope(validation_depth, expected):
40 |
41 | with config_context(validation_depth=validation_depth):
42 | backend = custom_backend()
43 | schema_result = backend.check_schema("foo")
44 | data_result = backend.check_data("foo")
45 | results = [schema_result.passed, data_result.passed]
46 | assert results == expected
47 |
--------------------------------------------------------------------------------
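Seen from the schema user's side, the same mechanism means data-level checks are skipped when the configured depth excludes them; a hedged sketch of that behaviour (assumes pandera.pandas is importable):

    import pandas as pd

    import pandera.pandas as pa
    from pandera.config import ValidationDepth, config_context

    schema = pa.DataFrameSchema({"a": pa.Column(int, pa.Check.greater_than(10))})
    df = pd.DataFrame({"a": [1, 2, 3]})  # violates the data-level Check

    with config_context(validation_depth=ValidationDepth.SCHEMA_ONLY):
        schema.validate(df)  # only schema-level checks run, so this passes
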
/tests/polars/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/polars/__init__.py
--------------------------------------------------------------------------------
/tests/polars/conftest.py:
--------------------------------------------------------------------------------
1 | """Polars unit test-specific configuration."""
2 |
3 | import pytest
4 |
5 | from pandera.config import CONFIG, ValidationDepth, reset_config_context
6 |
7 |
8 | @pytest.fixture(scope="function", autouse=True)
9 | def validation_depth_schema_and_data():
10 | """
11 | Ensure that the validation depth is set to SCHEMA_AND_DATA
12 | for these unit tests.
13 | """
14 | _validation_depth = CONFIG.validation_depth
15 | CONFIG.validation_depth = ValidationDepth.SCHEMA_AND_DATA
16 | try:
17 | yield
18 | finally:
19 | CONFIG.validation_depth = _validation_depth
20 | reset_config_context()
21 |
--------------------------------------------------------------------------------
/tests/polars/test_polars_dataframe_generic.py:
--------------------------------------------------------------------------------
1 | """Unit tests for polars LazyFrame generic."""
2 |
3 | import polars as pl
4 | import pytest
5 |
6 | import pandera.polars as pa
7 | from pandera.typing.polars import LazyFrame, Series
8 |
9 |
10 | def test_series_annotation():
11 | class Model(pa.DataFrameModel):
12 | col1: Series[pl.Int64]
13 |
14 | data = pl.LazyFrame(
15 | {
16 | "col1": [1, 2, 3],
17 | }
18 | )
19 |
20 | assert data.collect().equals(Model.validate(data).collect())
21 |
22 | invalid_data = data.cast({"col1": pl.Float64})
23 | with pytest.raises(pa.errors.SchemaError):
24 | Model.validate(invalid_data).collect()
25 |
26 |
27 | def test_lazyframe_generic_simple():
28 | class Model(pa.DataFrameModel):
29 | col1: pl.Int64
30 | col2: pl.Utf8
31 | col3: pl.Float64
32 |
33 | @pa.check_types
34 | def fn(lf: LazyFrame[Model]) -> LazyFrame[Model]:
35 | return lf
36 |
37 | data = pl.LazyFrame(
38 | {
39 | "col1": [1, 2, 3],
40 | "col2": [*"abc"],
41 | "col3": [1.0, 2.0, 3.0],
42 | }
43 | )
44 |
45 | assert data.collect().equals(fn(data).collect())
46 |
47 | invalid_data = data.cast({"col3": pl.Int64})
48 | with pytest.raises(pa.errors.SchemaError):
49 | fn(invalid_data).collect()
50 |
51 |
52 | def test_lazyframe_generic_transform():
53 | class Input(pa.DataFrameModel):
54 | col1: pl.Int64
55 | col2: pl.Utf8
56 |
57 | class Output(Input):
58 | col3: pl.Float64
59 |
60 | @pa.check_types
61 | def fn(lf: LazyFrame[Input]) -> LazyFrame[Output]:
62 | return lf.with_columns(col3=pl.lit(3.0)) # type: ignore
63 |
64 | @pa.check_types
65 | def invalid_fn(lf: LazyFrame[Input]) -> LazyFrame[Output]:
66 | return lf # type: ignore
67 |
68 | data = pl.LazyFrame(
69 | {
70 | "col1": [1, 2, 3],
71 | "col2": [*"abc"],
72 | }
73 | )
74 |
75 | assert isinstance(fn(data).collect(), pl.DataFrame)
76 |
77 | with pytest.raises(pa.errors.SchemaError):
78 | invalid_fn(data).collect()
79 |
--------------------------------------------------------------------------------
/tests/polars/test_polars_decorators.py:
--------------------------------------------------------------------------------
1 | """Unit tests for using schemas with polars and function decorators."""
2 |
3 | import polars as pl
4 | import pytest
5 |
6 | import pandera.polars as pa
7 | import pandera.typing.polars as pa_typing
8 |
9 |
10 | @pytest.fixture
11 | def data() -> pl.DataFrame:
12 | return pl.DataFrame({"a": [1, 2, 3]})
13 |
14 |
15 | @pytest.fixture
16 | def invalid_data(data) -> pl.DataFrame:
17 | return data.rename({"a": "b"})
18 |
19 |
20 | def test_polars_dataframe_check_io(data, invalid_data):
21 | # pylint: disable=unused-argument
22 |
23 | schema = pa.DataFrameSchema({"a": pa.Column(int)})
24 |
25 | @pa.check_input(schema)
26 | def fn_check_input(x): ...
27 |
28 | @pa.check_output(schema)
29 | def fn_check_output(x):
30 | return x
31 |
32 | @pa.check_io(x=schema, out=schema)
33 | def fn_check_io(x):
34 | return x
35 |
36 | @pa.check_io(x=schema, out=schema)
37 | def fn_check_io_invalid(x):
38 | return x.rename({"a": "b"})
39 |
40 | # valid data should pass
41 | fn_check_input(data)
42 | fn_check_output(data)
43 | fn_check_io(data)
44 |
45 | # invalid data or invalid function should not pass
46 | with pytest.raises(pa.errors.SchemaError):
47 | fn_check_input(invalid_data)
48 |
49 | with pytest.raises(pa.errors.SchemaError):
50 | fn_check_output(invalid_data)
51 |
52 | with pytest.raises(pa.errors.SchemaError):
53 | fn_check_io_invalid(data)
54 |
55 |
56 | def test_polars_dataframe_check_types(data, invalid_data):
57 | # pylint: disable=unused-argument
58 |
59 | class Model(pa.DataFrameModel):
60 | a: int
61 |
62 | @pa.check_types
63 | def fn_check_input(x: pa_typing.DataFrame[Model]): ...
64 |
65 | @pa.check_types
66 | def fn_check_output(x) -> pa_typing.DataFrame[Model]:
67 | return x
68 |
69 | @pa.check_types
70 | def fn_check_io(
71 | x: pa_typing.DataFrame[Model],
72 | ) -> pa_typing.DataFrame[Model]:
73 | return x
74 |
75 | @pa.check_types
76 | def fn_check_io_invalid(
77 | x: pa_typing.DataFrame[Model],
78 | ) -> pa_typing.DataFrame[Model]:
79 | return x.rename({"a": "b"}) # type: ignore
80 |
81 | # valid data should pass
82 | fn_check_input(data)
83 | fn_check_output(data)
84 | fn_check_io(data)
85 |
86 | # invalid data or invalid function should not pass
87 | with pytest.raises(pa.errors.SchemaError):
88 | fn_check_input(invalid_data)
89 |
90 | with pytest.raises(pa.errors.SchemaError):
91 | fn_check_output(invalid_data)
92 |
93 | with pytest.raises(pa.errors.SchemaError):
94 | fn_check_io_invalid(data)
95 |
--------------------------------------------------------------------------------
/tests/polars/test_polars_parallel.py:
--------------------------------------------------------------------------------
1 | """Test parallelization with polars using joblib."""
2 |
3 | import polars as pl
4 | from joblib import Parallel, delayed
5 |
6 | from pandera.polars import Column, DataFrameSchema
7 |
8 | schema = DataFrameSchema({"a": Column(pl.Int32)}, coerce=True)
9 |
10 |
11 | def test_polars_parallel():
12 | def fn():
13 | return schema.validate(pl.DataFrame({"a": [1]}))
14 |
15 | results = Parallel(2)([delayed(fn)() for _ in range(10)])
16 | assert len(results) == 10
17 | for result in results:
18 | assert result.schema["a"] == pl.Int32
19 |
--------------------------------------------------------------------------------
/tests/polars/test_polars_strategies.py:
--------------------------------------------------------------------------------
1 | """Unit tests for polars strategy methods."""
2 |
3 | import pytest
4 |
5 | import pandera.polars as pa
6 |
7 |
8 | def test_dataframe_schema_strategy():
9 | schema = pa.DataFrameSchema()
10 |
11 | with pytest.raises(NotImplementedError):
12 | schema.strategy()
13 |
14 | with pytest.raises(NotImplementedError):
15 | schema.example()
16 |
17 |
18 | def test_column_schema_strategy():
19 | column_schema = pa.Column(str)
20 |
21 | with pytest.raises(NotImplementedError):
22 | column_schema.strategy()
23 |
24 | with pytest.raises(NotImplementedError):
25 | column_schema.example()
26 |
27 | with pytest.raises(NotImplementedError):
28 | column_schema.strategy_component()
29 |
--------------------------------------------------------------------------------
/tests/pyspark/__init__.py:
--------------------------------------------------------------------------------
1 | """Init file for pyspark tests"""
2 |
--------------------------------------------------------------------------------
/tests/pyspark/test_pyspark_accessor.py:
--------------------------------------------------------------------------------
1 | """Unit tests for pyspark_accessor module."""
2 |
3 | from typing import Union
4 |
5 | import pytest
6 | from pyspark.sql import DataFrame, SparkSession
7 | from pyspark.sql.functions import col
8 | from pyspark.sql.types import FloatType, LongType
9 |
10 | import pandera.pyspark as pa
11 | from pandera.config import PanderaConfig, ValidationDepth
12 | from pandera.pyspark import pyspark_sql_accessor
13 |
14 | spark = SparkSession.builder.getOrCreate()
15 |
16 |
17 | @pytest.mark.parametrize(
18 | "schema1, schema2, data, invalid_data",
19 | [
20 | [
21 | pa.DataFrameSchema({"col": pa.Column("long")}, coerce=True),
22 | pa.DataFrameSchema({"col": pa.Column("float")}, coerce=False),
23 | spark.createDataFrame([{"col": 1}, {"col": 2}, {"col": 3}]),
24 | spark.createDataFrame([{"col": 1}, {"col": 2}, {"col": 3}]),
25 | ],
26 | ],
27 | )
28 | def test_dataframe_add_schema(
29 | schema1: pa.DataFrameSchema,
30 | schema2: pa.DataFrameSchema,
31 | data: Union[DataFrame, col],
32 | invalid_data: Union[DataFrame, col],
33 | config_params: PanderaConfig,
34 | ) -> None:
35 | """
36 | Test that pyspark object contains schema metadata after pandera validation.
37 | """
38 | schema1(data) # type: ignore[arg-type]
39 |
40 | assert data.pandera.schema == schema1
41 | assert isinstance(schema1.validate(data), DataFrame)
42 | assert isinstance(schema1(data), DataFrame)
43 | if config_params.validation_depth != ValidationDepth.DATA_ONLY:
44 | assert dict(schema2(invalid_data).pandera.errors["SCHEMA"]) == {
45 | "WRONG_DATATYPE": [
46 | {
47 | "schema": None,
48 | "column": "col",
49 | "check": f"dtype('{str(FloatType())}')",
50 | "error": f"expected column 'col' to have type {str(FloatType())}, got {str(LongType())}",
51 | }
52 | ]
53 | } # type: ignore[arg-type]
54 |
55 |
56 | class CustomAccessor:
57 | """Mock accessor class"""
58 |
59 | def __init__(self, obj):
60 | self._obj = obj
61 |
62 |
63 | def test_pyspark_accessor_warning():
64 | """Test that the pyspark accessor raises a warning when the name already exists."""
65 | pyspark_sql_accessor.register_dataframe_accessor("foo")(CustomAccessor)
66 | with pytest.warns(UserWarning):
67 | pyspark_sql_accessor.register_dataframe_accessor("foo")(CustomAccessor)
68 |
--------------------------------------------------------------------------------
/tests/pyspark/test_pyspark_engine.py:
--------------------------------------------------------------------------------
1 | """Tests Engine subclassing and registering DataTypes.Test pyspark engine."""
2 |
3 | # pylint:disable=redefined-outer-name,unused-argument
4 |
5 | import pytest
6 |
7 | from pandera.engines import pyspark_engine
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "data_type",
12 | list(
13 | pyspark_engine.Engine.get_registered_dtypes()
14 | ), # pylint:disable=no-value-for-parameter
15 | )
16 | def test_pyspark_data_type(data_type):
17 | """Test pyspark engine DataType base class."""
18 | if data_type.type is None:
19 | # don't test data types that require parameters e.g. Category
20 | return
21 | parameterized_datatypes = ["decimal", "array", "map"]
22 |
23 | pyspark_engine.Engine.dtype(
24 | data_type
25 | ) # pylint:disable=no-value-for-parameter
26 | pyspark_engine.Engine.dtype(
27 | data_type.type
28 | ) # pylint:disable=no-value-for-parameter
29 | if data_type.type.typeName() not in parameterized_datatypes:
30 | pyspark_engine.Engine.dtype(
31 | str(data_type.type)
32 | ) # pylint:disable=no-value-for-parameter
33 |
34 | with pytest.warns(UserWarning):
35 | pd_dtype = pyspark_engine.DataType(data_type.type)
36 | if data_type.type.typeName() not in parameterized_datatypes:
37 | with pytest.warns(UserWarning):
38 | pd_dtype_from_str = pyspark_engine.DataType(str(data_type.type))
39 | assert pd_dtype == pd_dtype_from_str
40 |
--------------------------------------------------------------------------------
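A small sketch of what the engine test above relies on (not a repository file; FloatType is just one example of a non-parameterized pyspark type): the engine resolves a native pyspark type and its string form to equivalent pandera data types, and instantiating the generic DataType wrapper directly emits a UserWarning, which the test asserts with pytest.warns.

    import warnings

    from pyspark.sql.types import FloatType

    from pandera.engines import pyspark_engine

    from_native = pyspark_engine.Engine.dtype(FloatType())       # native pyspark type
    from_string = pyspark_engine.Engine.dtype(str(FloatType()))  # the string "FloatType()"
    assert from_native == from_string

    # Constructing the generic wrapper directly also works, but warns:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        wrapped = pyspark_engine.DataType(FloatType())
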
/tests/strategies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/strategies/__init__.py
--------------------------------------------------------------------------------
/tests/test_inspection_utils.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-function-docstring,missing-module-docstring
2 | # pylint: disable=missing-class-docstring,bad-mcs-classmethod-argument
3 | from pandera.inspection_utils import (
4 | is_classmethod_from_meta,
5 | is_decorated_classmethod,
6 | )
7 |
8 |
9 | class SomeMeta(type):
10 | def __new__(mcs, *args, **kwargs):
11 | return super().__new__(mcs, *args, **kwargs)
12 |
13 | def __init__(cls, *args, **kwargs):
14 | super().__init__(*args, **kwargs)
15 |
16 | def regular_method_meta(cls):
17 | return cls
18 |
19 | @classmethod
20 | def class_method_meta(mcs):
21 | return mcs
22 |
23 | @staticmethod
24 | def static_method_meta():
25 | return 1
26 |
27 |
28 | class SomeClass(metaclass=SomeMeta):
29 | def regular_method(self):
30 | return self
31 |
32 | @classmethod
33 | def class_method(cls):
34 | return cls
35 |
36 | @staticmethod
37 | def static_method():
38 | return 2
39 |
40 |
41 | class SomeChild(SomeClass):
42 | def regular_method_child(self):
43 | return self
44 |
45 | @classmethod
46 | def class_method_child(cls):
47 | return cls
48 |
49 | @staticmethod
50 | def static_method_child():
51 | return 3
52 |
53 |
54 | def test_is_decorated_classmethod() -> None:
55 | some_instance = SomeClass()
56 | some_child = SomeChild()
57 |
58 | cls_methods_with_deco = {
59 | SomeMeta.class_method_meta,
60 | SomeClass.class_method_meta,
61 | SomeClass.class_method,
62 | SomeChild.class_method_meta,
63 | SomeChild.class_method,
64 | SomeChild.class_method_child,
65 | }
66 |
67 | cls_methods_from_meta = {
68 | SomeClass.regular_method_meta,
69 | SomeChild.regular_method_meta,
70 | }
71 |
72 | all_methods = {
73 | # from meta
74 | SomeMeta.class_method_meta,
75 | SomeMeta.static_method_meta,
76 | # from parent
77 | SomeClass.class_method_meta,
78 | SomeClass.regular_method_meta,
79 | SomeClass.static_method_meta,
80 | SomeClass.class_method,
81 | some_instance.regular_method,
82 | SomeClass.static_method,
83 | # from child
84 | SomeChild.class_method_meta,
85 | SomeChild.regular_method_meta,
86 | SomeChild.static_method_meta,
87 | SomeChild.class_method,
88 | some_child.regular_method,
89 | SomeChild.static_method,
90 | SomeChild.class_method_child,
91 | some_child.regular_method_child,
92 | SomeChild.static_method_child,
93 | }
94 |
95 | for method in cls_methods_with_deco:
96 | assert is_decorated_classmethod(method), f"{method} is decorated"
97 | for method in all_methods - cls_methods_with_deco:
98 | assert not is_decorated_classmethod(
99 | method
100 | ), f"{method} is not decorated"
101 | for method in cls_methods_from_meta:
102 | assert is_classmethod_from_meta(method), f"{method} comes from meta"
103 | for method in all_methods - cls_methods_from_meta:
104 | assert not is_classmethod_from_meta(
105 | method
106 | ), f"{method} does not come from meta"
107 |
--------------------------------------------------------------------------------
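A condensed sketch of the distinction the inspection-utils test above checks (not a repository file; the class names Meta and Model are illustrative): a regular method defined on a metaclass is bound to the class, so it looks like a classmethod, but it was not created by the @classmethod decorator; the two helpers tell these cases apart.

    from pandera.inspection_utils import (
        is_classmethod_from_meta,
        is_decorated_classmethod,
    )


    class Meta(type):
        def from_meta(cls):
            return cls


    class Model(metaclass=Meta):
        @classmethod
        def decorated(cls):
            return cls


    assert is_decorated_classmethod(Model.decorated)      # created with @classmethod
    assert not is_decorated_classmethod(Model.from_meta)  # class-bound, but undecorated
    assert is_classmethod_from_meta(Model.from_meta)      # defined on the metaclass
    assert not is_classmethod_from_meta(Model.decorated)
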