├── .coveragerc ├── .github ├── CONTRIBUTING.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation-improvement.md │ ├── feature_request.md │ └── submit-question.md ├── config.yml ├── dependabot.yml └── workflows │ ├── ci-tests.yml │ └── publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── Makefile ├── README.md ├── asv_bench ├── README.md └── benchmarks │ ├── __init__.py │ ├── dataframe_schema.py │ └── series_schema.py ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ ├── custom.js │ ├── default.css │ ├── docsearch_config.js_t │ ├── pandera-banner.png │ ├── pandera-favicon.png │ └── pandera-logo.png │ ├── _templates │ ├── class.rst │ ├── dtype.rst │ ├── model_component_class.rst │ ├── module.rst │ ├── page.html │ ├── sidebar │ │ └── search.html │ └── strategies_module.rst │ ├── checks.md │ ├── conf.py │ ├── configuration.md │ ├── dask.md │ ├── data_format_conversion.md │ ├── data_synthesis_strategies.md │ ├── dataframe_models.md │ ├── dataframe_schemas.md │ ├── decorators.md │ ├── drop_invalid_rows.md │ ├── dtype_validation.md │ ├── dtypes.md │ ├── error_report.md │ ├── extensions.md │ ├── fastapi.md │ ├── frictionless.md │ ├── fugue.md │ ├── geopandas.md │ ├── hypothesis.md │ ├── index.md │ ├── integrations.md │ ├── jupyterlite_config.json │ ├── lazy_validation.md │ ├── modin.md │ ├── mypy_integration.md │ ├── notebooks │ └── try_pandera.ipynb │ ├── parsers.md │ ├── polars.md │ ├── pydantic_integration.md │ ├── pyspark.md │ ├── pyspark_sql.md │ ├── reference │ ├── core.rst │ ├── dataframe_models.rst │ ├── decorators.rst │ ├── dtypes.rst │ ├── errors.rst │ ├── extensions.rst │ ├── index.md │ ├── io.rst │ ├── schema_inference.rst │ └── strategies.rst │ ├── schema_inference.md │ ├── series_schemas.md │ └── supported_libraries.md ├── environment.yml ├── mypy.ini ├── noxfile.py ├── pandera ├── __init__.py ├── _pandas_deprecated.py ├── _patch_numpy2.py ├── accessors │ ├── __init__.py │ ├── dask_accessor.py │ ├── modin_accessor.py │ ├── pandas_accessor.py │ ├── polars_accessor.py │ ├── pyspark_accessor.py │ └── pyspark_sql_accessor.py ├── api │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── checks.py │ │ ├── error_handler.py │ │ ├── model.py │ │ ├── model_components.py │ │ ├── model_config.py │ │ ├── parsers.py │ │ ├── schema.py │ │ └── types.py │ ├── checks.py │ ├── dataframe │ │ ├── __init__.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_components.py │ │ └── model_config.py │ ├── extensions.py │ ├── function_dispatch.py │ ├── hypotheses.py │ ├── pandas │ │ ├── __init__.py │ │ ├── array.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_config.py │ │ └── types.py │ ├── parsers.py │ ├── polars │ │ ├── __init__.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_config.py │ │ ├── types.py │ │ └── utils.py │ └── pyspark │ │ ├── __init__.py │ │ ├── column_schema.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_components.py │ │ ├── model_config.py │ │ └── types.py ├── backends │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── builtin_checks.py │ │ └── builtin_hypotheses.py │ ├── pandas │ │ ├── __init__.py │ │ ├── array.py │ │ ├── base.py │ │ ├── builtin_checks.py │ │ ├── builtin_hypotheses.py │ │ ├── checks.py │ │ ├── components.py │ │ ├── container.py │ │ ├── error_formatters.py │ │ ├── hypotheses.py │ │ ├── parsers.py │ │ └── register.py │ ├── polars │ │ ├── 
__init__.py │ │ ├── base.py │ │ ├── builtin_checks.py │ │ ├── checks.py │ │ ├── components.py │ │ ├── container.py │ │ ├── error_formatters.py │ │ └── register.py │ ├── pyspark │ │ ├── __init__.py │ │ ├── base.py │ │ ├── builtin_checks.py │ │ ├── checks.py │ │ ├── column.py │ │ ├── components.py │ │ ├── container.py │ │ ├── decorators.py │ │ ├── error_formatters.py │ │ ├── register.py │ │ └── utils.py │ └── utils.py ├── config.py ├── constants.py ├── decorators.py ├── dtypes.py ├── engines │ ├── __init__.py │ ├── engine.py │ ├── geopandas_engine.py │ ├── numpy_engine.py │ ├── pandas_engine.py │ ├── polars_engine.py │ ├── pyarrow_engine.py │ ├── pyspark_engine.py │ ├── type_aliases.py │ └── utils.py ├── errors.py ├── extensions.py ├── external_config.py ├── import_utils.py ├── inspection_utils.py ├── io │ ├── __init__.py │ └── pandas_io.py ├── mypy.py ├── pandas.py ├── polars.py ├── py.typed ├── pyspark.py ├── schema_inference │ ├── __init__.py │ └── pandas.py ├── schema_statistics │ ├── __init__.py │ └── pandas.py ├── strategies │ ├── __init__.py │ ├── base_strategies.py │ └── pandas_strategies.py ├── system.py ├── typing │ ├── __init__.py │ ├── common.py │ ├── dask.py │ ├── fastapi.py │ ├── formats.py │ ├── geopandas.py │ ├── modin.py │ ├── pandas.py │ ├── polars.py │ ├── pyspark.py │ └── pyspark_sql.py ├── utils.py └── validation_depth.py ├── pyproject.toml ├── requirements.txt ├── scripts └── generate_pip_deps_from_conda.py ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── base └── test_base_schema.py ├── conftest.py ├── dask ├── __init__.py ├── test_dask.py ├── test_dask_accessor.py └── test_dask_not_installed.py ├── fastapi ├── __init__.py ├── app.py ├── models.py └── test_app.py ├── geopandas ├── test_engine.py ├── test_from_to_format_conversions.py ├── test_geopandas.py └── test_pydantic.py ├── hypotheses ├── __init__.py └── test_hypotheses.py ├── io ├── __init__.py └── test_pandas_io.py ├── modin ├── __init__.py ├── conftest.py ├── test_logical_dtypes.py ├── test_modin_accessor.py └── test_schemas_on_modin.py ├── mypy ├── config │ ├── no_plugin.ini │ └── plugin_mypy.ini ├── pandas_modules │ ├── pandas_concat.py │ ├── pandas_dataframe.py │ ├── pandas_index.py │ ├── pandas_series.py │ ├── pandas_time.py │ ├── pandera_inheritance.py │ ├── pandera_types.py │ └── python_slice.py └── test_pandas_static_type_checking.py ├── pandas ├── __init__.py ├── checks_fixtures.py ├── conftest.py ├── modules │ ├── __init__.py │ └── validate_on_init.py ├── test__pandas_deprecated__test_model.py ├── test__pandas_deprecated__test_schemas.py ├── test_checks.py ├── test_checks_builtin.py ├── test_config.py ├── test_decorators.py ├── test_docs_setting_column_widths.py ├── test_dtypes.py ├── test_engine.py ├── test_engine_utils.py ├── test_errors.py ├── test_extension_modules.py ├── test_extensions.py ├── test_from_to_format_conversions.py ├── test_logical_dtypes.py ├── test_model.py ├── test_model_components.py ├── test_multithreaded.py ├── test_numpy_engine.py ├── test_pandas_accessor.py ├── test_pandas_config.py ├── test_pandas_engine.py ├── test_pandas_parallel.py ├── test_parsers.py ├── test_pydantic.py ├── test_pydantic_dtype.py ├── test_schema_components.py ├── test_schema_inference.py ├── test_schema_statistics.py ├── test_schemas.py ├── test_typing.py └── test_validation_depth.py ├── polars ├── __init__.py ├── conftest.py ├── test_polars_builtin_checks.py ├── test_polars_check.py ├── test_polars_components.py ├── test_polars_config.py ├── test_polars_container.py ├── 
test_polars_dataframe_generic.py ├── test_polars_decorators.py ├── test_polars_dtypes.py ├── test_polars_model.py ├── test_polars_parallel.py ├── test_polars_pydantic.py ├── test_polars_strategies.py └── test_polars_typing.py ├── pyspark ├── __init__.py ├── conftest.py ├── test_pyspark_accessor.py ├── test_pyspark_check.py ├── test_pyspark_config.py ├── test_pyspark_container.py ├── test_pyspark_decorators.py ├── test_pyspark_dtypes.py ├── test_pyspark_engine.py ├── test_pyspark_error.py ├── test_pyspark_model.py └── test_schemas_on_pyspark_pandas.py ├── strategies ├── __init__.py └── test_strategies.py └── test_inspection_utils.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = pandera 3 | 4 | [report] 5 | exclude_lines = 6 | if self.debug: 7 | pragma: no cover 8 | raise NotImplementedError 9 | if __name__ == .__main__.: 10 | ignore_errors = True 11 | omit = 12 | tests/* 13 | pandera/mypy.py 14 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [cosmicBboy] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | - [ ] I have checked that this issue has not already been reported. 14 | - [ ] I have confirmed this bug exists on the latest version of pandera. 15 | - [ ] (optional) I have confirmed this bug exists on the main branch of pandera. 16 | 17 | **Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug. 18 | 19 | #### Code Sample, a copy-pastable example 20 | 21 | ```python 22 | # Your code here 23 | 24 | ``` 25 | 26 | #### Expected behavior 27 | A clear and concise description of what you expected to happen. 28 | 29 | #### Desktop (please complete the following information): 30 | 31 | - OS: [e.g. iOS] 32 | - Browser: [e.g. chrome, safari] 33 | - Version: [e.g. 22] 34 | 35 | #### Screenshots 36 | If applicable, add screenshots to help explain your problem. 37 | 38 | #### Additional context 39 | Add any other context about the problem here. 
40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Improvement 3 | about: Report wrong or missing documentation 4 | title: '' 5 | labels: docs 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Location of the documentation 11 | 12 | [this should provide the location of the documentation, e.g. "pandera.api.pandas.container.DataFrameSchema" or the URL of the documentation, e.g. "https://pandera.readthedocs.io/en/stable/dataframe_schemas.html#column-validation"] 13 | 14 | **Note**: You can check the latest versions of the docs on `master` [here](https://pandera.readthedocs.io/en/latest/). 15 | 16 | #### Documentation problem 17 | 18 | [this should provide a description of what documentation you believe needs to be fixed/improved] 19 | 20 | #### Suggested fix for documentation 21 | 22 | [this should explain the suggested fix and **why** it's better than the existing documentation] 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/submit-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Submit Question 3 | about: Ask a general question about pandera 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Question about pandera 11 | 12 | **Note**: If you'd still like to submit a question, please read [this guide]( 13 | https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question. 14 | 15 | ```python 16 | # Your code here, if applicable 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /.github/config.yml: -------------------------------------------------------------------------------- 1 | # Comment to be posted on PRs from first-time contributors in your repository 2 | newPRWelcomeComment: | 3 | Thank you for opening this pull request! 🙌 4 | 5 | These tips will help get your PR across the finish line: 6 | 7 | - If you haven't already, check out the [Contributing Guide](https://pandera.readthedocs.io/en/stable/CONTRIBUTING.html) 8 | - Sign off your commits (Reference: [DCO Guide](https://github.com/src-d/guide/blob/master/developer-community/fix-DCO.md)). 9 | 10 | # Comment to be posted to on pull requests merged by a first time user 11 | firstPRMergeComment: > 12 | Congrats on merging your first pull request! 
🎉 13 | 14 | # Comment to be posted on first-time issues 15 | newIssueWelcomeComment: > 16 | Thank you for opening your first issue here! 🛠 17 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build_wheel_and_sdist: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | with: 13 | fetch-depth: "0" 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.x" 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install build twine 22 | - name: Build wheel and sdist 23 | run: python -m build 24 | shell: bash 25 | - uses: actions/upload-artifact@v4 26 | with: 27 | name: pandera-artifact 28 | path: ./dist 29 | 30 | pypi-publish: 31 | name: Upload release to PyPI 32 | needs: [build_wheel_and_sdist] 33 | runs-on: ubuntu-latest 34 | permissions: 35 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 36 | environment: release 37 | steps: 38 | - uses: actions/download-artifact@v4 39 | with: 40 | name: pandera-artifact 41 | path: dist 42 | - run: ls dist 43 | - name: Publish package distributions to PyPI 44 | uses: pypa/gh-action-pypi-publish@release/v1 45 | with: 46 | attestations: false 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pandera/_version.py 2 | uv.lock 3 | *.db 4 | .vscode 5 | dask-worker-space 6 | spark-warehouse 7 | docs/source/_contents 8 | docs/jupyter_execute 9 | **.DS_Store 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | 116 | # Pycharm settings 117 | .idea 118 | 119 | # Airspeed Velocity Benchmarks 120 | /asv_bench/html/ 121 | /asv_bench/results/ 122 | 123 | # Docs 124 | docs/source/reference/generated 125 | 126 | # Nox 127 | .nox 128 | .nox-* 129 | 130 | # ignore markdown files copied from .github 131 | docs/source/CONTRIBUTING.md 132 | .aider* 133 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: (^asv_bench|setup.py|requirements-dev.txt) 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.1.0 6 | hooks: 7 | - id: check-ast 8 | description: Simply check whether files parse as valid python 9 | - id: check-case-conflict 10 | description: Check for files that would conflict in case-insensitive filesystems 11 | - id: check-merge-conflict 12 | description: Check for files that contain merge conflict strings 13 | - id: check-yaml 14 | description: Attempts to load all yaml files to verify syntax 15 | - id: debug-statements 16 | description: Check for debugger imports and py37+ breakpoint() calls in python source 17 | - id: end-of-file-fixer 18 | description: Makes sure files end in a newline and only a newline 19 | - id: trailing-whitespace 20 | description: Trims trailing whitespace 21 | - id: mixed-line-ending 22 | description: Replaces or checks mixed line ending 23 | 24 | - repo: https://github.com/pre-commit/mirrors-isort 25 | rev: v5.10.1 26 | hooks: 27 | - id: isort 28 | args: ["--line-length=79", "--skip=docs/source/conf.py", "--diff"] 29 | 30 | - repo: https://github.com/ikamensh/flynt 31 | rev: "0.76" 32 | hooks: 33 | - id: flynt 34 | 35 | - repo: https://github.com/psf/black 36 | rev: 24.4.2 37 | hooks: 38 | - id: black 39 | 40 | - repo: https://github.com/asottile/pyupgrade 41 | rev: v3.19.1 42 | hooks: 43 | - id: pyupgrade 44 | args: [--py38-plus, --keep-runtime-typing] 45 | 46 | - repo: https://github.com/pycqa/pylint 47 | rev: v3.3.6 48 | hooks: 49 | - id: pylint 50 | args: ["--disable=import-error"] 51 | exclude: (^docs/|^scripts) 52 | 53 | - repo: https://github.com/pre-commit/mirrors-mypy 54 | rev: v1.10.0 55 | hooks: 56 | - id: mypy 57 | additional_dependencies: 58 | - types-click 59 | - types-pytz 60 | - types-pyyaml 61 | - types-requests 62 | - types-setuptools 63 | - polars 64 
| args: ["pandera", "tests", "scripts"] 65 | exclude: (^docs/|^tests/mypy/modules/) 66 | pass_filenames: false 67 | require_serial: true 68 | verbose: true 69 | 70 | - repo: https://github.com/codespell-project/codespell 71 | rev: v2.4.1 72 | hooks: 73 | - id: codespell 74 | additional_dependencies: 75 | - tomli -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [BASIC] 2 | ignore=mypy.py,noxfile.py,pandera/accessors/pyspark_sql_accessor.py,pandera/engines/pyspark_engine.py,pandera/pyspark.py,pandera/typing/pyspark_sql.py, 3 | ignore-patterns=pandera/api/pyspark/*,tests/pyspark/* 4 | good-names= 5 | T, 6 | F, 7 | logger, 8 | df, 9 | fn, 10 | i, 11 | e, 12 | x, 13 | f, 14 | k, 15 | v, 16 | fp, 17 | bar, 18 | eq, 19 | ne, 20 | gt, 21 | ge, 22 | lt, 23 | le, 24 | dt, 25 | tz, 26 | TBaseModel, 27 | TArraySchemaBase, 28 | TDataFrameModel, 29 | _DataType 30 | 31 | [MESSAGES CONTROL] 32 | disable= 33 | # C0330 conflicts with black: https://github.com/psf/black/issues/48 34 | R0913, 35 | duplicate-code, 36 | too-many-instance-attributes, 37 | no-else-return, 38 | inconsistent-return-statements, 39 | protected-access, 40 | too-many-ancestors, 41 | too-many-lines, 42 | too-few-public-methods, 43 | line-too-long, 44 | ungrouped-imports, 45 | function-redefined, 46 | arguments-differ, 47 | unnecessary-dunder-call, 48 | use-dict-literal, 49 | invalid-name, 50 | import-outside-toplevel, 51 | missing-class-docstring, 52 | missing-function-docstring, 53 | fixme, 54 | too-many-locals, 55 | redefined-outer-name, 56 | logging-fstring-interpolation, 57 | multiple-statements, 58 | cyclic-import, 59 | too-many-positional-arguments, 60 | too-many-function-args, 61 | # Due to custom `immutable` decorator replacing `dataclasses.dataclass` 62 | invalid-field-call 63 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-24.04 10 | apt_packages: 11 | # Install OpenJDK as Java backend to run PySpark examples. 
12 | - openjdk-11-jre-headless 13 | tools: 14 | python: "3.11" 15 | jobs: 16 | post_install: 17 | - pip install uv 18 | - UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --all-extras --all-groups --link-mode=copy 19 | 20 | sphinx: 21 | configuration: docs/source/conf.py 22 | 23 | 24 | # Optionally build your docs in additional formats such as PDF and ePub 25 | formats: [] 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Niels Bantilan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: tests clean clean-pyc upload-pypi-test upload-pypi requirements docs \ 2 | code-cov docs-clean requirements-dev.txt 3 | 4 | clean: 5 | python setup.py clean 6 | 7 | clean-pyc: 8 | find . 
-name '*.pyc' -exec rm {} \; 9 | 10 | upload-pypi-test: 11 | python setup.py sdist bdist_wheel && \ 12 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* && \ 13 | rm -rf dist 14 | 15 | upload-pypi: 16 | python setup.py sdist bdist_wheel && \ 17 | twine upload dist/* && \ 18 | rm -rf dist 19 | 20 | .PHONY: install-uv 21 | install-uv: 22 | pip install uv 23 | 24 | setup: install-uv 25 | uv sync --all-extras 26 | 27 | setup-macos: install-uv 28 | uv sync --all-extras 29 | uv pip install polars-lts-cpu 30 | 31 | docs-clean: 32 | rm -rf docs/source/reference/generated docs/**/generated docs/**/methods docs/_build docs/source/_contents 33 | 34 | docs: docs-clean 35 | python -m sphinx -W -E "docs/source" "docs/_build" && make -C docs doctest 36 | 37 | quick-docs: 38 | python -m sphinx -E "docs/source" "docs/_build" && make -C docs doctest 39 | 40 | code-cov: 41 | pytest --cov-report=html --cov=pandera tests/ 42 | 43 | NOX_FLAGS ?= "-r" 44 | 45 | deps-from-environment.yml: 46 | python scripts/generate_pip_deps_from_conda.py 47 | 48 | unit-tests: 49 | pytest tests/core 50 | 51 | nox-tests: 52 | nox -db uv -s tests ${NOX_FLAGS} 53 | -------------------------------------------------------------------------------- /asv_bench/README.md: -------------------------------------------------------------------------------- 1 | # Airspeed Velocity 2 | 3 | `pandera`'s performance benchmarks over time can be [viewed on this airspeed-velocity dashboard](https://pandera-dev.github.io/pandera-asv-logs/). 4 | 5 | The [config](https://github.com/pandera-dev/pandera-asv-logs/tree/master/asv_bench/asv.conf.json) and [results files](https://github.com/pandera-dev/pandera-asv-logs/tree/master/results) files are tracked in the [pandera-asv-logs](https://github.com/pandera-dev/pandera-asv-logs) repo to avoid build files in the main repo. 6 | 7 | The [benchmarks](https://github.com/pandera-dev/pandera/tree/master/benchmarks/) are tracked in the main [pandera repo](https://github.com/pandera-dev/pandera). 8 | 9 | ## Running `asv` 10 | 11 | Ensure both the `pandera` and `pandera-asv-logs` repos are checked out to the same parent directory. 12 | 13 | From the `pandera-asv-logs` repo, run: 14 | ``` 15 | asv run ALL --config asv_bench/asv.conf.json 16 | ``` 17 | 18 | ## Publishing results: 19 | 20 | To build the html and preview the results: 21 | ``` 22 | asv publish --config asv_bench/asv.conf.json 23 | asv preview --config asv_bench/asv.conf.json 24 | ``` 25 | 26 | The `.json` results files are committed or PR'd into the master branch of `pandera-asv-logs`. 27 | 28 | The published html is pushed directly to the gh-pages branch of `pandera-asv-logs` by running: 29 | 30 | ``` 31 | asv gh-pages --rewrite --config asv_bench/asv.conf.json 32 | ``` 33 | 34 | The `--rewrite` flag overwrites the existing `gh-pages`, avoiding duplication of data. 35 | 36 | The `asv` docs are [here](https://asv.readthedocs.io/en/stable/index.html). 
37 | -------------------------------------------------------------------------------- /asv_bench/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/asv_bench/benchmarks/__init__.py -------------------------------------------------------------------------------- /asv_bench/benchmarks/dataframe_schema.py: -------------------------------------------------------------------------------- 1 | # Airspeed Velocity Benchmarks for pandera 2 | import pandas as pd 3 | 4 | from pandera.pandas import ( 5 | Column, 6 | DataFrameSchema, 7 | Bool, 8 | Category, 9 | Check, 10 | DateTime, 11 | Float, 12 | Int, 13 | Object, 14 | String, 15 | Timedelta, 16 | check_input, 17 | check_output, 18 | ) 19 | 20 | 21 | class Validate: 22 | """ 23 | Benchmarking schema.validate 24 | """ 25 | 26 | def setup(self): 27 | self.schema = DataFrameSchema( 28 | { 29 | "a": Column(Int), 30 | "b": Column(Float), 31 | "c": Column(String), 32 | "d": Column(Bool), 33 | "e": Column(Category), 34 | "f": Column(Object), 35 | "g": Column(DateTime), 36 | "i": Column(Timedelta), 37 | }, 38 | ) 39 | self.df = pd.DataFrame( 40 | { 41 | "a": [1, 2, 3], 42 | "b": [1.1, 2.5, 9.9], 43 | "c": ["z", "y", "x"], 44 | "d": [True, True, False], 45 | "e": pd.Series(["c2", "c1", "c3"], dtype="category"), 46 | "f": [(3,), (2,), (1,)], 47 | "g": [ 48 | pd.Timestamp("2015-02-01"), 49 | pd.Timestamp("2015-02-02"), 50 | pd.Timestamp("2015-02-03"), 51 | ], 52 | "i": [ 53 | pd.Timedelta(1, unit="D"), 54 | pd.Timedelta(5, unit="D"), 55 | pd.Timedelta(9, unit="D"), 56 | ], 57 | } 58 | ) 59 | 60 | def time_df_schema(self): 61 | self.schema.validate(self.df) 62 | 63 | def mem_df_schema(self): 64 | self.schema.validate(self.df) 65 | 66 | def peakmem_df_schema(self): 67 | self.schema.validate(self.df) 68 | 69 | 70 | class Decorators: 71 | """ 72 | Benchmarking input and output decorator performance. 
73 | """ 74 | 75 | def transformer(df): 76 | return df.assign(column2=[1, 2, 3]) 77 | 78 | def setup(self): 79 | self.in_schema = DataFrameSchema({"column1": Column(String)}) 80 | self.out_schema = DataFrameSchema({"column2": Column(Int)}) 81 | self.df = pd.DataFrame({"column1": ["a", "b", "c"]}) 82 | 83 | def time_check_input(self): 84 | @check_input(self.in_schema) 85 | def transform_first_arg(self): 86 | return Decorators.transformer(self.df) 87 | 88 | def mem_check_input(self): 89 | @check_input(self.in_schema) 90 | def transform_first_arg(self): 91 | return Decorators.transformer(self.df) 92 | 93 | def peakmem_check_input(self): 94 | @check_input(self.in_schema) 95 | def transform_first_arg(self): 96 | return Decorators.transformer(self.df) 97 | 98 | def time_check_output(self): 99 | @check_output(self.out_schema) 100 | def transform_first_arg(self): 101 | return Decorators.transformer(self.df) 102 | 103 | def mem_check_output(self): 104 | @check_output(self.out_schema) 105 | def transform_first_arg(self): 106 | return Decorators.transformer(self.df) 107 | 108 | def peakmem_check_output(self): 109 | @check_output(self.out_schema) 110 | def transform_first_arg(self): 111 | return Decorators.transformer(self.df) 112 | -------------------------------------------------------------------------------- /asv_bench/benchmarks/series_schema.py: -------------------------------------------------------------------------------- 1 | # Airspeed Velocity Benchmarks for pandera 2 | import pandas as pd 3 | 4 | from pandera.pandas import ( 5 | Column, 6 | DataFrameSchema, 7 | SeriesSchema, 8 | Bool, 9 | Category, 10 | Check, 11 | DateTime, 12 | Float, 13 | Int, 14 | Object, 15 | String, 16 | Timedelta, 17 | String, 18 | ) 19 | 20 | 21 | class Validate: 22 | """ 23 | Benchmarking Series schema.validate 24 | """ 25 | 26 | def setup(self): 27 | self.schema = SeriesSchema( 28 | String, 29 | checks=[ 30 | Check(lambda s: s.str.startswith("foo")), 31 | Check(lambda s: s.str.endswith("bar")), 32 | Check(lambda x: len(x) > 3, element_wise=True), 33 | ], 34 | nullable=False, 35 | unique=False, 36 | name="my_series", 37 | ) 38 | self.series = pd.Series(["foobar", "foobar", "foobar"], name="my_series") 39 | 40 | def time_series_schema(self): 41 | self.schema.validate(self.series) 42 | 43 | def mem_series_schema(self): 44 | self.schema.validate(self.series) 45 | 46 | def peakmem_series_schema(self): 47 | self.schema.validate(self.series) 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/custom.js: -------------------------------------------------------------------------------- 1 | // Add event listener for DOMContentLoaded event 2 | window.addEventListener("DOMContentLoaded", function() { 3 | // Select all elements with class "external" 4 | var externalLinks = document.querySelectorAll("a.external"); 5 | 6 | // Loop through each element with class "external" 7 | externalLinks.forEach(function(link) { 8 | // Set the target attribute to "_blank" 9 | link.setAttribute("target", "_blank"); 10 | }); 11 | }); 12 | 13 | 14 | function setHtmlDataTheme() { 15 | // Set theme at the root html element 16 | setTimeout(() => { 17 | const theme = document.body.dataset.theme; 18 | const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches; 19 | 20 | if (theme === "auto") { 21 | document.documentElement.dataset.theme = prefersDark ? "dark" : "light"; 22 | } else { 23 | document.documentElement.dataset.theme = theme; 24 | } 25 | }, 10) 26 | } 27 | 28 | function setupAlgoliaTheme() { 29 | // To get darkmode in the algolia search modal, we need to set the theme in 30 | // the root html element. This function propagates the theme set by furo 31 | // that's set in the body element. 32 | const buttons = document.getElementsByClassName("theme-toggle"); 33 | 34 | // set for initial document load 35 | setHtmlDataTheme(); 36 | 37 | // listen for when theme button is clicked. 38 | Array.from(buttons).forEach((btn) => { 39 | btn.addEventListener("click", setHtmlDataTheme); 40 | }); 41 | } 42 | 43 | function main() { 44 | setupAlgoliaTheme() 45 | } 46 | 47 | document.addEventListener('DOMContentLoaded', main); 48 | window.addEventListener('keydown', (event) => { 49 | if (event.code === "Escape") { 50 | // make sure to prevent default behavior with escape key so that algolia 51 | // modal can be closed properly. 
52 | event.preventDefault(); 53 | } 54 | }); 55 | -------------------------------------------------------------------------------- /docs/source/_static/docsearch_config.js_t: -------------------------------------------------------------------------------- 1 | docsearch({ 2 | container: "{{ docsearch_container|default('#docsearch') }}", 3 | appId: "{{ docsearch_app_id }}", 4 | apiKey: "{{ docsearch_api_key }}", 5 | indexName: "{{ docsearch_index_name }}", 6 | {%- if docsearch_search_parameters %} 7 | searchParameters: { 8 | {% for key, value in docsearch_search_parameters.items() %} 9 | {{ key }}: {% if value is string %}"{{ value }}"{% else %}{{ value }}{% endif %}{% if not loop.last %},{% endif %} 10 | {% endfor %} 11 | } 12 | {%- endif %} 13 | }); 14 | -------------------------------------------------------------------------------- /docs/source/_static/pandera-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-banner.png -------------------------------------------------------------------------------- /docs/source/_static/pandera-favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-favicon.png -------------------------------------------------------------------------------- /docs/source/_static/pandera-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-logo.png -------------------------------------------------------------------------------- /docs/source/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | 7 | {% block attributes %} 8 | {% if attributes %} 9 | .. rubric:: Attributes 10 | 11 | .. autosummary:: 12 | :nosignatures: 13 | 14 | {% for item in attributes %} 15 | ~{{ name }}.{{ item }} 16 | {%- endfor %} 17 | 18 | {% endif %} 19 | {% endblock %} 20 | 21 | {% block methods %} 22 | {% if methods %} 23 | .. rubric:: Methods 24 | 25 | {% for item in methods %} 26 | {%- if item not in inherited_members %} 27 | .. automethod:: {{ item }} 28 | {% endif %} 29 | {%- endfor %} 30 | 31 | {% endif %} 32 | 33 | {%- if members and '__call__' in members %} 34 | .. automethod:: __call__ 35 | {%- endif %} 36 | 37 | {% endblock %} 38 | -------------------------------------------------------------------------------- /docs/source/_templates/dtype.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | 7 | {% block attributes %} 8 | {% if attributes %} 9 | .. rubric:: Attributes 10 | 11 | .. autosummary:: 12 | :nosignatures: 13 | 14 | {% for item in attributes %} 15 | ~{{ name }}.{{ item }} 16 | {%- endfor %} 17 | 18 | {% endif %} 19 | {% endblock %} 20 | 21 | {% block methods %} 22 | {% if methods %} 23 | .. rubric:: Methods 24 | 25 | {% for item in methods %} 26 | .. automethod:: {{ item }} 27 | {%- endfor %} 28 | 29 | {%- if members and '__call__' in members %} 30 | .. 
automethod:: __call__ 31 | {%- endif %} 32 | 33 | {%- endif %} 34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /docs/source/_templates/model_component_class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :show-inheritance: 7 | :exclude-members: 8 | 9 | {% block attributes %} 10 | {% if attributes %} 11 | .. rubric:: Attributes 12 | 13 | .. autosummary:: 14 | :nosignatures: 15 | 16 | {% for item in attributes %} 17 | ~{{ name }}.{{ item }} 18 | {%- endfor %} 19 | 20 | {% endif %} 21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /docs/source/_templates/module.rst: -------------------------------------------------------------------------------- 1 | .. empty 2 | 3 | {{ fullname | escape | underline }} 4 | 5 | .. currentmodule:: {{ fullname }} 6 | 7 | .. automodule:: {{ fullname }} 8 | 9 | {% block classes %} 10 | 11 | {% for item in classes %} 12 | .. autoclass:: {{ item }} 13 | :members: 14 | :member-order: bysource 15 | :show-inheritance: 16 | :exclude-members: 17 | {%- endfor %} 18 | 19 | {% endblock %} 20 | 21 | {% block functions %} 22 | 23 | {% for item in functions %} 24 | .. autofunction:: {{ item }} 25 | {%- endfor %} 26 | 27 | {% endblock %} 28 | -------------------------------------------------------------------------------- /docs/source/_templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "!page.html" %} 2 | 3 | {% block body -%} 4 | {{ super() }} 5 | 6 | 11 | 12 | {%- endblock %} 13 | -------------------------------------------------------------------------------- /docs/source/_templates/sidebar/search.html: -------------------------------------------------------------------------------- 1 |
2 | -------------------------------------------------------------------------------- /docs/source/_templates/strategies_module.rst: -------------------------------------------------------------------------------- 1 | .. empty 2 | 3 | {{ fullname | escape | underline }} 4 | 5 | .. currentmodule:: {{ fullname }} 6 | 7 | .. automodule:: {{ fullname }} 8 | 9 | {% block functions %} 10 | 11 | {% for item in functions %} 12 | {% if item not in ["null_dataframe_masks", "null_field_masks", "set_pandas_index", "strategy_import_error"] %} 13 | .. autofunction:: {{ item }} 14 | {% endif %} 15 | {%- endfor %} 16 | 17 | {% endblock %} 18 | -------------------------------------------------------------------------------- /docs/source/configuration.md: -------------------------------------------------------------------------------- 1 | (configuration)= 2 | 3 | # Configuration 4 | 5 | *New in version 0.17.3* 6 | 7 | `pandera` provides a global config `~pandera.config.PanderaConfig`. The 8 | global configuration is available through `pandera.config.CONFIG`. It can also 9 | be modified with a configuration context `~pandera.config.config_context` and 10 | fetched with `~pandera.config.get_config_context` in custom code. 11 | 12 | This configuration can also be set using environment variables. 13 | 14 | ## Validation depth 15 | 16 | Validation depth determines whether pandera only runs schema-level validations 17 | (column names and datatypes), data-level validations (checks on actual values), 18 | or both: 19 | 20 | ``` 21 | export PANDERA_VALIDATION_ENABLED=False 22 | export PANDERA_VALIDATION_DEPTH=DATA_ONLY # SCHEMA_AND_DATA, SCHEMA_ONLY, DATA_ONLY 23 | ``` 24 | 25 | ## Enabling/disabling validation 26 | 27 | Runtime data validation incurs a performance overhead. To mitigate this in the 28 | appropriate contexts, you have the option to disable validation globally. 29 | 30 | This can be achieved by setting the environment variable 31 | `PANDERA_VALIDATION_ENABLED=False`. When validation is disabled, any 32 | `validate` call will not actually run any validation checks. 33 | -------------------------------------------------------------------------------- /docs/source/dask.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (scaling-dask)= 9 | 10 | # Data Validation with Dask 11 | 12 | *new in 0.8.0* 13 | 14 | [Dask](https://docs.dask.org/en/latest/dataframe.html) is a distributed 15 | compute framework that offers a pandas-like dataframe API. 16 | You can use pandera to validate {py:func}`~dask.dataframe.DataFrame` 17 | and {py:func}`~dask.dataframe.Series` objects directly. First, install 18 | `pandera` with the `dask` extra: 19 | 20 | ```bash 21 | pip install 'pandera[dask]' 22 | ``` 23 | 24 | Then you can use pandera schemas to validate dask dataframes. In the example 25 | below we'll use the {ref}`class-based API ` to define a 26 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation. 
27 | 28 | ```{code-cell} python 29 | import dask.dataframe as dd 30 | import pandas as pd 31 | import pandera.pandas as pa 32 | 33 | from pandera.typing.dask import DataFrame, Series 34 | 35 | 36 | class Schema(pa.DataFrameModel): 37 | state: Series[str] 38 | city: Series[str] 39 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20}) 40 | 41 | 42 | ddf = dd.from_pandas( 43 | pd.DataFrame( 44 | { 45 | 'state': ['FL','FL','FL','CA','CA','CA'], 46 | 'city': [ 47 | 'Orlando', 48 | 'Miami', 49 | 'Tampa', 50 | 'San Francisco', 51 | 'Los Angeles', 52 | 'San Diego', 53 | ], 54 | 'price': [8, 12, 10, 16, 20, 18], 55 | } 56 | ), 57 | npartitions=2 58 | ) 59 | pandera_ddf = Schema(ddf) 60 | pandera_ddf 61 | ``` 62 | 63 | As you can see, passing the dask dataframe into `Schema` will produce 64 | another dask dataframe which hasn't been evaluated yet. What this means is 65 | that pandera will only validate when the dask graph is evaluated. 66 | 67 | ```{code-cell} python 68 | pandera_ddf.compute() 69 | ``` 70 | 71 | You can also use the {py:func}`~pandera.check_types` decorator to validate 72 | dask dataframes at runtime: 73 | 74 | ```{code-cell} python 75 | @pa.check_types 76 | def function(ddf: DataFrame[Schema]) -> DataFrame[Schema]: 77 | return ddf[ddf["state"] == "CA"] 78 | 79 | function(ddf).compute() 80 | ``` 81 | 82 | And of course, you can use the object-based API to validate dask dataframes: 83 | 84 | ```{code-cell} python 85 | schema = pa.DataFrameSchema({ 86 | "state": pa.Column(str), 87 | "city": pa.Column(str), 88 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20)) 89 | }) 90 | schema(ddf).compute() 91 | ``` 92 | -------------------------------------------------------------------------------- /docs/source/drop_invalid_rows.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (drop-invalid-rows)= 9 | 10 | # Dropping Invalid Rows 11 | 12 | *New in version 0.16.0* 13 | 14 | If you wish to use the validation step to remove invalid data, you can pass the 15 | `drop_invalid_rows=True` argument to the `schema` object on creation. On `schema.validate()`, 16 | if a data-level check fails, then that row which caused the failure will be removed from the dataframe 17 | when it is returned. 18 | 19 | `drop_invalid_rows` will prevent data-level schema errors from being raised and will instead 20 | remove the rows which cause the failure. 21 | 22 | This functionality is available on `DataFrameSchema`, `SeriesSchema`, `Column`, 23 | as well as `DataFrameModel` schemas. 24 | 25 | **Note** that this functionality works by identifying the index or multi-index of the failing rows. 26 | If the index is not unique on the dataframe, this could result in incorrect rows being dropped. 
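For example, a minimal sketch of guarding against a non-unique index before dropping invalid rows (the `schema` and `df` names here are illustrative and mirror the examples that follow):

```python
import pandas as pd
import pandera.pandas as pa

# illustrative schema, mirroring the examples below
schema = pa.DataFrameSchema(
    {"counter": pa.Column(int, checks=[pa.Check(lambda x: x >= 3)])},
    drop_invalid_rows=True,
)

# duplicated index labels: dropping by index label could also remove valid rows
df = pd.DataFrame({"counter": [1, 3]}, index=[0, 0])

# resetting to a unique index first makes the row-dropping unambiguous
validated = schema.validate(df.reset_index(drop=True), lazy=True)
```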
27 | 28 | Dropping invalid rows with {class}`~pandera.api.pandas.container.DataFrameSchema`: 29 | 30 | ```{code-cell} python 31 | import pandas as pd 32 | import pandera.pandas as pa 33 | 34 | 35 | df = pd.DataFrame({"counter": [1, 2, 3]}) 36 | schema = pa.DataFrameSchema( 37 | {"counter": pa.Column(int, checks=[pa.Check(lambda x: x >= 3)])}, 38 | drop_invalid_rows=True, 39 | ) 40 | 41 | schema.validate(df, lazy=True) 42 | ``` 43 | 44 | Dropping invalid rows with {class}`~pandera.api.pandas.array.SeriesSchema`: 45 | 46 | ```{code-cell} python 47 | import pandas as pd 48 | import pandera.pandas as pa 49 | 50 | 51 | series = pd.Series([1, 2, 3]) 52 | schema = pa.SeriesSchema( 53 | int, 54 | checks=[pa.Check(lambda x: x >= 3)], 55 | drop_invalid_rows=True, 56 | ) 57 | 58 | schema.validate(series, lazy=True) 59 | ``` 60 | 61 | Dropping invalid rows with {class}`~pandera.api.pandas.components.Column`: 62 | 63 | ```{code-cell} python 64 | import pandas as pd 65 | import pandera.pandas as pa 66 | 67 | 68 | df = pd.DataFrame({"counter": [1, 2, 3]}) 69 | schema = pa.Column( 70 | int, 71 | name="counter", 72 | drop_invalid_rows=True, 73 | checks=[pa.Check(lambda x: x >= 3)] 74 | ) 75 | 76 | schema.validate(df, lazy=True) 77 | ``` 78 | 79 | Dropping invalid rows with {class}`~pandera.api.pandas.model.DataFrameModel`: 80 | 81 | ```{code-cell} python 82 | import pandas as pd 83 | import pandera.pandas as pa 84 | 85 | 86 | class MySchema(pa.DataFrameModel): 87 | counter: int = pa.Field(in_range={"min_value": 3, "max_value": 5}) 88 | 89 | class Config: 90 | drop_invalid_rows = True 91 | 92 | 93 | MySchema.validate( 94 | pd.DataFrame({"counter": [1, 2, 3, 4, 5, 6]}), lazy=True 95 | ) 96 | ``` 97 | 98 | ```{note} 99 | In order to use `drop_invalid_rows=True`, `lazy=True` must 100 | be passed to `schema.validate()`. {ref}`lazy-validation` enables all schema 101 | errors to be collected and raised together, meaning all invalid rows can be dropped together. 102 | This provides a clear API for ensuring the validated dataframe contains only valid data. 103 | ``` 104 | -------------------------------------------------------------------------------- /docs/source/error_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | (error-report)= 6 | 7 | # Error Reports 8 | 9 | *new in 0.19.0* 10 | 11 | The pandera error report is a generalised machine-readable summary of failures 12 | which occurred during schema validation. It is available for both `pyspark.sql` and 13 | `pandas` objects. 14 | 15 | By default, error reports are generated for both schema and data level validation, 16 | but more granular control over schema-only or data-only validations is available. 17 | 18 | This is achieved by introducing configurable settings using environment variables 19 | that allow you to control execution at three different levels: 20 | 21 | 1. `SCHEMA_ONLY`: perform schema validations only. It checks that data conforms 22 | to the schema definition, but does not perform any data-level validations on the dataframe. 23 | 2. `DATA_ONLY`: perform data-level validations only. It validates that data 24 | conforms to the defined `checks`, but does not validate the schema. 25 | 3. `SCHEMA_AND_DATA`: (**default**) perform both schema and data level 26 | validations. It runs the most exhaustive validation and could be compute-intensive. 
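These validation depth levels can also be set programmatically. A minimal sketch, assuming the `config_context` and `ValidationDepth` utilities referenced in the configuration docs above:

```python
import pandas as pd
import pandera.pandas as pa
from pandera.config import ValidationDepth, config_context

schema = pa.DataFrameSchema({"length": pa.Column(int, pa.Check.gt(10))})
df = pd.DataFrame({"length": [4, 11]})

# only schema-level validations (column presence and dtypes) run in this block;
# the data-level Check.gt(10) is skipped
with config_context(validation_depth=ValidationDepth.SCHEMA_ONLY):
    schema.validate(df)
```

Outside the context manager, the globally configured validation depth applies again.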
27 | 28 | You can override the default behaviour by setting an environment variable from the terminal 29 | before running the `pandera` process as: 30 | 31 | ```bash 32 | export PANDERA_VALIDATION_DEPTH=SCHEMA_ONLY 33 | ``` 34 | 35 | This will be picked up by `pandera` to enforce only schema-level validations. 36 | 37 | ## Error reports with `pandas` 38 | 39 | To create an error report with pandas, you must specify `lazy=True` to allow all errors 40 | to be aggregated and raised together as a `SchemaErrors`. 41 | 42 | ```{code-cell} python 43 | import pandas as pd 44 | import pandera.pandas as pa 45 | import json 46 | 47 | pandas_schema = pa.DataFrameSchema( 48 | { 49 | "color": pa.Column(str, pa.Check.isin(["red", "green", "blue"])), 50 | "length": pa.Column(int, pa.Check.gt(10)), 51 | } 52 | ) 53 | data = [("red", 4), ("blue", 11), ("purple", 15), ("green", 39)] 54 | 55 | df = pd.DataFrame( 56 | { 57 | "color": ["red", "blue", "purple", "green"], 58 | "length": [4, 11, 15, 39], 59 | } 60 | ) 61 | 62 | try: 63 | pandas_schema.validate(df, lazy=True) 64 | except pa.errors.SchemaErrors as e: 65 | print(json.dumps(e.message, indent=2)) 66 | ``` 67 | 68 | ## Error reports with `pyspark.sql` 69 | 70 | Accessing the error report on a validated `pyspark` dataframe can be done via the 71 | `errors` attribute on the `pandera` accessor. 72 | 73 | ```{code-cell} python 74 | import pandera.pyspark as pa 75 | import pyspark.sql.types as T 76 | import json 77 | 78 | from decimal import Decimal 79 | from pyspark.sql import SparkSession 80 | from pandera.pyspark import DataFrameModel 81 | 82 | spark = SparkSession.builder.getOrCreate() 83 | 84 | class PysparkPanderSchema(DataFrameModel): 85 | color: T.StringType() = pa.Field(isin=["red", "green", "blue"]) 86 | length: T.IntegerType() = pa.Field(gt=10) 87 | 88 | data = [("red", 4), ("blue", 11), ("purple", 15), ("green", 39)] 89 | 90 | spark_schema = T.StructType( 91 | [ 92 | T.StructField("color", T.StringType(), False), 93 | T.StructField("length", T.IntegerType(), False), 94 | ], 95 | ) 96 | 97 | df = spark.createDataFrame(data, spark_schema) 98 | df_out = PysparkPanderSchema.validate(check_obj=df) 99 | 100 | print(json.dumps(dict(df_out.pandera.errors), indent=4)) 101 | ``` 102 | -------------------------------------------------------------------------------- /docs/source/fastapi.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | .. currentmodule:: pandera 3 | ``` 4 | 5 | (fastapi-integration)= 6 | 7 | # FastAPI 8 | 9 | *new in 0.9.0* 10 | 11 | Since both FastAPI and Pandera integrate seamlessly with Pydantic, you can 12 | use the {py:class}`~pandera.api.pandas.model.DataFrameModel` types to validate incoming 13 | or outgoing data with respect to your API endpoints. 14 | 15 | ## Using DataFrameModels to Validate Endpoint Inputs and Outputs 16 | 17 | Suppose we want to process transactions, where each transaction has an 18 | `id` and `cost`. We can model this with a pandera dataframe model: 19 | 20 | ```{literalinclude} ../../tests/fastapi/models.py 21 | :language: python 22 | :lines: 1-14 23 | ``` 24 | 25 | Also suppose that we expect our endpoint to add a `name` to the transaction 26 | data: 27 | 28 | ```{literalinclude} ../../tests/fastapi/models.py 29 | :language: python 30 | :lines: 22-25 31 | ``` 32 | 33 | Let's also assume that the output of the endpoint should be a list of dictionary 34 | records containing the named transactions data. 
We can do this easily with the 35 | `to_format` option in the dataframe model {py:class}`~pandera.typing.config.BaseConfig`. 36 | 37 | ```{literalinclude} ../../tests/fastapi/models.py 38 | :language: python 39 | :lines: 34-37 40 | ``` 41 | 42 | Note that the `to_format_kwargs` is a dictionary of keyword arguments 43 | to be passed into the respective pandas `to_{format}` method. 44 | 45 | % TODO: create new page for the to/from_format config option 46 | 47 | Next we'll create a FastAPI app and define a `/transactions/` POST endpoint: 48 | 49 | ```{literalinclude} ../../tests/fastapi/app.py 50 | :language: python 51 | :lines: 2-6,14-21,28-34 52 | ``` 53 | 54 | ## Reading File Uploads 55 | 56 | Similar to the `TransactionsDictOut` example, which converts dataframes to a 57 | particular format as an endpoint response, pandera also provides a 58 | `from_format` dataframe model configuration option to read a dataframe from 59 | a particular serialization format. 60 | 61 | ```{literalinclude} ../../tests/fastapi/models.py 62 | :language: python 63 | :lines: 17-19 64 | ``` 65 | 66 | Let's also define a response model for the `/file/` upload endpoint: 67 | 68 | ```{literalinclude} ../../tests/fastapi/models.py 69 | :language: python 70 | :lines: 28-32,46-48 71 | ``` 72 | 73 | In the next example, we use the pandera 74 | {py:class}`~pandera.typing.fastapi.UploadFile` type to upload a parquet file 75 | to the `/file/` POST endpoint and return a response containing the filename 76 | and the modified data in json format. 77 | 78 | ```{literalinclude} ../../tests/fastapi/app.py 79 | :language: python 80 | :lines: 37-44 81 | ``` 82 | 83 | Pandera's {py:class}`~pandera.typing.fastapi.UploadFile` type is a subclass of FastAPI's 84 | [UploadFile](https://fastapi.tiangolo.com/tutorial/request-files/?h=uploadfile#uploadfile) 85 | but it exposes a `.data` property containing the pandera-validated dataframe. 86 | 87 | ## Takeaway 88 | 89 | With the FastAPI and Pandera integration, you can use Pandera 90 | {py:class}`~pandera.api.pandas.model.DataFrameModel` types to validate the dataframe inputs 91 | and outputs of your FastAPI endpoints. 92 | -------------------------------------------------------------------------------- /docs/source/frictionless.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | .. currentmodule:: pandera 3 | ``` 4 | 5 | (frictionless-integration)= 6 | 7 | # Reading Third-Party Schema 8 | 9 | *new in 0.7.0* 10 | 11 | Pandera now accepts schemas from other data validation frameworks. This requires 12 | a pandera installation with the `io` extension; please see the 13 | {ref}`installation` instructions for more details. 14 | 15 | ## Frictionless Data Schema 16 | 17 | :::{note} 18 | Please see the 19 | [Frictionless schema](https://specs.frictionlessdata.io/table-schema/) 20 | documentation for more information on this standard. 21 | ::: 22 | 23 | ```{eval-rst} 24 | .. autofunction:: pandera.io.from_frictionless_schema 25 | ``` 26 | 27 | Under the hood, this uses the {class}`~pandera.io.pandas_io.FrictionlessFieldParser` class 28 | to parse each frictionless field (column): 29 | 30 | ```{eval-rst} 31 | .. autoclass:: pandera.io.pandas_io.FrictionlessFieldParser 32 | :members: 33 | ``` 34 | -------------------------------------------------------------------------------- /docs/source/geopandas.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{eval-rst} 6 | .. 
currentmodule:: pandera 7 | ``` 8 | 9 | (supported-lib-geopandas)= 10 | 11 | # Data Validation with GeoPandas 12 | 13 | *new in 0.9.0* 14 | 15 | [GeoPandas](https://geopandas.org/en/stable/docs.html) is an extension of Pandas that adds 16 | support for geospatial data. You can use pandera to validate {py:func}`~geopandas.GeoDataFrame` 17 | and {py:func}`~geopandas.GeoSeries` objects directly. First, install 18 | `pandera` with the `geopandas` extra: 19 | 20 | ```bash 21 | pip install 'pandera[geopandas]' 22 | ``` 23 | 24 | Then you can use pandera schemas to validate geodataframes. In the example 25 | below we'll use the {ref}`class-based API ` to define a 26 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation. 27 | 28 | ```{code-cell} python 29 | import geopandas as gpd 30 | import pandas as pd 31 | import pandera.pandas as pa 32 | from shapely.geometry import Polygon 33 | 34 | geo_schema = pa.DataFrameSchema({ 35 | "geometry": pa.Column("geometry"), 36 | "region": pa.Column(str), 37 | }) 38 | 39 | geo_df = gpd.GeoDataFrame({ 40 | "geometry": [ 41 | Polygon(((0, 0), (0, 1), (1, 1), (1, 0))), 42 | Polygon(((0, 0), (0, -1), (-1, -1), (-1, 0))) 43 | ], 44 | "region": ["NA", "SA"] 45 | }) 46 | 47 | geo_schema.validate(geo_df) 48 | ``` 49 | 50 | You can also use the `GeometryDtype` data type in either instantiated or 51 | un-instantiated form: 52 | 53 | ```{code-cell} python 54 | geo_schema = pa.DataFrameSchema({ 55 | "geometry": pa.Column(gpd.array.GeometryDtype), 56 | # or 57 | "geometry": pa.Column(gpd.array.GeometryDtype()), 58 | }) 59 | ``` 60 | 61 | If you want to validate-on-instantiation, you can use the 62 | {py:class}`~pandera.typing.geopangas.GeoDataFrame` generic type with the 63 | dataframe model defined above: 64 | 65 | ```{code-cell} python 66 | from pandera.typing import Series 67 | from pandera.typing.geopandas import GeoDataFrame, GeoSeries 68 | 69 | 70 | class Schema(pa.DataFrameModel): 71 | geometry: GeoSeries 72 | region: Series[str] 73 | 74 | 75 | # create a geodataframe that's validated on object initialization 76 | df = GeoDataFrame[Schema]( 77 | { 78 | 'geometry': [ 79 | Polygon(((0, 0), (0, 1), (1, 1), (1, 0))), 80 | Polygon(((0, 0), (0, -1), (-1, -1), (-1, 0))) 81 | ], 82 | 'region': ['NA','SA'] 83 | } 84 | ) 85 | df 86 | ``` 87 | -------------------------------------------------------------------------------- /docs/source/integrations.md: -------------------------------------------------------------------------------- 1 | (integrations)= 2 | 3 | # Integrations 4 | 5 | Pandera ships with integrations with other tools in the Python ecosystem, with 6 | the goal of interoperating with libraries that you know and love. 7 | 8 | ```{eval-rst} 9 | .. list-table:: 10 | :widths: 25 75 11 | 12 | * - :ref:`FastAPI ` 13 | - Use pandera DataFrameModels in your FastAPI app 14 | * - :ref:`Frictionless ` 15 | - Convert frictionless schemas to pandera schemas 16 | * - :ref:`Hypothesis ` 17 | - Use the hypothesis library to generate valid data under your schema's constraints. 18 | * - :ref:`Mypy ` 19 | - Type-lint your pandas and pandera code with mypy for static type safety [experimental 🧪] 20 | * - :ref:`Pydantic ` 21 | - Use pandera DataFrameModels when defining your pydantic BaseModels 22 | ``` 23 | 24 | ```{toctree} 25 | :caption: Introduction 26 | :hidden: true 27 | :maxdepth: 1 28 | 29 | FastAPI 30 | Frictionless 31 | Hypothesis 32 | Mypy 33 | Pydantic 34 | ``` 35 | 36 | :::{note} 37 | Don't see a library that you want supported? 
Check out the 38 | [github issues](https://github.com/pandera-dev/pandera/issues) to see if 39 | that library is in the roadmap. If it isn't, open up a 40 | [new issue](https://github.com/pandera-dev/pandera/issues/new?assignees=&labels=enhancement&template=feature_request.md&title=) 41 | to add support for it! 42 | ::: 43 | -------------------------------------------------------------------------------- /docs/source/jupyterlite_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "LiteBuildConfig": { 3 | "federated_extensions": [], 4 | "ignore_sys_prefix": true, 5 | "piplite_urls": [] 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/source/lazy_validation.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (lazy-validation)= 9 | 10 | # Lazy Validation 11 | 12 | *New in version 0.4.0* 13 | 14 | By default, when you call the `validate` method on schema or schema component 15 | objects, a {class}`~pandera.errors.SchemaError` is raised as soon as one of the 16 | assumptions specified in the schema is falsified. For example, for a 17 | {class}`~pandera.api.pandas.container.DataFrameSchema` object, the following situations will raise an 18 | exception: 19 | 20 | - a column specified in the schema is not present in the dataframe. 21 | - if `strict=True`, a column in the dataframe is not specified in the schema. 22 | - the `data type` does not match. 23 | - if `coerce=True`, the dataframe column cannot be coerced into the specified 24 | `data type`. 25 | - the {class}`~pandera.api.checks.Check` specified in one of the columns returns `False` or 26 | a boolean series containing at least one `False` value. 27 | 28 | For example: 29 | 30 | ```{code-cell} python 31 | import pandas as pd 32 | import pandera.pandas as pa 33 | 34 | 35 | df = pd.DataFrame({"column": ["a", "b", "c"]}) 36 | 37 | schema = pa.DataFrameSchema({"column": pa.Column(int)}) 38 | 39 | try: 40 | schema.validate(df) 41 | except pa.errors.SchemaError as exc: 42 | print(exc) 43 | ``` 44 | 45 | For more complex cases, it is useful to see all of the errors raised during 46 | the `validate` call so that you can debug the causes of errors on different 47 | columns and checks. The `lazy` keyword argument in the `validate` method 48 | of all schemas and schema components gives you the option of doing just this: 49 | 50 | ```{code-cell} python 51 | import json 52 | 53 | import pandas as pd 54 | import pandera.pandas as pa 55 | 56 | 57 | schema = pa.DataFrameSchema( 58 | columns={ 59 | "int_column": pa.Column(int), 60 | "float_column": pa.Column(float, pa.Check.greater_than(0)), 61 | "str_column": pa.Column(str, pa.Check.equal_to("a")), 62 | "date_column": pa.Column(pa.DateTime), 63 | }, 64 | strict=True 65 | ) 66 | 67 | df = pd.DataFrame({ 68 | "int_column": ["a", "b", "c"], 69 | "float_column": [0, 1, 2], 70 | "str_column": ["a", "b", "d"], 71 | "unknown_column": None, 72 | }) 73 | 74 | try: 75 | schema.validate(df, lazy=True) 76 | except pa.errors.SchemaErrors as exc: 77 | print(json.dumps(exc.message, indent=2)) 78 | ``` 79 | 80 | As you can see from the output above, a {class}`~pandera.errors.SchemaErrors` 81 | exception is raised with a summary of the error counts and failure cases 82 | caught by the schema. This summary is called an {ref}`error-report`. 
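Since the report is a plain dictionary, you can also work with it programmatically rather than just printing it. As a minimal sketch, assuming the schema and dataframe defined above, the following lists the top-level sections of the report:

```{code-cell} python
try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    # exc.message is the same dictionary serialized with json.dumps above,
    # so its sections can be inspected programmatically as well as printed
    print(list(exc.message.keys()))
```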
83 | 84 | You can also inspect the failure cases in a more granular form: 85 | 86 | ```{code-cell} python 87 | try: 88 | schema.validate(df, lazy=True) 89 | except pa.errors.SchemaErrors as exc: 90 | print("Schema errors and failure cases:") 91 | print(exc.failure_cases) 92 | print("\nDataFrame object that failed validation:") 93 | print(exc.data) 94 | ``` 95 | -------------------------------------------------------------------------------- /docs/source/modin.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (scaling-modin)= 9 | 10 | # Data Validation with Modin 11 | 12 | *new in 0.8.0* 13 | 14 | [Modin](https://modin.readthedocs.io/en/latest/) is a distributed 15 | compute framework that offers a pandas drop-in replacement dataframe 16 | implementation. You can use pandera to validate {py:func}`~modin.pandas.DataFrame` 17 | and {py:func}`~modin.pandas.Series` objects directly. First, install 18 | `pandera` with the `dask` extra: 19 | 20 | ```bash 21 | pip install 'pandera[modin]' # installs both ray and dask backends 22 | pip install 'pandera[modin-ray]' # only ray backend 23 | pip install 'pandera[modin-dask]' # only dask backend 24 | ``` 25 | 26 | Then you can use pandera schemas to validate modin dataframes. In the example 27 | below we'll use the {ref}`class-based API ` to define a 28 | {py:class}`~pandera.api.model.pandas.DataFrameModel` for validation. 29 | 30 | ```python 31 | import modin.pandas as pd 32 | import pandera.pandas as pa 33 | 34 | from pandera.typing.modin import DataFrame, Series 35 | 36 | 37 | class Schema(pa.DataFrameModel): 38 | state: Series[str] 39 | city: Series[str] 40 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20}) 41 | 42 | 43 | # create a modin dataframe that's validated on object initialization 44 | df = DataFrame[Schema]( 45 | { 46 | 'state': ['FL','FL','FL','CA','CA','CA'], 47 | 'city': [ 48 | 'Orlando', 49 | 'Miami', 50 | 'Tampa', 51 | 'San Francisco', 52 | 'Los Angeles', 53 | 'San Diego', 54 | ], 55 | 'price': [8, 12, 10, 16, 20, 18], 56 | } 57 | ) 58 | print(df) 59 | ``` 60 | 61 | ``` 62 | state city price 63 | 0 FL Orlando 8 64 | 1 FL Miami 12 65 | 2 FL Tampa 10 66 | 3 CA San Francisco 16 67 | 4 CA Los Angeles 20 68 | 5 CA San Diego 18 69 | ``` 70 | 71 | You can also use the {py:func}`~pandera.check_types` decorator to validate 72 | modin dataframes at runtime: 73 | 74 | ```python 75 | @pa.check_types 76 | def function(df: DataFrame[Schema]) -> DataFrame[Schema]: 77 | return df[df["state"] == "CA"] 78 | 79 | function(df) 80 | ``` 81 | 82 | ``` 83 | state city price 84 | 3 CA San Francisco 16 85 | 4 CA Los Angeles 20 86 | 5 CA San Diego 18 87 | ``` 88 | 89 | And of course, you can use the object-based API to validate modin dataframes: 90 | 91 | ```python 92 | schema = pa.DataFrameSchema({ 93 | "state": pa.Column(str), 94 | "city": pa.Column(str), 95 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20)) 96 | }) 97 | schema(df) 98 | ``` 99 | 100 | ``` 101 | state city price 102 | 0 FL Orlando 8 103 | 1 FL Miami 12 104 | 2 FL Tampa 10 105 | 3 CA San Francisco 16 106 | 4 CA Los Angeles 20 107 | 5 CA San Diego 18 108 | ``` 109 | -------------------------------------------------------------------------------- /docs/source/pyspark.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 
| ``` 7 | 8 | (scaling-pyspark)= 9 | 10 | # Data Validation with Pyspark Pandas 11 | 12 | *new in 0.10.0* 13 | 14 | [Pyspark](https://spark.apache.org/docs/3.2.0/api/python/index.html) is a 15 | distributed compute framework that offers a pandas drop-in replacement dataframe 16 | implementation via the [pyspark.pandas API](https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/index.html) . 17 | You can use pandera to validate {py:func}`~pyspark.pandas.DataFrame` 18 | and {py:func}`~pyspark.pandas.Series` objects directly. First, install 19 | `pandera` with the `pyspark` extra: 20 | 21 | ```bash 22 | pip install 'pandera[pyspark]' 23 | ``` 24 | 25 | Then you can use pandera schemas to validate pyspark dataframes. In the example 26 | below we'll use the {ref}`class-based API ` to define a 27 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation. 28 | 29 | ```{code-cell} python 30 | import pyspark.pandas as ps 31 | import pandas as pd 32 | import pandera.pandas as pa 33 | 34 | from pandera.typing.pyspark import DataFrame, Series 35 | 36 | 37 | class Schema(pa.DataFrameModel): 38 | state: Series[str] 39 | city: Series[str] 40 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20}) 41 | 42 | 43 | # create a pyspark.pandas dataframe that's validated on object initialization 44 | df = DataFrame[Schema]( 45 | { 46 | 'state': ['FL','FL','FL','CA','CA','CA'], 47 | 'city': [ 48 | 'Orlando', 49 | 'Miami', 50 | 'Tampa', 51 | 'San Francisco', 52 | 'Los Angeles', 53 | 'San Diego', 54 | ], 55 | 'price': [8, 12, 10, 16, 20, 18], 56 | } 57 | ) 58 | print(df) 59 | ``` 60 | 61 | You can also use the {py:func}`~pandera.check_types` decorator to validate 62 | pyspark pandas dataframes at runtime: 63 | 64 | ```{code-cell} python 65 | @pa.check_types 66 | def function(df: DataFrame[Schema]) -> DataFrame[Schema]: 67 | return df[df["state"] == "CA"] 68 | 69 | print(function(df)) 70 | ``` 71 | 72 | And of course, you can use the object-based API to validate dask dataframes: 73 | 74 | ```{code-cell} python 75 | schema = pa.DataFrameSchema({ 76 | "state": pa.Column(str), 77 | "city": pa.Column(str), 78 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20)) 79 | }) 80 | schema(df) 81 | ``` 82 | -------------------------------------------------------------------------------- /docs/source/reference/core.rst: -------------------------------------------------------------------------------- 1 | .. _api-core: 2 | 3 | Core 4 | ==== 5 | 6 | Schemas 7 | ------- 8 | 9 | .. autosummary:: 10 | :toctree: generated 11 | :template: class.rst 12 | :nosignatures: 13 | 14 | pandera.api.pandas.container.DataFrameSchema 15 | pandera.api.pandas.array.SeriesSchema 16 | pandera.api.polars.container.DataFrameSchema 17 | pandera.api.pyspark.container.DataFrameSchema 18 | pandera.api.dataframe.container.DataFrameSchema 19 | 20 | Schema Components 21 | ----------------- 22 | 23 | .. autosummary:: 24 | :toctree: generated 25 | :template: class.rst 26 | :nosignatures: 27 | 28 | pandera.api.pandas.components.Column 29 | pandera.api.pandas.components.Index 30 | pandera.api.pandas.components.MultiIndex 31 | pandera.api.polars.components.Column 32 | pandera.api.pyspark.components.Column 33 | pandera.api.dataframe.components.ComponentSchema 34 | 35 | Checks 36 | ------ 37 | 38 | .. 
autosummary:: 39 | :toctree: generated 40 | :template: class.rst 41 | :nosignatures: 42 | 43 | pandera.api.checks.Check 44 | pandera.api.hypotheses.Hypothesis 45 | 46 | Data Objects 47 | ------------ 48 | 49 | .. autosummary:: 50 | :toctree: generated 51 | :template: class.rst 52 | :nosignatures: 53 | 54 | pandera.api.polars.types.PolarsData 55 | pandera.api.pyspark.types.PysparkDataframeColumnObject 56 | 57 | Configuration 58 | ------------- 59 | 60 | .. autosummary:: 61 | :toctree: generated 62 | :template: class.rst 63 | :nosignatures: 64 | 65 | pandera.config.PanderaConfig 66 | pandera.config.ValidationDepth 67 | pandera.config.ValidationScope 68 | pandera.config.config_context 69 | pandera.config.get_config_context 70 | -------------------------------------------------------------------------------- /docs/source/reference/dataframe_models.rst: -------------------------------------------------------------------------------- 1 | .. _api-dataframe-models: 2 | 3 | DataFrame Models 4 | ================ 5 | 6 | DataFrame Model 7 | --------------- 8 | 9 | .. autosummary:: 10 | :toctree: generated 11 | :template: class.rst 12 | 13 | pandera.api.pandas.model.DataFrameModel 14 | pandera.api.polars.model.DataFrameModel 15 | pandera.api.pyspark.model.DataFrameModel 16 | pandera.api.dataframe.model.DataFrameModel 17 | 18 | Model Components 19 | ---------------- 20 | 21 | .. autosummary:: 22 | :toctree: generated 23 | 24 | pandera.api.dataframe.model_components.Field 25 | pandera.api.dataframe.model_components.check 26 | pandera.api.dataframe.model_components.dataframe_check 27 | pandera.api.dataframe.model_components.parser 28 | pandera.api.dataframe.model_components.dataframe_parser 29 | 30 | 31 | Config 32 | ------ 33 | 34 | .. autosummary:: 35 | :toctree: generated 36 | :template: model_component_class.rst 37 | :nosignatures: 38 | 39 | pandera.api.pandas.model_config.BaseConfig 40 | pandera.api.polars.model_config.BaseConfig 41 | pandera.api.pyspark.model_config.BaseConfig 42 | 43 | 44 | Typing 45 | ------ 46 | 47 | Pandas 48 | ****** 49 | 50 | .. autosummary:: 51 | :toctree: generated 52 | :template: class.rst 53 | 54 | pandera.typing.DataFrame 55 | pandera.typing.Series 56 | pandera.typing.Index 57 | 58 | Geopandas 59 | ********* 60 | 61 | .. autosummary:: 62 | :toctree: generated 63 | :template: class.rst 64 | 65 | pandera.typing.geopandas.GeoDataFrame 66 | pandera.typing.geopandas.GeoSeries 67 | 68 | Dask 69 | **** 70 | 71 | .. autosummary:: 72 | :toctree: generated 73 | :template: class.rst 74 | 75 | pandera.typing.dask.DataFrame 76 | pandera.typing.dask.Series 77 | pandera.typing.dask.Index 78 | 79 | Pyspark 80 | ******* 81 | 82 | .. autosummary:: 83 | :toctree: generated 84 | :template: class.rst 85 | 86 | pandera.typing.pyspark.DataFrame 87 | pandera.typing.pyspark.Series 88 | pandera.typing.pyspark.Index 89 | 90 | Modin 91 | ***** 92 | 93 | .. autosummary:: 94 | :toctree: generated 95 | :template: class.rst 96 | 97 | pandera.typing.modin.DataFrame 98 | pandera.typing.modin.Series 99 | pandera.typing.modin.Index 100 | 101 | FastAPI 102 | ******* 103 | 104 | .. autosummary:: 105 | :toctree: generated 106 | :template: class.rst 107 | 108 | pandera.typing.fastapi.UploadFile 109 | 110 | 111 | Serialization Formats 112 | ********************* 113 | 114 | .. 
autosummary:: 115 | :toctree: generated 116 | :template: class.rst 117 | 118 | pandera.typing.formats.Formats 119 | -------------------------------------------------------------------------------- /docs/source/reference/decorators.rst: -------------------------------------------------------------------------------- 1 | .. _api-decorators: 2 | 3 | Decorators 4 | ========== 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | pandera.decorators.check_input 11 | pandera.decorators.check_output 12 | pandera.decorators.check_io 13 | pandera.decorators.check_types 14 | -------------------------------------------------------------------------------- /docs/source/reference/errors.rst: -------------------------------------------------------------------------------- 1 | .. _api-errors: 2 | 3 | Errors 4 | ====== 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :template: class.rst 9 | :nosignatures: 10 | 11 | pandera.errors.SchemaError 12 | pandera.errors.SchemaErrors 13 | pandera.errors.SchemaInitError 14 | pandera.errors.SchemaDefinitionError 15 | -------------------------------------------------------------------------------- /docs/source/reference/extensions.rst: -------------------------------------------------------------------------------- 1 | .. _api-extensions: 2 | 3 | Extensions 4 | ========== 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :template: module.rst 9 | :nosignatures: 10 | 11 | pandera.extensions 12 | -------------------------------------------------------------------------------- /docs/source/reference/index.md: -------------------------------------------------------------------------------- 1 | % pandera package index documentation toctree 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: pandera 5 | ``` 6 | 7 | # API 8 | 9 | ```{eval-rst} 10 | .. list-table:: 11 | :widths: 30 70 12 | 13 | * - :ref:`Core ` 14 | - The core objects for defining pandera schemas 15 | * - :ref:`Data Types ` 16 | - Data types for type checking and coercion. 17 | * - :ref:`DataFrame Models ` 18 | - Alternative class-based API for defining types for tabular/array-like data. 19 | * - :ref:`Decorators ` 20 | - Decorators for integrating pandera schemas with python functions. 21 | * - :ref:`Schema Inference ` 22 | - Bootstrap schemas from real data 23 | * - :ref:`IO Utilities ` 24 | - Utility functions for reading/writing schemas 25 | * - :ref:`Data Synthesis Strategies ` 26 | - Module of functions for generating data from schemas. 27 | * - :ref:`Extensions ` 28 | - Utility functions for extending pandera functionality 29 | * - :ref:`Errors ` 30 | - Pandera-specific exceptions 31 | ``` 32 | 33 | ```{toctree} 34 | :hidden: true 35 | 36 | core 37 | dtypes 38 | dataframe_models 39 | decorators 40 | schema_inference 41 | io 42 | strategies 43 | extensions 44 | errors 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/source/reference/io.rst: -------------------------------------------------------------------------------- 1 | .. _api-io-utils: 2 | 3 | IO Utilities 4 | ============ 5 | 6 | The ``io`` module and built-in ``Hypothesis`` checks require a pandera 7 | installation with the corresponding extension, see the 8 | :ref:`installation` instructions for more details. 9 | 10 | .. 
autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | 14 | pandera.io.from_yaml 15 | pandera.io.to_yaml 16 | pandera.io.to_script 17 | -------------------------------------------------------------------------------- /docs/source/reference/schema_inference.rst: -------------------------------------------------------------------------------- 1 | .. _api-schema-inference: 2 | 3 | Schema Inference 4 | ================ 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | pandera.schema_inference.pandas.infer_schema 11 | -------------------------------------------------------------------------------- /docs/source/reference/strategies.rst: -------------------------------------------------------------------------------- 1 | .. _api-strategies: 2 | 3 | Data Synthesis Strategies 4 | ========================= 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :template: strategies_module.rst 9 | :nosignatures: 10 | 11 | pandera.strategies.pandas_strategies 12 | -------------------------------------------------------------------------------- /docs/source/series_schemas.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | % pandera documentation for seriesschemas 6 | 7 | ```{currentmodule} pandera 8 | ``` 9 | 10 | (seriesschemas)= 11 | 12 | # Series Schemas 13 | 14 | The {class}`~pandera.api.pandas.array.SeriesSchema` class allows for the validation of pandas 15 | `Series` objects, and are very similar to {ref}`columns` and 16 | {ref}`indexes` described in {ref}`DataFrameSchemas`. 17 | 18 | ```{code-cell} python 19 | import pandas as pd 20 | import pandera.pandas as pa 21 | 22 | schema = pa.SeriesSchema( 23 | str, 24 | checks=[ 25 | pa.Check(lambda s: s.str.startswith("foo")), 26 | pa.Check(lambda s: s.str.endswith("bar")), 27 | pa.Check(lambda x: len(x) > 3, element_wise=True) 28 | ], 29 | nullable=False, 30 | unique=False, 31 | name="my_series") 32 | 33 | validated_series = schema.validate( 34 | pd.Series(["foobar", "foobar", "foobar"], name="my_series") 35 | ) 36 | 37 | validated_series 38 | ``` 39 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pandera-dev 2 | channels: 3 | - conda-forge 4 | 5 | dependencies: 6 | # environment management 7 | - pip 8 | 9 | # pandera dependencies 10 | - packaging >= 20.0 11 | - typing_extensions 12 | - hypothesis >= 6.92.7 13 | - pyyaml >= 5.1 14 | - typing_inspect >= 0.6.0 15 | - frictionless <= 4.40.8 # v5.* introduces breaking changes 16 | - pyarrow 17 | - pydantic 18 | 19 | # hypotheses extra 20 | - scipy 21 | 22 | # mypy extra 23 | - pandas-stubs 24 | 25 | # pyspark extra 26 | - pyspark[connect] >= 3.2.0, < 4.0.0 27 | 28 | # polars extra 29 | - polars >= 0.20.0 30 | 31 | # modin extra 32 | - modin 33 | - protobuf 34 | 35 | # geopandas extra 36 | - geopandas 37 | - shapely 38 | 39 | # fastapi extra 40 | - fastapi 41 | 42 | # testing and dependencies 43 | - black >= 24.0 44 | 45 | # testing 46 | - numpy >= 1.24.4 47 | - pandas >= 2.1.1 48 | - isort >= 5.7.0 49 | - joblib 50 | - mypy = 1.10.0 51 | - pylint < 3.3 52 | - pytest 53 | - pytest-cov 54 | - pytest-xdist 55 | - pytest-asyncio 56 | - pytz 57 | - xdoctest 58 | - nox 59 | - uv 60 | - setuptools # required in noxfile and not automatically provided by python >= 3.12 61 | 62 | # fastapi testing 63 | - uvicorn 64 | - python-multipart 65 | 66 | # 
documentation 67 | - sphinx 68 | - sphinx-design 69 | - sphinx-autodoc-typehints <= 1.14.1 70 | - sphinx-copybutton 71 | - recommonmark 72 | - myst-nb 73 | 74 | # packaging 75 | - twine 76 | 77 | # performance testing 78 | - asv >= 0.5.1 79 | 80 | # optional 81 | - pre_commit 82 | 83 | - pip: 84 | # dask extra 85 | - dask[dataframe] 86 | - distributed 87 | 88 | # docs 89 | - furo 90 | - sphinx-docsearch 91 | - grpcio 92 | - ray 93 | - typeguard 94 | - types-click 95 | - types-pytz 96 | - types-pyyaml 97 | - types-requests 98 | - types-setuptools 99 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | disable_error_code =annotation-unchecked 3 | ignore_missing_imports = True 4 | follow_imports = normal 5 | allow_redefinition = True 6 | warn_return_any = False 7 | warn_unused_configs = True 8 | show_error_codes = True 9 | exclude=(?x)( 10 | ^tests/mypy/pandas_modules 11 | | ^pandera/engines/pyspark_engine 12 | | ^pandera/api/pyspark 13 | | ^pandera/backends/pyspark 14 | | ^tests/pyspark 15 | ) 16 | [mypy-pandera.api.pyspark.*] 17 | follow_imports = skip 18 | 19 | [mypy-docs.*] 20 | follow_imports = skip 21 | -------------------------------------------------------------------------------- /pandera/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position 2 | """A flexible and expressive dataframe validation library.""" 3 | 4 | from pandera._version import __version__ 5 | 6 | 7 | _warning_msg = """Pandas and numpy have been removed from the base pandera 8 | dependencies. Please install pandas as part of your environment's 9 | dependencies or install the pandas extra with: 10 | 11 | ```bash 12 | pip install pandas pandera 13 | 14 | # or 15 | pip install 'pandera[pandas]' 16 | ``` 17 | """ 18 | 19 | 20 | try: 21 | # Only add pandas to the top-level pandera namespace 22 | # if pandas and numpy are installed 23 | import pandas as pd 24 | import numpy as np 25 | 26 | from pandera._pandas_deprecated import * 27 | from pandera._pandas_deprecated import __all__ as _pandas_deprecated_all 28 | from pandera import dtypes 29 | from pandera import typing 30 | 31 | __all__ = [ 32 | "__version__", 33 | *_pandas_deprecated_all, 34 | ] 35 | 36 | except ImportError as err: 37 | import warnings 38 | 39 | if "pandas" in str(err) or "numpy" in str(err): 40 | warnings.warn(_warning_msg, UserWarning) 41 | else: 42 | raise # Re-raise any other `ImportError` exceptions 43 | 44 | from pandera import dtypes 45 | from pandera import typing 46 | from pandera.api.checks import Check 47 | from pandera.api.dataframe.model_components import ( 48 | Field, 49 | check, 50 | dataframe_check, 51 | dataframe_parser, 52 | parser, 53 | ) 54 | 55 | __all__ = [ 56 | "__version__", 57 | "Check", 58 | "Field", 59 | "check", 60 | "dataframe_check", 61 | "dataframe_parser", 62 | "parser", 63 | "dtypes", 64 | "typing", 65 | ] 66 | -------------------------------------------------------------------------------- /pandera/_patch_numpy2.py: -------------------------------------------------------------------------------- 1 | """Patch numpy 2 to prevent errors.""" 2 | 3 | from functools import lru_cache 4 | 5 | 6 | @lru_cache 7 | def _patch_numpy2(): 8 | """This is a temporary fix for numpy 2. 9 | 10 | pyspark uses np.NaN, which is deprecated in numpy 2. 
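    The patch below restores the removed aliases (``NaN``, ``string_``, ``float_``,
    ``unicode_``) by mapping them to their numpy 2 equivalents, and it only does so
    when numpy exposes the ``_expired_attrs_2_0`` machinery.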
11 | """ 12 | import numpy as np 13 | 14 | expired_attrs = getattr(np, "_expired_attrs_2_0", None) 15 | 16 | if expired_attrs: 17 | attrs_replacement = { 18 | "NaN": np.nan, 19 | "string_": np.bytes_, 20 | "float_": np.float64, 21 | "unicode_": np.str_, 22 | } 23 | for attr, replacement in attrs_replacement.items(): 24 | has_attr = expired_attrs.__expired_attributes__.pop(attr, None) 25 | if has_attr: 26 | setattr(np, attr, replacement) 27 | -------------------------------------------------------------------------------- /pandera/accessors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/accessors/__init__.py -------------------------------------------------------------------------------- /pandera/accessors/dask_accessor.py: -------------------------------------------------------------------------------- 1 | """Register dask accessor for pandera schema metadata.""" 2 | 3 | from dask.dataframe.extensions import ( 4 | register_dataframe_accessor, 5 | register_series_accessor, 6 | ) 7 | 8 | from pandera.accessors.pandas_accessor import ( 9 | PanderaDataFrameAccessor, 10 | PanderaSeriesAccessor, 11 | ) 12 | 13 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor) 14 | register_series_accessor("pandera")(PanderaSeriesAccessor) 15 | -------------------------------------------------------------------------------- /pandera/accessors/modin_accessor.py: -------------------------------------------------------------------------------- 1 | """Custom accessor functionality for modin. 2 | 3 | Source code adapted from pyspark.pandas implementation: 4 | https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/api/pyspark.pandas.extensions.register_dataframe_accessor.html?highlight=register_dataframe_accessor#pyspark.pandas.extensions.register_dataframe_accessor 5 | """ 6 | 7 | import warnings 8 | 9 | from pandera.accessors.pandas_accessor import ( 10 | PanderaDataFrameAccessor, 11 | PanderaSeriesAccessor, 12 | ) 13 | 14 | 15 | # pylint: disable=too-few-public-methods 16 | class CachedAccessor: 17 | """ 18 | Custom property-like object. 19 | 20 | A descriptor for caching accessors: 21 | 22 | :param name: Namespace that accessor's methods, properties, etc will be 23 | accessed under, e.g. "foo" for a dataframe accessor yields the accessor 24 | ``df.foo`` 25 | :param cls: Class with the extension methods. 26 | 27 | For accessor, the class's __init__ method assumes that you are registering 28 | an accessor for one of ``Series``, ``DataFrame``, or ``Index``. 29 | """ 30 | 31 | def __init__(self, name, accessor): 32 | self._name = name 33 | self._accessor = accessor 34 | 35 | def __get__(self, obj, cls): 36 | if obj is None: # pragma: no cover 37 | return self._accessor 38 | accessor_obj = self._accessor(obj) 39 | object.__setattr__(obj, self._name, accessor_obj) 40 | return accessor_obj 41 | 42 | 43 | def _register_accessor(name, cls): 44 | """ 45 | Register a custom accessor on {class} objects. 46 | 47 | :param name: Name under which the accessor should be registered. A warning 48 | is issued if this name conflicts with a preexisting attribute. 49 | :returns: A class decorator callable. 50 | """ 51 | 52 | def decorator(accessor): 53 | if hasattr(cls, name): 54 | msg = ( 55 | f"registration of accessor {accessor} under name '{name}' for " 56 | "type {cls.__name__} is overriding a preexisting attribute " 57 | "with the same name." 
58 | ) 59 | 60 | warnings.warn( 61 | msg, 62 | UserWarning, 63 | stacklevel=2, 64 | ) 65 | setattr(cls, name, CachedAccessor(name, accessor)) 66 | return accessor 67 | 68 | return decorator 69 | 70 | 71 | def register_dataframe_accessor(name): 72 | """ 73 | Register a custom accessor with a DataFrame 74 | 75 | :param name: name used when calling the accessor after its registered 76 | :returns: a class decorator callable. 77 | """ 78 | # pylint: disable=import-outside-toplevel 79 | from modin.pandas import DataFrame 80 | 81 | return _register_accessor(name, DataFrame) 82 | 83 | 84 | def register_series_accessor(name): 85 | """ 86 | Register a custom accessor with a Series object 87 | 88 | :param name: name used when calling the accessor after its registered 89 | :returns: a callable class decorator 90 | """ 91 | # pylint: disable=import-outside-toplevel 92 | from modin.pandas import Series 93 | 94 | return _register_accessor(name, Series) 95 | 96 | 97 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor) 98 | register_series_accessor("pandera")(PanderaSeriesAccessor) 99 | -------------------------------------------------------------------------------- /pandera/accessors/pandas_accessor.py: -------------------------------------------------------------------------------- 1 | """Register pandas accessor for pandera schema metadata.""" 2 | 3 | from typing import Optional, Union 4 | 5 | import pandas as pd 6 | 7 | from pandera.api.pandas.array import SeriesSchema 8 | from pandera.api.pandas.container import DataFrameSchema 9 | 10 | Schemas = Union[DataFrameSchema, SeriesSchema] 11 | 12 | 13 | class PanderaAccessor: 14 | """Pandera accessor for pandas object.""" 15 | 16 | def __init__(self, pandas_obj): 17 | """Initialize the pandera accessor.""" 18 | self._pandas_obj = pandas_obj 19 | self._schema: Optional[Schemas] = None 20 | 21 | @staticmethod 22 | def check_schema_type(schema: Schemas): 23 | """Abstract method for checking the schema type.""" 24 | raise NotImplementedError 25 | 26 | def add_schema(self, schema): 27 | """Add a schema to the pandas object.""" 28 | self.check_schema_type(schema) 29 | self._schema = schema 30 | return self._pandas_obj 31 | 32 | @property 33 | def schema(self) -> Optional[Schemas]: 34 | """Access schema metadata.""" 35 | return self._schema 36 | 37 | 38 | @pd.api.extensions.register_dataframe_accessor("pandera") 39 | class PanderaDataFrameAccessor(PanderaAccessor): 40 | """Pandera accessor for pandas DataFrame.""" 41 | 42 | @staticmethod 43 | def check_schema_type(schema): 44 | if not isinstance(schema, DataFrameSchema): 45 | raise TypeError( 46 | f"schema arg must be a {DataFrameSchema}, found {type(schema)}" 47 | ) 48 | 49 | 50 | @pd.api.extensions.register_series_accessor("pandera") 51 | class PanderaSeriesAccessor(PanderaAccessor): 52 | """Pandera accessor for pandas Series.""" 53 | 54 | @staticmethod 55 | def check_schema_type(schema): 56 | if not isinstance(schema, SeriesSchema): 57 | raise TypeError( 58 | f"schema arg must be a {SeriesSchema}, found {type(schema)}" 59 | ) 60 | -------------------------------------------------------------------------------- /pandera/accessors/polars_accessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/accessors/polars_accessor.py -------------------------------------------------------------------------------- /pandera/accessors/pyspark_accessor.py: 
-------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | # NOTE: skip file since py=3.10 yields these errors: 3 | # https://github.com/pandera-dev/pandera/runs/4998710717?check_suite_focus=true 4 | """Register pyspark accessor for pandera schema metadata.""" 5 | 6 | from pyspark.pandas.extensions import ( 7 | register_dataframe_accessor, 8 | register_series_accessor, 9 | ) 10 | 11 | from pandera.accessors.pandas_accessor import ( 12 | PanderaDataFrameAccessor, 13 | PanderaSeriesAccessor, 14 | ) 15 | 16 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor) 17 | register_series_accessor("pandera")(PanderaSeriesAccessor) 18 | -------------------------------------------------------------------------------- /pandera/api/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandera api package. 2 | 3 | This package contains the public-facing api schema specifications for all 4 | supported data objects. 5 | """ 6 | -------------------------------------------------------------------------------- /pandera/api/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/base/__init__.py -------------------------------------------------------------------------------- /pandera/api/base/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration.""" 2 | 3 | from typing import Any, Optional 4 | 5 | 6 | class BaseModelConfig: # pylint:disable=R0903 7 | """Model configuration base class.""" 8 | 9 | #: datatype of the data container. This overrides the data types specified 10 | #: in any of the fields. 
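    #: Backend-specific subclasses narrow this annotation to their own dtype
    #: input types (e.g. ``PandasDtypeInputTypes``, ``PolarsDtypeInputTypes``,
    #: ``PySparkDtypeInputTypes``).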
11 | dtype: Optional[Any] = None 12 | 13 | name: Optional[str] = None #: name of schema 14 | title: Optional[str] = None #: human-readable label for schema 15 | description: Optional[str] = None #: arbitrary textual description 16 | coerce: bool = False #: coerce types of all schema components 17 | -------------------------------------------------------------------------------- /pandera/api/base/parsers.py: -------------------------------------------------------------------------------- 1 | """Data validation base parse.""" 2 | 3 | import inspect 4 | from typing import Any, Dict, NamedTuple, Optional, Tuple, Type 5 | 6 | from pandera.backends.base import BaseParserBackend 7 | 8 | 9 | class ParserResult(NamedTuple): 10 | """Parser result for user-defined parsers.""" 11 | 12 | parser_output: Any 13 | parsed_object: Any 14 | 15 | 16 | class MetaParser(type): 17 | """Parser metaclass.""" 18 | 19 | BACKEND_REGISTRY: Dict[Tuple[Type, Type], Type[BaseParserBackend]] = {} 20 | """Registry of parser backends implemented for specific data objects.""" 21 | 22 | 23 | class BaseParser(metaclass=MetaParser): 24 | """Parser base class.""" 25 | 26 | def __init__(self, name: Optional[str] = None): 27 | self.name = name 28 | 29 | @classmethod 30 | def register_backend(cls, type_: Type, backend: Type[BaseParserBackend]): 31 | """Register a backend for the specified type.""" 32 | cls.BACKEND_REGISTRY[(cls, type_)] = backend 33 | 34 | @classmethod 35 | def get_backend(cls, parse_obj: Any) -> Type[BaseParserBackend]: 36 | """Get the backend associated with the type of ``parse_obj`` .""" 37 | 38 | parse_obj_cls = type(parse_obj) 39 | classes = inspect.getmro(parse_obj_cls) 40 | for _class in classes: 41 | try: 42 | return cls.BACKEND_REGISTRY[(cls, _class)] 43 | except KeyError: 44 | pass 45 | raise KeyError( 46 | f"Backend not found for class: {parse_obj_cls}. 
Looked up the " 47 | f"following base classes: {classes}" 48 | ) 49 | 50 | def __eq__(self, other: object) -> bool: 51 | if not isinstance(other, type(self)): 52 | return NotImplemented 53 | 54 | are_parser_fn_objects_equal = ( 55 | self._get_parser_fn_code() == other._get_parser_fn_code() 56 | ) 57 | 58 | are_all_other_parser_attributes_equal = { 59 | k: v for k, v in self.__dict__.items() if k != "_parser_fn" 60 | } == {k: v for k, v in other.__dict__.items() if k != "_parser_fn"} 61 | 62 | return ( 63 | are_parser_fn_objects_equal 64 | and are_all_other_parser_attributes_equal 65 | ) 66 | 67 | def _get_parser_fn_code(self): 68 | parser_fn = self.__dict__["_parser_fn"] 69 | code = parser_fn.__code__.co_code 70 | 71 | return code 72 | 73 | def __repr__(self) -> str: 74 | return f"" 75 | -------------------------------------------------------------------------------- /pandera/api/base/types.py: -------------------------------------------------------------------------------- 1 | """Base type definitions for pandera.""" 2 | 3 | from typing import List, Union 4 | 5 | from pandera.api.checks import Check 6 | from pandera.api.hypotheses import Hypothesis 7 | from pandera.api.parsers import Parser 8 | 9 | try: 10 | # python 3.8+ 11 | from typing import Literal # type: ignore[attr-defined] 12 | except ImportError: # pragma: no cover 13 | from typing_extensions import Literal # type: ignore[assignment] 14 | 15 | 16 | StrictType = Union[bool, Literal["filter"]] 17 | CheckList = Union[Check, List[Union[Check, Hypothesis]]] 18 | ParserList = Union[Parser, List[Parser]] 19 | -------------------------------------------------------------------------------- /pandera/api/dataframe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/dataframe/__init__.py -------------------------------------------------------------------------------- /pandera/api/function_dispatch.py: -------------------------------------------------------------------------------- 1 | """Multidispatcher implementation.""" 2 | 3 | from inspect import signature 4 | from typing import Callable, Dict, Tuple, Type, Union 5 | import typing_inspect 6 | 7 | 8 | class Dispatcher: 9 | """Dispatch implementation.""" 10 | 11 | def __init__(self): 12 | self._function_registry: Dict[Type, Callable] = {} 13 | self._name = None 14 | 15 | def register(self, fn): 16 | # Get function signature 17 | self._name = fn.__name__ 18 | data_types = get_first_arg_type(fn) 19 | for data_type in data_types: 20 | self._function_registry[data_type] = fn 21 | 22 | def __call__(self, *args, **kwargs): 23 | input_data_type = type(args[0]) 24 | fn = self._function_registry[input_data_type] 25 | return fn(*args, **kwargs) 26 | 27 | @property 28 | def co_code(self): 29 | """Method for getting bytecode of all the registered functions.""" 30 | _code = b"" 31 | for fn in self._function_registry.values(): 32 | _code += fn.__code__.co_code 33 | return _code 34 | 35 | @property 36 | def __name__(self): 37 | return f"{self._name}" 38 | 39 | def __str__(self): 40 | return f"{self._name}" 41 | 42 | def __repr__(self): 43 | return f"{self._name}" 44 | 45 | 46 | def get_first_arg_type(fn): 47 | fn_sig = signature(fn) 48 | 49 | # register the check strategy for this particular check, identified 50 | # by the check `name`, and the data type of the check function. This 51 | # supports Union types. 
Also assume that the data type of the data 52 | # object to validate is the first argument. 53 | data_type = [*fn_sig.parameters.values()][0].annotation 54 | 55 | if typing_inspect.get_origin(data_type) in (tuple, Tuple): 56 | data_type, *_ = typing_inspect.get_args(data_type) 57 | 58 | if typing_inspect.get_origin(data_type) is Union: 59 | data_types = typing_inspect.get_args(data_type) 60 | else: 61 | data_types = (data_type,) 62 | 63 | return data_types 64 | -------------------------------------------------------------------------------- /pandera/api/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandas core.""" 2 | -------------------------------------------------------------------------------- /pandera/api/pandas/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration for pandas.""" 2 | 3 | from typing import Optional 4 | 5 | from pandera.api.dataframe.model_config import BaseConfig as _BaseConfig 6 | from pandera.api.pandas.types import PandasDtypeInputTypes 7 | 8 | 9 | class BaseConfig(_BaseConfig): # pylint:disable=R0903 10 | """Define pandas DataFrameSchema-wide options.""" 11 | 12 | #: datatype of the dataframe. This overrides the data types specified in 13 | #: any of the fields. 14 | dtype: Optional[PandasDtypeInputTypes] = None 15 | -------------------------------------------------------------------------------- /pandera/api/polars/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/polars/__init__.py -------------------------------------------------------------------------------- /pandera/api/polars/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration for pandas.""" 2 | 3 | from typing import Optional 4 | 5 | from pandera.api.dataframe.model_config import BaseConfig as _BaseConfig 6 | from pandera.api.polars.types import PolarsDtypeInputTypes 7 | 8 | 9 | class BaseConfig(_BaseConfig): # pylint:disable=R0903 10 | """Define polars DataFrameSchema-wide options.""" 11 | 12 | #: datatype of the dataframe. This overrides the data types specified in 13 | #: any of the fields. 
14 | dtype: Optional[PolarsDtypeInputTypes] = None 15 | -------------------------------------------------------------------------------- /pandera/api/polars/types.py: -------------------------------------------------------------------------------- 1 | """Polars types.""" 2 | 3 | from typing import NamedTuple, Union, TypeVar 4 | 5 | import polars as pl 6 | 7 | 8 | class PolarsData(NamedTuple): 9 | lazyframe: pl.LazyFrame 10 | key: str = "*" 11 | 12 | 13 | class CheckResult(NamedTuple): 14 | """Check result for user-defined checks.""" 15 | 16 | check_output: pl.LazyFrame 17 | check_passed: pl.LazyFrame 18 | checked_object: pl.LazyFrame 19 | failure_cases: pl.LazyFrame 20 | 21 | 22 | PolarsCheckObjects = Union[pl.LazyFrame, pl.DataFrame] 23 | PolarsFrame = TypeVar("PolarsFrame", pl.LazyFrame, pl.DataFrame) 24 | 25 | PolarsDtypeInputTypes = Union[ 26 | str, 27 | type, 28 | pl.datatypes.classes.DataTypeClass, 29 | ] 30 | -------------------------------------------------------------------------------- /pandera/api/polars/utils.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=cyclic-import 2 | """Polars validation engine utilities.""" 3 | 4 | from typing import Dict, List 5 | 6 | import polars as pl 7 | 8 | from pandera.api.polars.types import PolarsCheckObjects 9 | from pandera.engines.polars_engine import polars_version 10 | from pandera.config import ( 11 | ValidationDepth, 12 | get_config_context, 13 | get_config_global, 14 | ) 15 | 16 | 17 | def get_lazyframe_schema(lf: pl.LazyFrame) -> Dict[str, pl.DataType]: 18 | """Get a dict of column names and dtypes from a polars LazyFrame.""" 19 | if polars_version().release >= (1, 0, 0): 20 | return lf.collect_schema() 21 | return lf.schema 22 | 23 | 24 | def get_lazyframe_column_dtypes(lf: pl.LazyFrame) -> List[pl.DataType]: 25 | """Get a list of column dtypes from a polars LazyFrame.""" 26 | if polars_version().release >= (1, 0, 0): 27 | return lf.collect_schema().dtypes() 28 | return [*lf.schema.values()] 29 | 30 | 31 | def get_lazyframe_column_names(lf: pl.LazyFrame) -> List[str]: 32 | """Get a list of column names from a polars LazyFrame.""" 33 | if polars_version().release >= (1, 0, 0): 34 | return lf.collect_schema().names() 35 | return lf.columns 36 | 37 | 38 | def get_validation_depth(check_obj: PolarsCheckObjects) -> ValidationDepth: 39 | """Get validation depth for a given polars check object.""" 40 | is_dataframe = isinstance(check_obj, pl.DataFrame) 41 | 42 | config_global = get_config_global() 43 | config_ctx = get_config_context(validation_depth_default=None) 44 | 45 | if config_ctx.validation_depth is not None: 46 | # use context configuration if specified 47 | return config_ctx.validation_depth 48 | 49 | if config_global.validation_depth is not None: 50 | # use global configuration if specified 51 | return config_global.validation_depth 52 | 53 | if ( 54 | isinstance(check_obj, pl.LazyFrame) 55 | and config_global.validation_depth is None 56 | ): 57 | # if global validation depth is not set, use schema only validation 58 | # when validating LazyFrames 59 | validation_depth = ValidationDepth.SCHEMA_ONLY 60 | elif is_dataframe and ( 61 | config_ctx.validation_depth is None 62 | or config_ctx.validation_depth is None 63 | ): 64 | # if context validation depth is not set, use schema and data validation 65 | # when validating DataFrames 66 | validation_depth = ValidationDepth.SCHEMA_AND_DATA 67 | else: 68 | validation_depth = ValidationDepth.SCHEMA_ONLY 69 | 70 | return 
validation_depth 71 | -------------------------------------------------------------------------------- /pandera/api/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | """PySpark native core.""" 2 | 3 | from pandera.api.pyspark.components import Column 4 | from pandera.api.pyspark.container import DataFrameSchema 5 | -------------------------------------------------------------------------------- /pandera/api/pyspark/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration for pyspark.""" 2 | 3 | from typing import Any, Callable, Dict, List, Optional, Union 4 | 5 | from pandera.api.base.model_config import BaseModelConfig 6 | from pandera.api.base.types import StrictType 7 | from pandera.api.pyspark.types import PySparkDtypeInputTypes 8 | from pandera.typing.formats import Format 9 | 10 | 11 | class BaseConfig(BaseModelConfig): # pylint:disable=R0903 12 | """Define DataFrameSchema-wide options. 13 | 14 | *new in 0.16.0* 15 | """ 16 | 17 | #: datatype of the dataframe. This overrides the data types specified in 18 | #: any of the fields. 19 | dtype: Optional[PySparkDtypeInputTypes] = None 20 | 21 | name: Optional[str] = None #: name of schema 22 | title: Optional[str] = None #: human-readable label for schema 23 | description: Optional[str] = None #: arbitrary textual description 24 | coerce: bool = False #: coerce types of all schema components 25 | 26 | #: make sure certain column combinations are unique 27 | unique: Optional[Union[str, List[str]]] = None 28 | 29 | #: make sure all specified columns are in the validated dataframe - 30 | #: if ``"filter"``, removes columns not specified in the schema 31 | strict: StrictType = False 32 | 33 | ordered: bool = False #: validate columns order 34 | 35 | #: make sure dataframe column names are unique 36 | unique_column_names: bool = False 37 | 38 | #: data format before validation. This option only applies to 39 | #: schemas used in the context of the pandera type constructor 40 | #: ``pa.typing.DataFrame[Schema](data)``. If None, assumes a data structure 41 | #: compatible with the ``pyspark.sql.DataFrame`` constructor. 42 | from_format: Optional[Union[Format, Callable]] = None 43 | 44 | #: a dictionary keyword arguments to pass into the reader function that 45 | #: converts the object of type ``from_format`` to a pandera-validate-able 46 | #: data structure. The reader function is implemented in the pandera.typing 47 | #: generic types via the ``from_format`` and ``to_format`` methods. 48 | from_format_kwargs: Optional[Dict[str, Any]] = None 49 | 50 | #: data format to serialize into after validation. This option only applies 51 | #: to schemas used in the context of the pandera type constructor 52 | #: ``pa.typing.DataFrame[Schema](data)``. If None, returns a dataframe. 53 | to_format: Optional[Union[Format, Callable]] = None 54 | 55 | #: Buffer to be provided when to_format is a custom callable. See docs for 56 | #: example of how to implement an example of a to format function. 57 | to_format_buffer: Optional[Union[str, Callable]] = None 58 | 59 | #: a dictionary keyword arguments to pass into the writer function that 60 | #: converts the pandera-validate-able object to type ``to_format``. 61 | #: The writer function is implemented in the pandera.typing 62 | #: generic types via the ``from_format`` and ``to_format`` methods. 
63 | to_format_kwargs: Optional[Dict[str, Any]] = None 64 | 65 | #: a dictionary object to store key-value data at schema level 66 | metadata: Optional[dict] = None 67 | -------------------------------------------------------------------------------- /pandera/api/pyspark/types.py: -------------------------------------------------------------------------------- 1 | """Utility functions for pyspark validation.""" 2 | 3 | from functools import lru_cache 4 | from typing import List, NamedTuple, Tuple, Type, Union 5 | from numpy import bool_ as np_bool 6 | from packaging import version 7 | 8 | import pyspark.sql.types as pst 9 | from pyspark.sql import DataFrame 10 | 11 | import pyspark 12 | from pandera.api.checks import Check 13 | from pandera.dtypes import DataType 14 | 15 | # pylint: disable=reimported 16 | # Handles optional Spark Connect imports for pyspark>=3.4 (if available) 17 | if version.parse(pyspark.__version__) >= version.parse("3.4"): 18 | from pyspark.sql.connect.dataframe import DataFrame as psc_DataFrame 19 | from pyspark.sql.connect.group import GroupedData 20 | else: 21 | from pyspark.sql import ( 22 | DataFrame as psc_DataFrame, 23 | ) 24 | from pyspark.sql.group import GroupedData 25 | 26 | DataFrameTypes = Union[DataFrame, psc_DataFrame] 27 | GroupbyObject = GroupedData 28 | 29 | CheckList = Union[Check, List[Check]] 30 | 31 | PysparkDefaultTypes = Union[ 32 | pst.BooleanType, 33 | pst.StringType, 34 | pst.IntegerType, 35 | pst.DecimalType, 36 | pst.FloatType, 37 | pst.DateType, 38 | pst.TimestampType, 39 | pst.DoubleType, 40 | pst.ShortType, 41 | pst.ByteType, 42 | pst.LongType, 43 | pst.BinaryType, 44 | ] 45 | 46 | PySparkDtypeInputTypes = Union[ 47 | str, 48 | int, 49 | float, 50 | bool, 51 | type, 52 | DataType, 53 | Type, 54 | pst.BooleanType, 55 | pst.StringType, 56 | pst.IntegerType, 57 | pst.DecimalType, 58 | pst.FloatType, 59 | pst.DateType, 60 | pst.TimestampType, 61 | pst.DoubleType, 62 | pst.ShortType, 63 | pst.ByteType, 64 | pst.LongType, 65 | pst.BinaryType, 66 | ] 67 | 68 | 69 | class SupportedTypes(NamedTuple): 70 | table_types: Tuple[type, ...] 71 | 72 | 73 | class PysparkDataframeColumnObject(NamedTuple): 74 | """Pyspark Object which holds dataframe and column value in a named tuble""" 75 | 76 | dataframe: DataFrameTypes 77 | column_name: str 78 | 79 | 80 | @lru_cache(maxsize=None) 81 | def supported_types() -> SupportedTypes: 82 | """Get the types supported by pandera schemas.""" 83 | # pylint: disable=import-outside-toplevel 84 | table_types = [DataFrame] 85 | 86 | try: 87 | table_types.append(DataFrame) 88 | table_types.append(psc_DataFrame) 89 | 90 | except ImportError: # pragma: no cover 91 | pass 92 | 93 | return SupportedTypes( 94 | tuple(table_types), 95 | ) 96 | 97 | 98 | def is_table(obj): 99 | """Verifies whether an object is table-like. 100 | 101 | Where a table is a 2-dimensional data matrix of rows and columns, which 102 | can be indexed in multiple different ways. 
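    For this pyspark backend, "table-like" currently means a native
    ``pyspark.sql.DataFrame`` or, when Spark Connect is available, a
    ``pyspark.sql.connect.dataframe.DataFrame`` (see ``supported_types``).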
103 | """ 104 | return isinstance(obj, supported_types().table_types) 105 | 106 | 107 | def is_bool(x): 108 | """Verifies whether an object is a boolean type.""" 109 | return isinstance(x, (bool, type(pst.BooleanType()), np_bool)) 110 | -------------------------------------------------------------------------------- /pandera/backends/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandera backends.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/base/builtin_checks.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring 2 | """Built-in check functions base implementation. 3 | 4 | This module contains check function abstract definitions that correspond to 5 | the pandera.api.base.checks.Check methods. These functions do not actually 6 | implement any validation logic and serve as the entrypoint for dispatching 7 | specific implementations based on the data object type, e.g. 8 | `pandas.DataFrame`s. 9 | """ 10 | 11 | import re 12 | from typing import Any, Iterable, Optional, TypeVar, Union 13 | 14 | from pandera.api.checks import Check 15 | 16 | T = TypeVar("T") 17 | 18 | 19 | @Check.register_builtin_check_fn 20 | def equal_to(data: Any, value: Any) -> Any: 21 | raise NotImplementedError 22 | 23 | 24 | @Check.register_builtin_check_fn 25 | def not_equal_to(data: Any, value: Any) -> Any: 26 | raise NotImplementedError 27 | 28 | 29 | @Check.register_builtin_check_fn 30 | def greater_than(data: Any, min_value: Any) -> Any: 31 | raise NotImplementedError 32 | 33 | 34 | @Check.register_builtin_check_fn 35 | def greater_than_or_equal_to(data: Any, min_value: Any) -> Any: 36 | raise NotImplementedError 37 | 38 | 39 | @Check.register_builtin_check_fn 40 | def less_than(data: Any, max_value: Any) -> Any: 41 | raise NotImplementedError 42 | 43 | 44 | @Check.register_builtin_check_fn 45 | def less_than_or_equal_to(data: Any, max_value: Any) -> Any: 46 | raise NotImplementedError 47 | 48 | 49 | @Check.register_builtin_check_fn 50 | def in_range( 51 | data: Any, 52 | min_value: T, 53 | max_value: T, 54 | include_min: bool = True, 55 | include_max: bool = True, 56 | ) -> Any: 57 | raise NotImplementedError 58 | 59 | 60 | @Check.register_builtin_check_fn 61 | def isin(data: Any, allowed_values: Iterable) -> Any: 62 | raise NotImplementedError 63 | 64 | 65 | @Check.register_builtin_check_fn 66 | def notin(data: Any, forbidden_values: Iterable) -> Any: 67 | raise NotImplementedError 68 | 69 | 70 | @Check.register_builtin_check_fn 71 | def str_matches(data: Any, pattern: Union[str, re.Pattern]) -> Any: 72 | raise NotImplementedError 73 | 74 | 75 | @Check.register_builtin_check_fn 76 | def str_contains(data: Any, pattern: Union[str, re.Pattern]) -> Any: 77 | raise NotImplementedError 78 | 79 | 80 | @Check.register_builtin_check_fn 81 | def str_startswith(data: Any, string: str) -> Any: 82 | raise NotImplementedError 83 | 84 | 85 | @Check.register_builtin_check_fn 86 | def str_endswith(data: Any, string: str) -> Any: 87 | raise NotImplementedError 88 | 89 | 90 | @Check.register_builtin_check_fn 91 | def str_length( 92 | data: Any, 93 | min_value: Optional[int] = None, 94 | max_value: Optional[int] = None, 95 | ) -> Any: 96 | raise NotImplementedError 97 | 98 | 99 | @Check.register_builtin_check_fn 100 | def unique_values_eq(data: Any, values: Iterable) -> Any: 101 | raise NotImplementedError 102 | 
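# Note: the stubs in this module only declare the built-in check signatures.
# Concrete backends supply the actual implementations for their data
# containers (e.g. ``pandera/backends/pandas/builtin_checks.py`` for pandas
# objects), analogous to how ``backends/pandas/builtin_hypotheses.py``
# registers hypothesis implementations with ``register_builtin_hypothesis``.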
-------------------------------------------------------------------------------- /pandera/backends/base/builtin_hypotheses.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring 2 | """Built-in hypothesis functions base implementation. 3 | 4 | This module contains hypothesis function abstract definitions that 5 | correspond to the pandera.api.base.checks.Check methods. These functions do not 6 | actually implement any validation logic and serve as the entrypoint for 7 | dispatching specific implementations based on the data object type, e.g. 8 | `pandas.DataFrame`s. 9 | """ 10 | 11 | from typing import Any, Tuple 12 | 13 | from pandera.api.hypotheses import Hypothesis 14 | 15 | 16 | @Hypothesis.register_builtin_check_fn 17 | def two_sample_ttest( 18 | *samples: Tuple[Any, ...], 19 | equal_var: bool = True, 20 | nan_policy: str = "propagate", 21 | ): 22 | raise NotImplementedError 23 | 24 | 25 | @Hypothesis.register_builtin_check_fn 26 | def one_sample_ttest( 27 | *samples: Tuple[Any, ...], 28 | popmean: float, 29 | nan_policy: str = "propagate", 30 | ): 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /pandera/backends/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandas backend implementation for schemas and checks.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/pandas/builtin_hypotheses.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring 2 | """Pandas implementation of built-in hypotheses.""" 3 | 4 | from typing import Tuple 5 | 6 | from pandera.api.extensions import register_builtin_hypothesis 7 | from pandera.backends.pandas.builtin_checks import PandasData 8 | 9 | 10 | @register_builtin_hypothesis( 11 | error="failed two sample ttest between '{sample1}' and '{sample2}'", 12 | samples_kwtypes={"sample1": str, "sample2": str}, 13 | ) 14 | def two_sample_ttest( 15 | *samples: Tuple[PandasData, ...], 16 | equal_var: bool = True, 17 | nan_policy: str = "propagate", 18 | ) -> Tuple[float, float]: 19 | from scipy import stats # pylint: disable=import-outside-toplevel 20 | 21 | assert ( 22 | len(samples) == 2 23 | ), "Expected two sample ttest data to contain exactly two samples" 24 | return stats.ttest_ind( 25 | samples[0], 26 | samples[1], 27 | equal_var=equal_var, 28 | nan_policy=nan_policy, 29 | ) 30 | 31 | 32 | @register_builtin_hypothesis( 33 | error="failed one sample ttest for column '{sample}'", 34 | samples_kwtypes={"sample": str}, 35 | ) 36 | def one_sample_ttest( 37 | *samples: Tuple[PandasData, ...], 38 | popmean: float, 39 | nan_policy: str = "propagate", 40 | ) -> Tuple[float, float]: 41 | from scipy import stats # pylint: disable=import-outside-toplevel 42 | 43 | assert ( 44 | len(samples) == 1 45 | ), "Expected one sample ttest data to contain only one sample" 46 | return stats.ttest_1samp( 47 | samples[0], popmean=popmean, nan_policy=nan_policy 48 | ) 49 | -------------------------------------------------------------------------------- /pandera/backends/pandas/parsers.py: -------------------------------------------------------------------------------- 1 | """Parser backend for pandas""" 2 | 3 | from functools import partial 4 | from typing import Dict, Optional, Union 5 | 6 | import pandas as pd 7 | 8 | from 
pandera.api.base.parsers import ParserResult 9 | from pandera.api.pandas.types import is_field, is_table 10 | from pandera.api.parsers import Parser 11 | from pandera.backends.base import BaseParserBackend 12 | 13 | 14 | class PandasParserBackend(BaseParserBackend): 15 | """Parser backend of pandas.""" 16 | 17 | def __init__(self, parser: Parser): 18 | """Initializes a parser backend object.""" 19 | super().__init__(parser) 20 | assert parser._parser_fn is not None, "Parser._parser_fn must be set." 21 | self.parser = parser 22 | self.parser_fn = partial(parser._parser_fn, **parser._parser_kwargs) 23 | 24 | def preprocess( 25 | self, parse_obj, key 26 | ) -> pd.Series: # pylint:disable=unused-argument 27 | """Preprocesses a parser object before applying the parse function.""" 28 | if is_table(parse_obj) and key is not None: 29 | return self.preprocess_table_with_key(parse_obj, key) 30 | elif is_table(parse_obj) and key is None: 31 | return self.preprocess_table(parse_obj) 32 | else: 33 | return parse_obj 34 | 35 | def preprocess_table_with_key( 36 | self, 37 | parse_obj, 38 | key, 39 | ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: 40 | return parse_obj[key] 41 | 42 | def preprocess_table( 43 | self, parse_obj 44 | ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: 45 | return parse_obj 46 | 47 | def apply(self, parse_obj): 48 | """Apply the parse function to a parser object.""" 49 | if is_field(parse_obj): 50 | return self.apply_field(parse_obj) 51 | elif is_table(parse_obj): 52 | return self.apply_table(parse_obj) 53 | else: 54 | raise NotImplementedError 55 | 56 | def apply_field(self, parse_obj): 57 | if self.parser.element_wise: 58 | return parse_obj.map(self.parser_fn) 59 | return self.parser_fn(parse_obj) 60 | 61 | def apply_table(self, parse_obj): 62 | if self.parser.element_wise: 63 | return getattr(parse_obj, "map", parse_obj.applymap)( 64 | self.parser_fn 65 | ) 66 | return self.parser_fn(parse_obj) 67 | 68 | def postprocess( 69 | self, 70 | parse_obj, 71 | parser_output, 72 | ) -> ParserResult: 73 | """Postprocesses the result of applying the parser function.""" 74 | return ParserResult( 75 | parser_output=parser_output, parsed_object=parse_obj 76 | ) 77 | 78 | def __call__( 79 | self, 80 | parse_obj: Union[pd.Series, pd.DataFrame], 81 | key: Optional[str] = None, 82 | ): 83 | parse_obj = self.preprocess(parse_obj, key) 84 | parser_output = self.apply(parse_obj) 85 | return self.postprocess(parse_obj, parser_output) 86 | -------------------------------------------------------------------------------- /pandera/backends/pandas/register.py: -------------------------------------------------------------------------------- 1 | """Register pandas backends.""" 2 | 3 | from functools import lru_cache 4 | from typing import Optional 5 | 6 | from pandera.backends.pandas.array import SeriesSchemaBackend 7 | from pandera.backends.pandas.checks import PandasCheckBackend 8 | from pandera.backends.pandas.components import ( 9 | ColumnBackend, 10 | IndexBackend, 11 | MultiIndexBackend, 12 | ) 13 | from pandera.backends.pandas.container import DataFrameSchemaBackend 14 | from pandera.backends.pandas.hypotheses import PandasHypothesisBackend 15 | from pandera.backends.pandas.parsers import PandasParserBackend 16 | 17 | 18 | @lru_cache 19 | def register_pandas_backends( 20 | check_cls_fqn: Optional[str] = None, 21 | ): # pylint: disable=unused-argument 22 | """Register pandas backends. 23 | 24 | This function is called at schema initialization in the _register_*_backends 25 | method. 
26 | 27 | :param framework_name: name of the framework to register backends for. 28 | Allowable types are "pandas", "dask", "modin", "pyspark", and 29 | "geopandas". 30 | """ 31 | 32 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import 33 | from pandera._patch_numpy2 import _patch_numpy2 34 | 35 | _patch_numpy2() 36 | 37 | from pandera.api.checks import Check 38 | from pandera.api.hypotheses import Hypothesis 39 | from pandera.api.pandas.array import SeriesSchema 40 | from pandera.api.pandas.components import Column, Index, MultiIndex 41 | from pandera.api.pandas.container import DataFrameSchema 42 | from pandera.api.parsers import Parser 43 | from pandera.api.pandas.types import get_backend_types 44 | 45 | # NOTE: This registers the deprecated DataFrameSchema class. Remove this 46 | # once the deprecated class is removed. 47 | from pandera._pandas_deprecated import ( 48 | DataFrameSchema as _DataFrameSchemaDeprecated, 49 | ) 50 | 51 | assert check_cls_fqn is not None, ( 52 | "pandas backend registration requires passing in the fully qualified " 53 | "check class name" 54 | ) 55 | backend_types = get_backend_types(check_cls_fqn) 56 | 57 | from pandera.backends.pandas import builtin_checks, builtin_hypotheses 58 | 59 | for t in backend_types.check_backend_types: 60 | Check.register_backend(t, PandasCheckBackend) 61 | Hypothesis.register_backend(t, PandasHypothesisBackend) 62 | Parser.register_backend(t, PandasParserBackend) 63 | 64 | for t in backend_types.dataframe_datatypes: 65 | DataFrameSchema.register_backend(t, DataFrameSchemaBackend) 66 | _DataFrameSchemaDeprecated.register_backend(t, DataFrameSchemaBackend) 67 | Column.register_backend(t, ColumnBackend) 68 | MultiIndex.register_backend(t, MultiIndexBackend) 69 | Index.register_backend(t, IndexBackend) 70 | 71 | for t in backend_types.series_datatypes: 72 | SeriesSchema.register_backend(t, SeriesSchemaBackend) 73 | Column.register_backend(t, ColumnBackend) 74 | MultiIndex.register_backend(t, MultiIndexBackend) 75 | Index.register_backend(t, IndexBackend) 76 | 77 | for t in backend_types.index_datatypes: 78 | Index.register_backend(t, IndexBackend) 79 | 80 | for t in backend_types.multiindex_datatypes: 81 | MultiIndex.register_backend(t, MultiIndexBackend) 82 | -------------------------------------------------------------------------------- /pandera/backends/polars/__init__.py: -------------------------------------------------------------------------------- 1 | """Polars backend implementation for schemas and checks.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/polars/error_formatters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/backends/polars/error_formatters.py -------------------------------------------------------------------------------- /pandera/backends/polars/register.py: -------------------------------------------------------------------------------- 1 | """Register polars backends.""" 2 | 3 | from functools import lru_cache 4 | from typing import Optional 5 | 6 | import polars as pl 7 | 8 | 9 | @lru_cache 10 | def register_polars_backends( 11 | check_cls_fqn: Optional[str] = None, 12 | ): # pylint: disable=unused-argument 13 | """Register polars backends. 14 | 15 | This function is called at schema initialization in the _register_*_backends 16 | method. 
17 | """ 18 | 19 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import 20 | from pandera.api.checks import Check 21 | from pandera.api.polars.components import Column 22 | from pandera.api.polars.container import DataFrameSchema 23 | from pandera.backends.polars import builtin_checks 24 | from pandera.backends.polars.checks import PolarsCheckBackend 25 | from pandera.backends.polars.components import ColumnBackend 26 | from pandera.backends.polars.container import DataFrameSchemaBackend 27 | 28 | DataFrameSchema.register_backend(pl.LazyFrame, DataFrameSchemaBackend) 29 | DataFrameSchema.register_backend(pl.DataFrame, DataFrameSchemaBackend) 30 | Column.register_backend(pl.LazyFrame, ColumnBackend) 31 | Check.register_backend(pl.LazyFrame, PolarsCheckBackend) 32 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | """PySpark native backend implementation for schemas and checks.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/checks.py: -------------------------------------------------------------------------------- 1 | """Check backend for pyspark.""" 2 | 3 | from functools import partial 4 | from typing import Dict, List, Optional, Union 5 | 6 | from pandera.api.base.checks import CheckResult 7 | from pandera.api.checks import Check 8 | from pandera.api.pyspark.types import ( 9 | PysparkDataframeColumnObject, 10 | is_bool, 11 | is_table, 12 | GroupbyObject, 13 | ) 14 | from pandera.backends.base import BaseCheckBackend 15 | from pandera.api.pyspark.types import DataFrameTypes 16 | 17 | 18 | class PySparkCheckBackend(BaseCheckBackend): 19 | """Check backend for PySpark.""" 20 | 21 | def __init__(self, check: Check): 22 | """Initializes a check backend object.""" 23 | super().__init__(check) 24 | assert check._check_fn is not None, "Check._check_fn must be set." 25 | self.check = check 26 | self.check_fn = partial(check._check_fn, **check._check_kwargs) 27 | 28 | def groupby(self, check_obj: DataFrameTypes): # pragma: no cover 29 | """Implements groupby behavior for check object.""" 30 | assert self.check.groupby is not None, "Check.groupby must be set." 
31 | if isinstance(self.check.groupby, (str, list)): 32 | return check_obj.groupby(self.check.groupby) 33 | return self.check.groupby(check_obj) 34 | 35 | def query(self, check_obj): 36 | """Implements querying behavior to produce subset of check object.""" 37 | raise NotImplementedError 38 | 39 | def aggregate(self, check_obj): 40 | """Implements aggregation behavior for check object.""" 41 | raise NotImplementedError 42 | 43 | @staticmethod 44 | def _format_groupby_input( 45 | groupby_obj: GroupbyObject, 46 | groups: Optional[List[str]], 47 | ) -> Dict[str, DataFrameTypes]: # pragma: no cover 48 | raise NotImplementedError 49 | 50 | def preprocess( 51 | self, 52 | check_obj: DataFrameTypes, 53 | key: str, # type: ignore [valid-type] 54 | ) -> DataFrameTypes: 55 | return check_obj 56 | 57 | def apply( 58 | self, 59 | check_obj: Union[DataFrameTypes, is_table], 60 | column_name: str = None, 61 | kwargs: dict = None, 62 | ): 63 | if column_name and kwargs: 64 | check_obj_and_col_name = PysparkDataframeColumnObject( 65 | check_obj, column_name 66 | ) 67 | return self.check._check_fn(check_obj_and_col_name, **kwargs) 68 | 69 | else: 70 | return self.check_fn(check_obj) # pragma: no cover 71 | 72 | def postprocess( 73 | self, 74 | check_obj: DataFrameTypes, 75 | check_output: is_bool, # type: ignore [valid-type] 76 | ) -> CheckResult: 77 | """Postprocesses the result of applying the check function.""" 78 | return CheckResult( 79 | check_output=check_output, 80 | check_passed=check_output, 81 | checked_object=check_obj, 82 | failure_cases=None, 83 | ) 84 | 85 | def __call__( 86 | self, 87 | check_obj: DataFrameTypes, 88 | key: Optional[str] = None, 89 | ) -> CheckResult: 90 | check_obj = self.preprocess(check_obj, key) 91 | 92 | check_output = self.apply( # pylint:disable=too-many-function-args 93 | check_obj, key, self.check._check_kwargs 94 | ) 95 | 96 | return self.postprocess(check_obj, check_output) 97 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/error_formatters.py: -------------------------------------------------------------------------------- 1 | """Make schema error messages human-friendly.""" 2 | 3 | 4 | def format_generic_error_message( 5 | parent_schema, 6 | check, 7 | ) -> str: 8 | """Construct an error message when a check validator fails. 9 | 10 | :param parent_schema: class of schema being validated. 11 | :param check: check that generated error. 12 | """ 13 | return f"{parent_schema} failed validation " f"{check.error}" 14 | 15 | 16 | def scalar_failure_case(x) -> dict: 17 | """Construct failure case from a scalar value. 18 | 19 | :param x: a scalar value representing failure case. 20 | :returns: Dictionary used for error reporting with ``SchemaErrors``. 
21 | """ 22 | return { 23 | "index": [None], 24 | "failure_case": [x], 25 | } 26 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/register.py: -------------------------------------------------------------------------------- 1 | """Register pyspark backends.""" 2 | 3 | from functools import lru_cache 4 | from typing import Optional 5 | from packaging import version 6 | 7 | import pyspark 8 | import pyspark.sql as ps 9 | 10 | # Handles optional Spark Connect imports for pyspark>=3.4 (if available) 11 | CURRENT_PYSPARK_VERSION = version.parse(pyspark.__version__) 12 | if CURRENT_PYSPARK_VERSION >= version.parse("3.4"): 13 | from pyspark.sql.connect import dataframe as psc 14 | 15 | 16 | @lru_cache 17 | def register_pyspark_backends( 18 | check_cls_fqn: Optional[str] = None, 19 | ): # pylint: disable=unused-argument 20 | """Register pyspark backends. 21 | 22 | This function is called at schema initialization in the _register_*_backends 23 | method. 24 | """ 25 | 26 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import 27 | from pandera._patch_numpy2 import _patch_numpy2 28 | 29 | _patch_numpy2() 30 | 31 | from pandera.api.checks import Check 32 | from pandera.api.pyspark.column_schema import ColumnSchema 33 | from pandera.api.pyspark.components import Column 34 | from pandera.api.pyspark.container import DataFrameSchema 35 | from pandera.backends.pyspark import builtin_checks 36 | from pandera.backends.pyspark.checks import PySparkCheckBackend 37 | from pandera.backends.pyspark.column import ColumnSchemaBackend 38 | from pandera.backends.pyspark.components import ColumnBackend 39 | from pandera.backends.pyspark.container import DataFrameSchemaBackend 40 | 41 | # Register classical DataFrame 42 | Check.register_backend(ps.DataFrame, PySparkCheckBackend) 43 | ColumnSchema.register_backend(ps.DataFrame, ColumnSchemaBackend) 44 | Column.register_backend(ps.DataFrame, ColumnBackend) 45 | DataFrameSchema.register_backend(ps.DataFrame, DataFrameSchemaBackend) 46 | # Register Spark Connect DataFrame, if available 47 | if CURRENT_PYSPARK_VERSION >= version.parse("3.4"): 48 | Check.register_backend(psc.DataFrame, PySparkCheckBackend) 49 | ColumnSchema.register_backend(psc.DataFrame, ColumnSchemaBackend) 50 | Column.register_backend(psc.DataFrame, ColumnBackend) 51 | DataFrameSchema.register_backend(psc.DataFrame, DataFrameSchemaBackend) 52 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/utils.py: -------------------------------------------------------------------------------- 1 | """pyspark backend utilities.""" 2 | 3 | 4 | def convert_to_list(*args): 5 | """Converts arguments to a list""" 6 | converted_list = [] 7 | for arg in args: 8 | if isinstance(arg, list): 9 | converted_list.extend(arg) 10 | else: 11 | converted_list.append(arg) 12 | 13 | return converted_list 14 | -------------------------------------------------------------------------------- /pandera/backends/utils.py: -------------------------------------------------------------------------------- 1 | """Pandas backend utilities.""" 2 | 3 | from typing import Union 4 | 5 | from pandera.dtypes import UniqueSettings 6 | 7 | 8 | def convert_uniquesettings(unique: UniqueSettings) -> Union[bool, str]: 9 | """ 10 | Converts UniqueSettings object to string that can be passed onto pandas .duplicated() call 11 | """ 12 | # Default `keep` argument for pandas .duplicated() function 13 | keep_argument: Union[bool, 
str] 14 | if unique == "exclude_first": 15 | keep_argument = "first" 16 | elif unique == "exclude_last": 17 | keep_argument = "last" 18 | elif unique == "all": 19 | keep_argument = False 20 | else: 21 | raise ValueError( 22 | str(unique) + " is not a recognized report_duplicates value" 23 | ) 24 | return keep_argument 25 | -------------------------------------------------------------------------------- /pandera/constants.py: -------------------------------------------------------------------------------- 1 | """Pandera constants.""" 2 | 3 | CHECK_OUTPUT_KEY = "check_output" 4 | FAILURE_CASE_KEY = "failure_case" 5 | -------------------------------------------------------------------------------- /pandera/engines/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandera type engines.""" 2 | 3 | import pydantic 4 | from packaging import version 5 | 6 | 7 | def pydantic_version(): 8 | """Return the pydantic version.""" 9 | 10 | return version.parse(pydantic.__version__) 11 | 12 | 13 | PYDANTIC_V2 = pydantic_version().release >= (2, 0, 0) 14 | -------------------------------------------------------------------------------- /pandera/engines/type_aliases.py: -------------------------------------------------------------------------------- 1 | """Custom type aliases.""" 2 | 3 | from typing import Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | PandasObject = Union[pd.Series, pd.DataFrame] 10 | PandasExtensionType = pd.core.dtypes.base.ExtensionDtype 11 | PandasDataType = Union[pd.core.dtypes.base.ExtensionDtype, np.dtype, type] 12 | -------------------------------------------------------------------------------- /pandera/extensions.py: -------------------------------------------------------------------------------- 1 | """Extensions module, for backwards compatibility.""" 2 | 3 | # pylint: disable=unused-import 4 | from pandera.api.extensions import ( 5 | CheckType, 6 | register_builtin_check, 7 | register_builtin_hypothesis, 8 | register_check_method, 9 | register_check_statistics, 10 | ) 11 | -------------------------------------------------------------------------------- /pandera/external_config.py: -------------------------------------------------------------------------------- 1 | """Configuration for external packages.""" 2 | 3 | import os 4 | 5 | 6 | def _set_pyspark_environment_variables(): 7 | """Sets environment variables for pyspark.""" 8 | 9 | is_spark_local_ip_dirty = False 10 | is_pyarrow_ignore_timezone_dirty = False 11 | 12 | try: 13 | # try importing pyspark to see if it exists. 
This is important because the 14 | # pandera.typing module defines a Series type that inherits from 15 | # pandas.Series, and pyspark v1+ injects a __getitem__ method to pandas 16 | # Series and DataFrames to support type hinting: 17 | # https://spark.apache.org/docs/3.2.0/api/python/user_guide/pandas_on_spark/typehints.html#type-hinting-with-names 18 | # pylint: disable=unused-import 19 | if os.getenv("SPARK_LOCAL_IP") is None: 20 | is_spark_local_ip_dirty = True 21 | os.environ["SPARK_LOCAL_IP"] = "127.0.0.1" 22 | if os.getenv("PYARROW_IGNORE_TIMEZONE") is None: 23 | is_pyarrow_ignore_timezone_dirty = True 24 | # This can be overridden by the user 25 | os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" 26 | 27 | import pyspark.pandas 28 | except (ImportError, ModuleNotFoundError): 29 | pass 30 | finally: 31 | if is_spark_local_ip_dirty: 32 | os.environ.pop("SPARK_LOCAL_IP") 33 | if is_pyarrow_ignore_timezone_dirty: 34 | os.environ.pop("PYARROW_IGNORE_TIMEZONE") 35 | -------------------------------------------------------------------------------- /pandera/import_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for importing optional dependencies.""" 2 | 3 | from functools import wraps 4 | from typing import Callable, TypeVar, cast 5 | 6 | 7 | F = TypeVar("F", bound=Callable) 8 | 9 | 10 | def strategy_import_error(fn: F) -> F: 11 | """Decorator to generate input error if dependency is missing.""" 12 | 13 | @wraps(fn) 14 | def _wrapper(*args, **kwargs): 15 | 16 | try: 17 | # pylint: disable=unused-import 18 | import hypothesis 19 | except ImportError as exc: 20 | raise ImportError( 21 | 'Strategies for generating data requires "hypothesis" to be \n' 22 | "installed. You can install pandera together with the strategies \n" 23 | "dependencies with:\n" 24 | "pip install pandera[strategies]" 25 | ) from exc 26 | 27 | return fn(*args, **kwargs) 28 | 29 | return cast(F, _wrapper) 30 | -------------------------------------------------------------------------------- /pandera/inspection_utils.py: -------------------------------------------------------------------------------- 1 | """Decorators for integrating pandera into existing data pipelines.""" 2 | 3 | from inspect import ismethod 4 | from typing import Callable 5 | 6 | 7 | def _is_like_classmethod(fn: Callable) -> bool: 8 | """A regular method defined on a metaclass behaves the same way as 9 | a method decorated with @classmethod defined on a regular class. 10 | 11 | This function covers both use cases. 12 | """ 13 | is_method = ismethod(fn) 14 | return is_method and isinstance(fn.__self__, type) # type: ignore[attr-defined] 15 | 16 | 17 | def is_decorated_classmethod(fn: Callable) -> bool: 18 | """Check if fn is a classmethod declared with the @classmethod decorator. 
19 | 20 | Adapted from: 21 | https://stackoverflow.com/questions/19227724/check-if-a-function-uses-classmethod 22 | """ 23 | if not _is_like_classmethod(fn): 24 | return False 25 | bound_to = fn.__self__ # type: ignore[attr-defined] 26 | assert isinstance(bound_to, type) 27 | name = fn.__name__ 28 | for cls in bound_to.__mro__: 29 | descriptor = vars(cls).get(name) 30 | if descriptor is not None: 31 | return isinstance(descriptor, classmethod) 32 | return False 33 | 34 | 35 | def is_classmethod_from_meta(fn: Callable) -> bool: 36 | """Check if fn is a regular method defined on a metaclass 37 | (which behaves like an @classmethod method defined on a regular class).""" 38 | return not is_decorated_classmethod(fn) and _is_like_classmethod(fn) 39 | -------------------------------------------------------------------------------- /pandera/io/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage for serializing/deserializing pandera schemas to other formats.""" 2 | 3 | from pandera.io.pandas_io import ( 4 | _deserialize_check_stats, 5 | _deserialize_component_stats, 6 | _format_checks, 7 | _format_index, 8 | _format_script, 9 | _get_dtype_string_alias, 10 | _serialize_check_stats, 11 | _serialize_component_stats, 12 | _serialize_dataframe_stats, 13 | deserialize_schema, 14 | from_frictionless_schema, 15 | from_json, 16 | from_yaml, 17 | serialize_schema, 18 | to_json, 19 | to_script, 20 | to_yaml, 21 | ) 22 | -------------------------------------------------------------------------------- /pandera/polars.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | """A flexible and expressive polars validation library for Python.""" 3 | 4 | from pandera import errors 5 | from pandera.api.checks import Check 6 | from pandera.api.dataframe.model_components import ( 7 | Field, 8 | check, 9 | dataframe_check, 10 | ) 11 | from pandera.api.polars.components import Column 12 | from pandera.api.polars.container import DataFrameSchema 13 | from pandera.api.polars.model import DataFrameModel 14 | from pandera.api.polars.types import PolarsData 15 | from pandera.backends.polars.register import register_polars_backends 16 | from pandera.decorators import check_input, check_io, check_output, check_types 17 | from pandera.typing import polars as typing 18 | 19 | register_polars_backends() 20 | 21 | 22 | __all__ = [ 23 | "check_input", 24 | "check_io", 25 | "check_output", 26 | "check_types", 27 | "check", 28 | "Check", 29 | "Column", 30 | "dataframe_check", 31 | "DataFrameModel", 32 | "DataFrameSchema", 33 | "errors", 34 | "Field", 35 | "PolarsData", 36 | ] 37 | -------------------------------------------------------------------------------- /pandera/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/py.typed -------------------------------------------------------------------------------- /pandera/pyspark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import,wrong-import-position,shadowed-import,reimported 2 | """A flexible and expressive pyspark validation library.""" 3 | 4 | from pandera._patch_numpy2 import _patch_numpy2 5 | 6 | _patch_numpy2() 7 | 8 | import pandera.backends.pyspark 9 | from pandera import errors, external_config 10 | from pandera.accessors import 
pyspark_sql_accessor 11 | from pandera.api.checks import Check 12 | from pandera.api.pyspark import Column, DataFrameSchema 13 | from pandera.api.pyspark.model import DataFrameModel 14 | from pandera.api.pyspark.model_components import Field, check, dataframe_check 15 | from pandera.decorators import check_input, check_io, check_output, check_types 16 | from pandera.dtypes import ( 17 | Bool, 18 | Category, 19 | Complex, 20 | Complex64, 21 | Complex128, 22 | Complex256, 23 | DataType, 24 | Date, 25 | DateTime, 26 | Decimal, 27 | Float, 28 | Float16, 29 | Float32, 30 | Float64, 31 | Float128, 32 | Int, 33 | Int8, 34 | Int16, 35 | Int32, 36 | Int64, 37 | String, 38 | Timedelta, 39 | Timestamp, 40 | UInt, 41 | UInt8, 42 | UInt16, 43 | UInt32, 44 | UInt64, 45 | ) 46 | from pandera.errors import PysparkSchemaError, SchemaInitError 47 | from pandera.schema_inference.pandas import infer_schema 48 | from pandera.typing import pyspark_sql 49 | from pandera._version import __version__ 50 | from pandera.typing import pyspark_sql as typing 51 | 52 | 53 | external_config._set_pyspark_environment_variables() 54 | 55 | __all__ = [ 56 | # dtypes 57 | "Bool", 58 | "Category", 59 | "Complex", 60 | "Complex64", 61 | "Complex128", 62 | "Complex256", 63 | "DataType", 64 | "DateTime", 65 | "Float", 66 | "Float16", 67 | "Float32", 68 | "Float64", 69 | "Float128", 70 | "Int", 71 | "Int8", 72 | "Int16", 73 | "Int32", 74 | "Int64", 75 | "String", 76 | "Timedelta", 77 | "Timestamp", 78 | "UInt", 79 | "UInt8", 80 | "UInt16", 81 | "UInt32", 82 | "UInt64", 83 | # checks 84 | "Check", 85 | # decorators 86 | "check_input", 87 | "check_io", 88 | "check_output", 89 | "check_types", 90 | # model 91 | "DataFrameModel", 92 | # model_components 93 | "Field", 94 | "check", 95 | "dataframe_check", 96 | # schema_components 97 | "Column", 98 | # schema_inference 99 | "infer_schema", 100 | # schemas 101 | "DataFrameSchema", 102 | # version 103 | "__version__", 104 | ] 105 | -------------------------------------------------------------------------------- /pandera/schema_inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/schema_inference/__init__.py -------------------------------------------------------------------------------- /pandera/schema_inference/pandas.py: -------------------------------------------------------------------------------- 1 | """Module for inferring dataframe/series schema.""" 2 | 3 | from typing import overload 4 | 5 | import pandas as pd 6 | 7 | from pandera.api.pandas.array import SeriesSchema 8 | from pandera.api.pandas.components import Column, Index, MultiIndex 9 | from pandera.api.pandas.container import DataFrameSchema 10 | from pandera.schema_statistics.pandas import ( 11 | infer_dataframe_statistics, 12 | infer_series_statistics, 13 | parse_check_statistics, 14 | ) 15 | 16 | 17 | @overload 18 | def infer_schema( 19 | pandas_obj: pd.Series, 20 | ) -> SeriesSchema: # pragma: no cover 21 | ... 22 | 23 | 24 | @overload 25 | def infer_schema( # type: ignore[misc] 26 | pandas_obj: pd.DataFrame, 27 | ) -> DataFrameSchema: # pragma: no cover 28 | ... 29 | 30 | 31 | def infer_schema(pandas_obj): 32 | """Infer schema for pandas DataFrame or Series object. 33 | 34 | :param pandas_obj: DataFrame or Series object to infer. 35 | :returns: DataFrameSchema or SeriesSchema 36 | :raises: TypeError if pandas_obj is not expected type. 
37 | """ 38 | if isinstance(pandas_obj, pd.DataFrame): 39 | return infer_dataframe_schema(pandas_obj) 40 | elif isinstance(pandas_obj, pd.Series): 41 | return infer_series_schema(pandas_obj) 42 | else: 43 | raise TypeError( 44 | "pandas_obj type not recognized. Expected a pandas DataFrame or " 45 | f"Series, found {type(pandas_obj)}" 46 | ) 47 | 48 | 49 | def _create_index(index_statistics): 50 | index = [ 51 | Index( 52 | properties["dtype"], 53 | checks=parse_check_statistics(properties["checks"]), 54 | nullable=properties["nullable"], 55 | name=properties["name"], 56 | ) 57 | for properties in index_statistics 58 | ] 59 | if len(index) == 1: 60 | index = index[0] # type: ignore 61 | else: 62 | index = MultiIndex(index) # type: ignore 63 | 64 | return index 65 | 66 | 67 | def infer_dataframe_schema(df: pd.DataFrame) -> DataFrameSchema: 68 | """Infer a DataFrameSchema from a pandas DataFrame. 69 | 70 | :param df: DataFrame object to infer. 71 | :returns: DataFrameSchema 72 | """ 73 | df_statistics = infer_dataframe_statistics(df) 74 | schema = DataFrameSchema( 75 | columns={ 76 | colname: Column( 77 | properties["dtype"], 78 | checks=parse_check_statistics(properties["checks"]), 79 | nullable=properties["nullable"], 80 | ) 81 | for colname, properties in df_statistics["columns"].items() 82 | }, 83 | index=_create_index(df_statistics["index"]), 84 | coerce=True, 85 | ) 86 | return schema 87 | 88 | 89 | def infer_series_schema(series) -> SeriesSchema: 90 | """Infer a SeriesSchema from a pandas Series. 91 | 92 | :param series: Series object to infer. 93 | :returns: SeriesSchema 94 | """ 95 | series_statistics = infer_series_statistics(series) 96 | schema = SeriesSchema( 97 | dtype=series_statistics["dtype"], 98 | checks=parse_check_statistics(series_statistics["checks"]), 99 | nullable=series_statistics["nullable"], 100 | name=series_statistics["name"], 101 | coerce=True, 102 | ) 103 | return schema 104 | -------------------------------------------------------------------------------- /pandera/schema_statistics/__init__.py: -------------------------------------------------------------------------------- 1 | """Module to extract schema statistics from schema objects.""" 2 | 3 | from pandera.schema_statistics.pandas import ( 4 | get_dataframe_schema_statistics, 5 | get_index_schema_statistics, 6 | get_series_schema_statistics, 7 | infer_dataframe_statistics, 8 | infer_index_statistics, 9 | infer_series_statistics, 10 | parse_check_statistics, 11 | parse_checks, 12 | ) 13 | -------------------------------------------------------------------------------- /pandera/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | """Data synthesis strategies for pandera, powered by the hypothesis package.""" 3 | 4 | import warnings 5 | 6 | try: 7 | import pandas 8 | from pandera.strategies.pandas_strategies import * 9 | except ImportError: 10 | pass 11 | -------------------------------------------------------------------------------- /pandera/strategies/base_strategies.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | """Base module for `hypothesis`-based strategies for data synthesis.""" 3 | 4 | from functools import wraps 5 | from typing import Callable, Dict, Generic, Tuple, Type, TypeVar, cast 6 | 7 | import pandera.backends.base.builtin_checks 8 | 9 | 10 | F = TypeVar("F", bound=Callable) 11 | 12 | 13 | try: 14 | # pylint:
disable=unused-import 15 | from hypothesis.strategies import SearchStrategy, composite 16 | except ImportError: # pragma: no cover 17 | T = TypeVar("T") 18 | 19 | # pylint: disable=too-few-public-methods 20 | class SearchStrategy(Generic[T]): # type: ignore 21 | """placeholder type.""" 22 | 23 | def composite(fn): # type: ignore 24 | """placeholder composite strategy.""" 25 | return fn 26 | 27 | HAS_HYPOTHESIS = False 28 | else: 29 | HAS_HYPOTHESIS = True 30 | 31 | 32 | # This strategy registry maps (check_name, data_type) -> strategy_function 33 | # For example: ("greater_than", pd.DataFrame) -> () 34 | STRATEGY_DISPATCHER: Dict[Tuple[str, Type], Callable] = {} 35 | 36 | 37 | def strategy_import_error(fn: F) -> F: 38 | """Decorator to generate input error if dependency is missing.""" 39 | 40 | @wraps(fn) 41 | def _wrapper(*args, **kwargs): 42 | if not HAS_HYPOTHESIS: # pragma: no cover 43 | raise ImportError( 44 | 'Strategies for generating data requires "hypothesis" to be \n' 45 | "installed. You can install pandera together with the strategies \n" 46 | "dependencies with:\n" 47 | "pip install pandera[strategies]" 48 | ) 49 | return fn(*args, **kwargs) 50 | 51 | return cast(F, _wrapper) 52 | -------------------------------------------------------------------------------- /pandera/system.py: -------------------------------------------------------------------------------- 1 | """Global variables relating to OS.""" 2 | 3 | import numpy as np 4 | 5 | # Windows and Mac M1 don't support floats of this precision: 6 | # https://github.com/pandera-dev/pandera/issues/623 7 | FLOAT_128_AVAILABLE = hasattr(np, "float128") 8 | -------------------------------------------------------------------------------- /pandera/typing/__init__.py: -------------------------------------------------------------------------------- 1 | """Typing module. 2 | 3 | For backwards compatibility, pandas types are exposed to the top-level scope of 4 | the typing module. 
5 | """ 6 | 7 | from functools import lru_cache 8 | from typing import Set, Type 9 | from pandera.typing.common import AnnotationInfo 10 | 11 | try: 12 | from pandera.typing.pandas import ( 13 | DataFrame, 14 | Index, 15 | Series, 16 | Bool, 17 | Category, 18 | Date, 19 | DateTime, 20 | Decimal, 21 | Float, 22 | Float16, 23 | Float32, 24 | Float64, 25 | Int, 26 | Int8, 27 | Int16, 28 | Int32, 29 | Int64, 30 | Object, 31 | String, 32 | Timedelta, 33 | UInt8, 34 | UInt16, 35 | UInt32, 36 | UInt64, 37 | INT8, 38 | INT16, 39 | INT32, 40 | INT64, 41 | UINT8, 42 | UINT16, 43 | UINT32, 44 | UINT64, 45 | STRING, 46 | ) 47 | except ImportError: 48 | pass 49 | 50 | 51 | @lru_cache 52 | def get_dataframe_types(): 53 | from pandera.typing import ( 54 | dask, 55 | geopandas, 56 | modin, 57 | pyspark, 58 | pyspark_sql, 59 | ) 60 | 61 | dataframe_types: Set[Type] = {DataFrame} 62 | if dask.DASK_INSTALLED: 63 | dataframe_types.update({dask.DataFrame}) 64 | 65 | if modin.MODIN_INSTALLED: 66 | dataframe_types.update({modin.DataFrame}) 67 | 68 | if pyspark.PYSPARK_INSTALLED: 69 | dataframe_types.update({pyspark.DataFrame}) 70 | 71 | if pyspark_sql.PYSPARK_SQL_INSTALLED: 72 | dataframe_types.update({pyspark_sql.DataFrame}) 73 | 74 | if geopandas.GEOPANDAS_INSTALLED: 75 | dataframe_types.update({geopandas.GeoDataFrame}) 76 | 77 | return dataframe_types 78 | 79 | 80 | @lru_cache 81 | def get_series_types(): 82 | from pandera.typing import ( 83 | dask, 84 | geopandas, 85 | modin, 86 | pyspark, 87 | ) 88 | 89 | series_types: Set[Type] = {Series} 90 | if dask.DASK_INSTALLED: 91 | series_types.update({dask.Series}) 92 | 93 | if modin.MODIN_INSTALLED: 94 | series_types.update({modin.Series}) 95 | 96 | if pyspark.PYSPARK_INSTALLED: 97 | series_types.update({pyspark.Series}) 98 | 99 | if geopandas.GEOPANDAS_INSTALLED: 100 | series_types.update({geopandas.GeoSeries}) 101 | 102 | return series_types 103 | 104 | 105 | @lru_cache 106 | def get_index_types(): 107 | from pandera.typing import dask, modin, pyspark 108 | 109 | index_types: Set[Type] = {Index} 110 | if dask.DASK_INSTALLED: 111 | index_types.update({dask.Index}) 112 | 113 | if modin.MODIN_INSTALLED: 114 | index_types.update({modin.Index}) 115 | 116 | if pyspark.PYSPARK_INSTALLED: 117 | index_types.update({pyspark.Index}) # type: ignore [arg-type] 118 | 119 | return index_types 120 | 121 | 122 | __all__ = [ 123 | "AnnotationInfo", 124 | "DataFrame", 125 | "Series", 126 | "Index", 127 | "get_dataframe_types", 128 | "get_index_types", 129 | "get_series_types", 130 | ] 131 | -------------------------------------------------------------------------------- /pandera/typing/dask.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Dask.""" 2 | 3 | from typing import TYPE_CHECKING, Generic, TypeVar 4 | 5 | from pandera.typing.common import DataFrameBase, IndexBase, SeriesBase 6 | from pandera.typing.pandas import DataFrameModel, GenericDtype 7 | 8 | try: 9 | import dask.dataframe as dd 10 | 11 | DASK_INSTALLED = True 12 | except ImportError: 13 | DASK_INSTALLED = False 14 | 15 | 16 | # pylint:disable=invalid-name 17 | if TYPE_CHECKING: 18 | T = TypeVar("T") # pragma: no cover 19 | else: 20 | T = DataFrameModel 21 | 22 | 23 | if DASK_INSTALLED: 24 | # pylint: disable=too-few-public-methods,abstract-method 25 | class DataFrame(DataFrameBase, dd.DataFrame, Generic[T]): 26 | """ 27 | Representation of dask.dataframe.DataFrame, only used for type 28 | annotation. 
29 | 30 | *new in 0.8.0* 31 | """ 32 | 33 | # pylint:disable=too-few-public-methods 34 | class Series(SeriesBase, dd.Series, Generic[GenericDtype]): # type: ignore 35 | """Representation of pandas.Series, only used for type annotation. 36 | 37 | *new in 0.8.0* 38 | """ 39 | 40 | # pylint:disable=too-few-public-methods 41 | class Index(IndexBase, dd.Index, Generic[GenericDtype]): 42 | """Representation of pandas.Index, only used for type annotation. 43 | 44 | *new in 0.8.0* 45 | """ 46 | -------------------------------------------------------------------------------- /pandera/typing/formats.py: -------------------------------------------------------------------------------- 1 | """Serialization formats for dataframes.""" 2 | 3 | from enum import Enum 4 | from typing import Union 5 | 6 | try: 7 | # python 3.8+ 8 | from typing import Literal # type: ignore[attr-defined] 9 | except ImportError: # pragma: no cover 10 | from typing_extensions import Literal # type: ignore[assignment] 11 | 12 | 13 | class Formats(Enum): 14 | """Data container serialization formats. 15 | 16 | The values of this enum specify the valid values taken by the ``to_format`` 17 | and ``from_format`` attributes in 18 | :py:class:`~pandera.typing.config.BaseConfig` when specifying a 19 | :py:class:`~pandera.api.pandas.model.DataFrameModel`. 20 | """ 21 | 22 | # pylint: disable=invalid-name 23 | 24 | #: comma-separated values file 25 | csv = "csv" 26 | 27 | #: python dictionary 28 | dict = "dict" 29 | 30 | #: json file 31 | json = "json" 32 | 33 | #: feather file format. See 34 | #: `here `__ for more 35 | #: details 36 | feather = "feather" 37 | 38 | #: parquet file format. See `here `__ for more 39 | #: details 40 | parquet = "parquet" 41 | 42 | #: python pickle file format 43 | pickle = "pickle" 44 | 45 | #: python json_normalize 46 | json_normalize = "json_normalize" 47 | 48 | 49 | Format = Union[ 50 | Literal[Formats.csv], 51 | Literal[Formats.dict], 52 | Literal[Formats.json], 53 | Literal[Formats.feather], 54 | Literal[Formats.parquet], 55 | Literal[Formats.pickle], 56 | Literal[Formats.json_normalize], 57 | ] 58 | -------------------------------------------------------------------------------- /pandera/typing/modin.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Modin.""" 2 | 3 | from typing import TYPE_CHECKING, Generic, TypeVar 4 | 5 | from packaging import version 6 | 7 | from pandera.typing.common import DataFrameBase, IndexBase, SeriesBase 8 | from pandera.typing.pandas import DataFrameModel, GenericDtype 9 | 10 | try: 11 | import modin 12 | import modin.pandas as mpd 13 | 14 | MODIN_INSTALLED = True 15 | except ImportError: 16 | MODIN_INSTALLED = False 17 | 18 | 19 | def modin_version(): 20 | """Return the modin version.""" 21 | return version.parse(modin.__version__) 22 | 23 | 24 | # pylint:disable=invalid-name 25 | if TYPE_CHECKING: 26 | T = TypeVar("T") # pragma: no cover 27 | else: 28 | T = DataFrameModel 29 | 30 | 31 | if MODIN_INSTALLED: 32 | # pylint: disable=too-few-public-methods 33 | class DataFrame(DataFrameBase, mpd.DataFrame, Generic[T]): 34 | """ 35 | Representation of dask.dataframe.DataFrame, only used for type 36 | annotation. 37 | 38 | *new in 0.8.0* 39 | """ 40 | 41 | # pylint:disable=too-few-public-methods,abstract-method 42 | class Series(SeriesBase, mpd.Series, Generic[GenericDtype]): 43 | """Representation of pandas.Series, only used for type annotation. 
44 | 45 | *new in 0.8.0* 46 | """ 47 | 48 | # pylint:disable=too-few-public-methods,abstract-method 49 | class Index(IndexBase, mpd.Index, Generic[GenericDtype]): 50 | """Representation of pandas.Index, only used for type annotation. 51 | 52 | *new in 0.8.0* 53 | """ 54 | -------------------------------------------------------------------------------- /pandera/typing/pyspark.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Pyspark Pandas.""" 2 | 3 | from typing import TYPE_CHECKING, Generic, TypeVar 4 | 5 | from pandera.typing.common import ( 6 | DataFrameBase, 7 | GenericDtype, 8 | IndexBase, 9 | SeriesBase, 10 | ) 11 | from pandera.typing.pandas import DataFrameModel, _GenericAlias 12 | 13 | try: 14 | import pyspark.pandas as ps 15 | 16 | PYSPARK_INSTALLED = True 17 | except ImportError: # pragma: no cover 18 | PYSPARK_INSTALLED = False 19 | 20 | 21 | # pylint:disable=invalid-name 22 | if TYPE_CHECKING: 23 | T = TypeVar("T") # pragma: no cover 24 | else: 25 | T = DataFrameModel 26 | 27 | 28 | if PYSPARK_INSTALLED: 29 | # pylint: disable=too-few-public-methods,arguments-renamed 30 | class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]): 31 | """ 32 | Representation of dask.dataframe.DataFrame, only used for type 33 | annotation. 34 | 35 | *new in 0.8.0* 36 | """ 37 | 38 | def __class_getitem__(cls, item): 39 | """Define this to override's pyspark.pandas generic type.""" 40 | return _GenericAlias(cls, item) 41 | 42 | # pylint:disable=too-few-public-methods,arguments-renamed 43 | class Series(SeriesBase, ps.Series, Generic[GenericDtype]): # type: ignore [misc] # noqa 44 | """Representation of pandas.Series, only used for type annotation. 45 | 46 | *new in 0.8.0* 47 | """ 48 | 49 | def __class_getitem__(cls, item): 50 | """Define this to override pyspark.pandas generic type""" 51 | return _GenericAlias(cls, item) 52 | 53 | # pylint:disable=too-few-public-methods 54 | class Index(IndexBase, ps.Index, Generic[GenericDtype]): 55 | """Representation of pandas.Index, only used for type annotation. 
56 | 57 | *new in 0.8.0* 58 | """ 59 | -------------------------------------------------------------------------------- /pandera/typing/pyspark_sql.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Pyspark.""" 2 | 3 | from typing import TypeVar, Union 4 | 5 | from pandera.typing.common import DataFrameBase 6 | from pandera.typing.pandas import DataFrameModel, _GenericAlias 7 | 8 | try: 9 | import pyspark.sql as ps 10 | 11 | PYSPARK_SQL_INSTALLED = True 12 | except ImportError: # pragma: no cover 13 | PYSPARK_SQL_INSTALLED = False 14 | 15 | if PYSPARK_SQL_INSTALLED: 16 | from pandera.engines import pyspark_engine 17 | 18 | PysparkString = pyspark_engine.String 19 | PysparkInt = pyspark_engine.Int 20 | PysparkLongInt = pyspark_engine.BigInt 21 | PysparkShortInt = pyspark_engine.ShortInt 22 | PysparkByteInt = pyspark_engine.ByteInt 23 | PysparkDouble = pyspark_engine.Double 24 | PysparkFloat = pyspark_engine.Float 25 | PysparkDecimal = pyspark_engine.Decimal 26 | PysparkDate = pyspark_engine.Date 27 | PysparkTimestamp = pyspark_engine.Timestamp 28 | PysparkBinary = pyspark_engine.Binary 29 | 30 | PysparkDType = TypeVar( # type: ignore 31 | "PysparkDType", 32 | bound=Union[ 33 | PysparkString, # type: ignore 34 | PysparkInt, # type: ignore 35 | PysparkLongInt, # type: ignore 36 | PysparkShortInt, # type: ignore 37 | PysparkByteInt, # type: ignore 38 | PysparkDouble, # type: ignore 39 | PysparkFloat, # type: ignore 40 | PysparkDecimal, # type: ignore 41 | PysparkDate, # type: ignore 42 | PysparkTimestamp, # type: ignore 43 | PysparkBinary, # type: ignore 44 | ], 45 | ) 46 | from typing import TYPE_CHECKING, Generic 47 | 48 | # pylint:disable=invalid-name 49 | if TYPE_CHECKING: 50 | T = TypeVar("T") # pragma: no cover 51 | else: 52 | T = DataFrameModel 53 | 54 | if PYSPARK_SQL_INSTALLED: 55 | # pylint: disable=too-few-public-methods,arguments-renamed 56 | class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]): 57 | """ 58 | Representation of dask.dataframe.DataFrame, only used for type 59 | annotation. 60 | 61 | *new in 0.8.0* 62 | """ 63 | 64 | def __class_getitem__(cls, item): 65 | """Define this to override's pyspark.pandas generic type.""" 66 | return _GenericAlias(cls, item) # pragma: no cover 67 | -------------------------------------------------------------------------------- /pandera/utils.py: -------------------------------------------------------------------------------- 1 | """General utility functions""" 2 | 3 | from typing import Any, Callable, TypeVar 4 | 5 | F = TypeVar("F", bound=Callable) 6 | 7 | 8 | def docstring_substitution(*args: Any, **kwargs: Any) -> Callable[[F], F]: 9 | """Typed wrapper around pandas.util.Substitution.""" 10 | 11 | def decorator(func: F) -> F: 12 | # handle case when pandera is run in optimized mode: 13 | # https://docs.python.org/3/using/cmdline.html#cmdoption-OO 14 | if func.__doc__ is None: 15 | return func 16 | 17 | if args: 18 | _doc = func.__doc__ % tuple(args) # type: ignore[operator] 19 | elif kwargs: 20 | _doc = func.__doc__ % kwargs # type: ignore[operator] 21 | func.__doc__ = _doc # pylint:disable=possibly-used-before-assignment 22 | return func 23 | 24 | return decorator 25 | 26 | 27 | def is_regex(name: str): 28 | """ 29 | Checks whether a string is a regex pattern, as defined as starting with 30 | '^' and ending with '$'. 
31 | """ 32 | return name.startswith("^") and name.endswith("$") 33 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file is auto-generated from environment.yml, do not modify. 2 | # See that file for comments about the need/usage of each dependency. 3 | 4 | pip 5 | packaging >= 20.0 6 | typing_extensions 7 | hypothesis >= 6.92.7 8 | pyyaml >= 5.1 9 | typing_inspect >= 0.6.0 10 | frictionless <= 4.40.8 11 | pyarrow 12 | pydantic 13 | scipy 14 | pandas-stubs 15 | pyspark[connect] >= 3.2.0, < 4.0.0 16 | polars >= 0.20.0 17 | modin 18 | protobuf 19 | geopandas 20 | shapely 21 | fastapi 22 | black >= 24.0 23 | numpy >= 1.24.4 24 | pandas >= 2.1.1 25 | isort >= 5.7.0 26 | joblib 27 | mypy == 1.10.0 28 | pylint < 3.3 29 | pytest 30 | pytest-cov 31 | pytest-xdist 32 | pytest-asyncio 33 | pytz 34 | xdoctest 35 | nox 36 | uv 37 | setuptools 38 | uvicorn 39 | python-multipart 40 | sphinx 41 | sphinx-design 42 | sphinx-autodoc-typehints <= 1.14.1 43 | sphinx-copybutton 44 | recommonmark 45 | myst-nb 46 | twine 47 | asv >= 0.5.1 48 | pre_commit 49 | dask[dataframe] 50 | distributed 51 | furo 52 | sphinx-docsearch 53 | grpcio 54 | ray 55 | typeguard 56 | types-click 57 | types-pytz 58 | types-pyyaml 59 | types-requests 60 | types-setuptools 61 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | float_to_top = true 3 | profile = black 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/__init__.py -------------------------------------------------------------------------------- /tests/base/test_base_schema.py: -------------------------------------------------------------------------------- 1 | """Base schema unit tests.""" 2 | 3 | import pytest 4 | 5 | from pandera.api.base.schema import BaseSchema 6 | from pandera.backends.base import BaseSchemaBackend 7 | 8 | 9 | class MockSchema(BaseSchema): 10 | """Mock schema""" 11 | 12 | 13 | class MockSchemaBackend(BaseSchemaBackend): 14 | """Mock schema backend""" 15 | 16 | 17 | def test_get_backend_error(): 18 | """Raise value error when no arguments are passed.""" 19 | 20 | schema = MockSchema() 21 | with pytest.raises(ValueError): 22 | schema.get_backend() 23 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Pytest configuration.""" 2 | 3 | import os 4 | 5 | try: 6 | # pylint: disable=unused-import 7 | import hypothesis # noqa F401 8 | from hypothesis import settings 9 | except ImportError: 10 | HAS_HYPOTHESIS = False 11 | else: 12 | HAS_HYPOTHESIS = True 13 | 14 | # ignore test files associated with hypothesis strategies 15 | collect_ignore = [] 16 | 17 | if not HAS_HYPOTHESIS: 18 | collect_ignore.append("test_strategies.py") 19 | else: 20 | suppressed_health_checks = [ 21 | 
hypothesis.HealthCheck.data_too_large, 22 | hypothesis.HealthCheck.too_slow, 23 | hypothesis.HealthCheck.filter_too_much, 24 | ] 25 | 26 | settings.register_profile( 27 | "ci", 28 | max_examples=10, 29 | deadline=None, 30 | suppress_health_check=suppressed_health_checks, 31 | ) 32 | settings.register_profile( 33 | "dev", 34 | max_examples=30, 35 | deadline=None, 36 | suppress_health_check=suppressed_health_checks, 37 | ) 38 | settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "dev")) 39 | -------------------------------------------------------------------------------- /tests/dask/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/dask/__init__.py -------------------------------------------------------------------------------- /tests/dask/test_dask_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests for dask_accessor module.""" 2 | 3 | from typing import Union 4 | 5 | import dask.dataframe as dd 6 | import pandas as pd 7 | import pytest 8 | 9 | import pandera.pandas as pa 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "schema1, schema2, data, invalid_data", 14 | [ 15 | [ 16 | pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True), 17 | pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True), 18 | dd.from_pandas(pd.DataFrame({"col": [1, 2, 3]}), npartitions=1), 19 | dd.from_pandas(pd.Series([1, 2, 3]), npartitions=1), 20 | ], 21 | [ 22 | pa.SeriesSchema(int, coerce=True), 23 | pa.SeriesSchema(float, coerce=True), 24 | dd.from_pandas(pd.Series([1, 2, 3]), npartitions=1), 25 | dd.from_pandas(pd.DataFrame({"col": [1, 2, 3]}), npartitions=1), 26 | ], 27 | ], 28 | ) 29 | @pytest.mark.parametrize("inplace", [False, True]) 30 | def test_dataframe_series_add_schema( 31 | schema1: Union[pa.DataFrameSchema, pa.SeriesSchema], 32 | schema2: Union[pa.DataFrameSchema, pa.SeriesSchema], 33 | data: Union[pd.DataFrame, pd.Series], 34 | invalid_data: Union[pd.DataFrame, pd.Series], 35 | inplace: bool, 36 | ) -> None: 37 | """ 38 | Test that pandas object contains schema metadata after pandera validation. 39 | """ 40 | validated_data_1 = schema1(data, inplace=inplace) # type: ignore[arg-type] 41 | if inplace: 42 | assert data.pandera.schema == schema1 43 | else: 44 | assert data.pandera.schema is None 45 | assert validated_data_1.pandera.schema == schema1 46 | 47 | validated_data_2 = schema2(validated_data_1, inplace=inplace) # type: ignore[arg-type] 48 | if inplace: 49 | assert validated_data_1.pandera.schema == schema2 50 | else: 51 | assert validated_data_1.pandera.schema == schema1 52 | assert validated_data_2.pandera.schema == schema2 53 | 54 | with pytest.raises(TypeError): 55 | schema1(invalid_data) # type: ignore[arg-type] 56 | 57 | with pytest.raises(TypeError): 58 | schema2(invalid_data) # type: ignore[arg-type] 59 | -------------------------------------------------------------------------------- /tests/dask/test_dask_not_installed.py: -------------------------------------------------------------------------------- 1 | """Tests behavior when dask is not installed.""" 2 | 3 | import sys 4 | from unittest import mock 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | 10 | def test_dask_not_installed() -> None: 11 | """ 12 | Test that Pandera and its modules can be imported and continue to work 13 | without dask. 
14 | """ 15 | with mock.patch.dict("sys.modules", {"dask": None}): 16 | with pytest.raises(ImportError): 17 | # pylint: disable=import-outside-toplevel,unused-import 18 | import dask.dataframe 19 | 20 | for module in ["pandera", "pandera.accessors.dask_accessor"]: 21 | try: 22 | del sys.modules[module] 23 | except KeyError: 24 | ... 25 | 26 | # pylint: disable=import-outside-toplevel,unused-import 27 | import pandera 28 | 29 | assert "pandera.accessors.dask_accessor" not in sys.modules 30 | 31 | del sys.modules["pandera"] 32 | del sys.modules["pandera.api.pandas.types"] 33 | # pylint: disable=import-outside-toplevel 34 | from pandera.api.pandas.types import is_table 35 | 36 | assert not is_table(pd.Series([1])) 37 | 38 | for module in ["pandera", "pandera.typing"]: 39 | try: 40 | del sys.modules[module] 41 | except KeyError: 42 | ... 43 | 44 | # pylint: disable=import-outside-toplevel 45 | import pandera.typing 46 | 47 | annotation = pandera.typing.DataFrame[int] 48 | assert pandera.typing.AnnotationInfo(annotation).is_generic_df 49 | -------------------------------------------------------------------------------- /tests/fastapi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/fastapi/__init__.py -------------------------------------------------------------------------------- /tests/fastapi/app.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | from fastapi import Body, FastAPI, File 3 | from fastapi.responses import HTMLResponse 4 | 5 | from pandera.typing import DataFrame 6 | from pandera.typing.fastapi import UploadFile 7 | from tests.fastapi.models import ( 8 | Item, 9 | ResponseModel, 10 | Transactions, 11 | TransactionsDictOut, 12 | TransactionsParquet, 13 | ) 14 | 15 | try: 16 | from typing import Annotated # type: ignore[attr-defined] 17 | except ImportError: 18 | from typing_extensions import Annotated # type: ignore[assignment] 19 | 20 | app = FastAPI() 21 | 22 | 23 | @app.post("/items/", response_model=Item) 24 | def create_item(item: Item): 25 | return item 26 | 27 | 28 | @app.post("/transactions/", response_model=DataFrame[TransactionsDictOut]) 29 | def create_transactions( 30 | transactions: Annotated[DataFrame[Transactions], Body()], 31 | ): 32 | output = transactions.assign(name="foo") 33 | ... # do other stuff, e.g. update backend database with transactions 34 | return output 35 | 36 | 37 | @app.post("/file/", response_model=ResponseModel) 38 | def create_upload_file( 39 | file: Annotated[UploadFile[DataFrame[TransactionsParquet]], File()], 40 | ): 41 | return { 42 | "filename": file.filename, 43 | "df": file.data.assign(name="foo"), 44 | } 45 | 46 | 47 | @app.get("/") 48 | def main(): 49 | content = """ 50 | 51 |
52 | 53 | 54 |
55 | 56 | """ 57 | return HTMLResponse(content=content) 58 | -------------------------------------------------------------------------------- /tests/fastapi/models.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | from typing import Optional 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | import pandera.pandas as pa 7 | 8 | 9 | class Transactions(pa.DataFrameModel): 10 | id: pa.typing.Series[int] 11 | cost: pa.typing.Series[float] = pa.Field(ge=0, le=1000) 12 | 13 | class Config: 14 | coerce = True 15 | 16 | 17 | class TransactionsParquet(Transactions): 18 | class Config: 19 | from_format = "parquet" 20 | 21 | 22 | class TransactionsOut(Transactions): 23 | id: pa.typing.Series[int] 24 | cost: pa.typing.Series[float] 25 | name: pa.typing.Series[str] 26 | 27 | 28 | class TransactionsJsonOut(TransactionsOut): 29 | class Config: 30 | to_format = "json" 31 | to_format_kwargs = {"orient": "records"} 32 | 33 | 34 | class TransactionsDictOut(TransactionsOut): 35 | class Config: 36 | to_format = "dict" 37 | to_format_kwargs = {"orient": "records"} 38 | 39 | 40 | class Item(BaseModel): 41 | name: str 42 | value: int = Field(ge=0) 43 | description: Optional[str] = None 44 | 45 | 46 | class ResponseModel(BaseModel): 47 | filename: str 48 | df: pa.typing.DataFrame[TransactionsJsonOut] 49 | -------------------------------------------------------------------------------- /tests/fastapi/test_app.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=redefined-outer-name,unused-argument 2 | """Unit tests for using pandera types in fastapi endpoints.""" 3 | 4 | import io 5 | import subprocess 6 | import time 7 | from copy import deepcopy 8 | 9 | import pandas as pd 10 | import pytest 11 | import requests 12 | from hypothesis import given 13 | 14 | from tests.fastapi.models import Transactions, TransactionsOut 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def app(): 19 | """Transient app server for testing.""" 20 | # pylint: disable=consider-using-with 21 | process = subprocess.Popen( 22 | ["uvicorn", "tests.fastapi.app:app", "--port", "8000"], 23 | stdout=subprocess.PIPE, 24 | ) 25 | _wait_to_exist() 26 | yield process 27 | process.terminate() 28 | 29 | 30 | def _wait_to_exist(): 31 | for _ in range(20): 32 | try: 33 | requests.post("http://127.0.0.1:8000/") 34 | break 35 | except Exception: # pylint: disable=broad-except 36 | time.sleep(3.0) 37 | 38 | 39 | def test_items_endpoint(app): 40 | """Happy path test with pydantic type annotations.""" 41 | data = {"name": "Book", "value": 10, "description": "Hello"} 42 | for _ in range(10): 43 | response = requests.post("http://127.0.0.1:8000/items/", json=data) 44 | if response.status_code != 200: 45 | time.sleep(3.0) 46 | assert response.json() == data 47 | 48 | 49 | def test_transactions_endpoint(app): 50 | """Happy path test with pandera type endpoint type annotation.""" 51 | data = {"id": [1], "cost": [10.99]} 52 | response = requests.post( 53 | "http://127.0.0.1:8000/transactions/", 54 | json=data, 55 | ) 56 | expected_output = deepcopy(data) 57 | expected_output = [{"id": 1, "cost": 10.99, "name": "foo"}] 58 | assert response.json() == expected_output 59 | 60 | 61 | @given(Transactions.strategy(size=10)) 62 | def test_upload_file_endpoint(app, sample): 63 | """ 64 | Test upload file endpoint with Upload[DataFrame[DataFrameModel]] input. 
65 | """ 66 | buf = io.BytesIO() 67 | sample.to_parquet(buf) 68 | buf.seek(0) 69 | 70 | expected_result = pd.read_parquet(buf).assign(name="foo") 71 | buf.seek(0) 72 | 73 | response = requests.post( 74 | "http://127.0.0.1:8000/file/", files={"file": buf} 75 | ) 76 | output = response.json() 77 | assert output["filename"] == "file" 78 | output_df = pd.read_json(output["df"]) 79 | cost_notna = ~output_df["cost"].isna() 80 | pd.testing.assert_frame_equal( 81 | TransactionsOut.validate(output_df[cost_notna]), 82 | TransactionsOut.validate(expected_result[cost_notna]), 83 | ) 84 | -------------------------------------------------------------------------------- /tests/hypotheses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/hypotheses/__init__.py -------------------------------------------------------------------------------- /tests/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/io/__init__.py -------------------------------------------------------------------------------- /tests/modin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/modin/__init__.py -------------------------------------------------------------------------------- /tests/modin/conftest.py: -------------------------------------------------------------------------------- 1 | """Registers fixtures for core""" 2 | 3 | import os 4 | from typing import Generator 5 | 6 | import pytest 7 | from pandera.api.checks import Check 8 | 9 | # pylint: disable=unused-import 10 | ENGINES = os.getenv("CI_MODIN_ENGINES", "").split(",") 11 | if ENGINES == [""]: 12 | ENGINES = ["dask"] 13 | 14 | 15 | @pytest.fixture(scope="function") 16 | def custom_check_teardown() -> Generator[None, None, None]: 17 | """Remove all custom checks after execution of each pytest function.""" 18 | yield 19 | for check_name in list(Check.REGISTERED_CUSTOM_CHECKS): 20 | del Check.REGISTERED_CUSTOM_CHECKS[check_name] 21 | 22 | 23 | @pytest.fixture(scope="session", params=ENGINES, autouse=True) 24 | def setup_modin_engine(request): 25 | """Set up the modin engine. 26 | 27 | Eventually this will also support dask execution backend. 28 | """ 29 | engine = request.param 30 | os.environ["MODIN_ENGINE"] = engine 31 | os.environ["MODIN_STORAGE_FORMAT"] = "pandas" 32 | os.environ["MODIN_MEMORY"] = "100000000" 33 | os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1" 34 | 35 | if engine == "ray": 36 | # pylint: disable=import-outside-toplevel 37 | import ray 38 | 39 | ray.init( 40 | runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}} 41 | ) 42 | yield 43 | ray.shutdown() 44 | 45 | elif engine == "dask": 46 | # pylint: disable=import-outside-toplevel 47 | from distributed import Client 48 | 49 | client = Client() 50 | yield 51 | client.close() 52 | else: 53 | raise ValueError(f"Not supported engine: {engine}") 54 | -------------------------------------------------------------------------------- /tests/modin/test_modin_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests of modin accessor functionality. 
2 | 3 | Since modin doesn't currently support the pandas accessor extension API, 4 | pandera implements it. 5 | """ 6 | 7 | import pytest 8 | 9 | from pandera.accessors import modin_accessor 10 | 11 | 12 | # pylint: disable=too-few-public-methods 13 | class CustomAccessor: 14 | """Mock accessor class""" 15 | 16 | def __init__(self, obj): 17 | self._obj = obj 18 | 19 | 20 | def test_modin_accessor_warning(): 21 | """Test that modin accessor raises warning when name already exists.""" 22 | modin_accessor.register_dataframe_accessor("foo")(CustomAccessor) 23 | with pytest.warns(UserWarning): 24 | modin_accessor.register_dataframe_accessor("foo")(CustomAccessor) 25 | -------------------------------------------------------------------------------- /tests/mypy/config/no_plugin.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | follow_imports = silent 4 | show_error_codes = True 5 | allow_redefinition = True 6 | warn_return_any = False 7 | warn_unused_configs = True 8 | -------------------------------------------------------------------------------- /tests/mypy/config/plugin_mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = pandera.mypy 3 | ignore_missing_imports = True 4 | follow_imports = skip 5 | show_error_codes = True 6 | allow_redefinition = True 7 | warn_return_any = False 8 | warn_unused_configs = True 9 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_concat.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | df = pd.DataFrame([[1]]) 5 | sr = pd.Series([1]) 6 | 7 | 8 | df_concat = pd.concat([df, df]) 9 | sr_concat = pd.concat([sr, sr]) 10 | sr_axis1_concat = pd.concat([sr, sr], axis=1) 11 | 12 | # mypy error without pandera plugin 13 | df_generator_concat: pd.DataFrame = pd.concat(df for _ in range(3)) 14 | 15 | # mypy error without pandera plugin 16 | sr_generator_concat: pd.Series = pd.concat(sr for _ in range(3)) 17 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_dataframe.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | """Unit tests for static type checking of dataframes. 
3 | 4 | This test module uses https://github.com/davidfritzsche/pytest-mypy-testing to 5 | run statically check the functions marked pytest.mark.mypy_testing 6 | """ 7 | 8 | from typing import Optional, cast 9 | 10 | import pandas as pd 11 | 12 | import pandera.pandas as pa 13 | from pandera.typing import DataFrame, Series 14 | 15 | 16 | class Schema(pa.DataFrameModel): 17 | id: Series[int] 18 | name: Series[str] 19 | 20 | 21 | class SchemaOut(pa.DataFrameModel): 22 | age: Series[int] 23 | 24 | 25 | class AnotherSchema(pa.DataFrameModel): 26 | id: Series[int] 27 | first_name: Optional[Series[str]] 28 | 29 | 30 | def fn(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 31 | return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay 32 | 33 | 34 | def fn_pipe_incorrect_type(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 35 | return df.assign(age=30).pipe(DataFrame[AnotherSchema]) # mypy error 36 | # error: Argument 1 to "pipe" of "NDFrame" has incompatible type "Type[DataFrame[Any]]"; # noqa 37 | # expected "Union[Callable[..., DataFrame[SchemaOut]], Tuple[Callable[..., DataFrame[SchemaOut]], str]]" [arg-type] # noqa 38 | 39 | 40 | def fn_assign_copy(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 41 | return df.assign(age=30) # mypy error 42 | # error: Incompatible return value type (got "pandas.core.frame.DataFrame", 43 | # expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value] 44 | 45 | 46 | # Define a few dataframe objects 47 | schema_df = DataFrame[Schema]({"id": [1], "name": ["foo"]}) 48 | pandas_df = pd.DataFrame({"id": [1], "name": ["foo"]}) 49 | another_df = DataFrame[AnotherSchema]({"id": [1], "first_name": ["foo"]}) 50 | 51 | 52 | fn(schema_df) # mypy okay 53 | 54 | fn(pandas_df) # mypy error 55 | # error: Argument 1 to "fn" has incompatible type "pandas.core.frame.DataFrame"; # noqa 56 | # expected "pandera.typing.pandas.DataFrame[Schema]" [arg-type] 57 | 58 | fn(another_df) # mypy error 59 | # error: Argument 1 to "fn" has incompatible type "DataFrame[AnotherSchema]"; 60 | # expected "DataFrame[Schema]" [arg-type] 61 | 62 | 63 | def fn_pipe_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 64 | return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay 65 | 66 | 67 | def fn_cast_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 68 | return cast(DataFrame[SchemaOut], df.assign(age=30)) # mypy okay 69 | 70 | 71 | @pa.check_types 72 | def fn_mutate_inplace(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 73 | out = df.assign(age=30).pipe(DataFrame[SchemaOut]) 74 | out.drop(columns="age", inplace=True) 75 | return out # okay for mypy, pandera raises error 76 | 77 | 78 | @pa.check_types 79 | def fn_assign_and_get_index(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 80 | return df.assign(foo=30).iloc[:3] # mypy error 81 | # error: Incompatible return value type (got "pandas.core.frame.DataFrame", 82 | # expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value] 83 | 84 | 85 | @pa.check_types 86 | def fn_cast_dataframe_invalid(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 87 | return cast( 88 | DataFrame[SchemaOut], df 89 | ) # okay for mypy, pandera raises error 90 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_index.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | df = pd.DataFrame({"a": [1, 2, 3]}) 5 | sr = pd.Series([1, 2, 3]) 6 | idx = pd.Index([1, 2, 3]) 7 | 8 | 
df_index_unique: bool = df.index.is_unique 9 | sr_index_unique: bool = df["a"].index.is_unique 10 | idx_unique: bool = idx.is_unique 11 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_series.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | 5 | def fn(s: pd.Series[str]) -> bool: 6 | return True 7 | 8 | 9 | fn(s=pd.Series([1.0, 1.0, 1.0], dtype=float)) # mypy okay 10 | 11 | series = pd.Series([1.0, 1.0, 1.0], dtype=float) 12 | fn(series) # mypy able to determine `series` type, raises error 13 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_time.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | pd.Timestamp.now() + pd.tseries.offsets.YearEnd(1) 5 | 6 | pd.Timedelta(minutes=2) 7 | pd.Timedelta(2, unit="minutes") 8 | 9 | pd.Timedelta(minutes=2, seconds=30) 10 | pd.Timedelta(2.5, unit="minutes") # mypy error 11 | pd.Timedelta(2, unit="minutes") + pd.Timedelta(30, unit="seconds") 12 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandera_inheritance.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | """With the pandera.mypy plugin, mypy ignores type overrides.""" 3 | 4 | import pandera.pandas as pa 5 | 6 | 7 | class Schema(pa.DataFrameModel): 8 | a: pa.typing.Series[int] 9 | b: pa.typing.Series[str] 10 | c: pa.typing.Series[bool] 11 | 12 | 13 | class Schema2(Schema): 14 | a: pa.typing.Series[str] 15 | b: pa.typing.Series[float] 16 | c: pa.typing.Series[int] 17 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandera_types.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | import pandera.pandas as pa 5 | 6 | 7 | def fn(series: pa.typing.Series[int]) -> None: 8 | pass 9 | 10 | 11 | df = pd.DataFrame({"a": [1, 2, 3]}) 12 | sr = pd.Series([1, 2, 3]) 13 | 14 | fn(sr) 15 | fn(df["a"]) 16 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/python_slice.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | df = pd.DataFrame({"a": [1, 2, 3]}, index=[*"abc"]) 5 | df.loc["a":"c"] 6 | -------------------------------------------------------------------------------- /tests/pandas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/pandas/__init__.py -------------------------------------------------------------------------------- /tests/pandas/checks_fixtures.py: -------------------------------------------------------------------------------- 1 | """Pytest fixtures for testing custom checks.""" 2 | 3 | from typing import Generator 4 | from unittest import mock 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | import pandera.pandas as pa 10 | import pandera.api.extensions as pa_ext 11 | 12 | __all__ = "custom_check_teardown", "extra_registered_checks" 13 | 14 | 15 | @pytest.fixture(scope="function") 16 | def custom_check_teardown() -> 
Generator[None, None, None]: 17 | """Remove all custom checks after execution of each pytest function.""" 18 | yield 19 | for check_name in list(pa.Check.REGISTERED_CUSTOM_CHECKS): 20 | del pa.Check.REGISTERED_CUSTOM_CHECKS[check_name] 21 | 22 | 23 | @pytest.fixture(scope="function") 24 | def extra_registered_checks() -> Generator[None, None, None]: 25 | """temporarily registers custom checks onto the Check class""" 26 | # pylint: disable=unused-variable 27 | with mock.patch( 28 | "pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict 29 | ): 30 | # register custom checks here 31 | @pa_ext.register_check_method() 32 | def no_param_check(_: pd.DataFrame) -> bool: 33 | return True 34 | 35 | @pa_ext.register_check_method() 36 | def no_param_check_ellipsis(_: pd.DataFrame) -> bool: 37 | return True 38 | 39 | @pa_ext.register_check_method() 40 | def raise_an_error_check(_: pd.DataFrame) -> bool: 41 | raise TypeError("Test error in custom check") 42 | 43 | yield 44 | -------------------------------------------------------------------------------- /tests/pandas/conftest.py: -------------------------------------------------------------------------------- 1 | """Registers fixtures for core""" 2 | 3 | # pylint: disable=unused-import 4 | from tests.pandas.checks_fixtures import ( 5 | custom_check_teardown, 6 | extra_registered_checks, 7 | ) 8 | -------------------------------------------------------------------------------- /tests/pandas/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/pandas/modules/__init__.py -------------------------------------------------------------------------------- /tests/pandas/modules/validate_on_init.py: -------------------------------------------------------------------------------- 1 | """Module for unit testing validation on initialization.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandera.pandas as pa 6 | from pandera.typing import DataFrame 7 | 8 | 9 | class ExampleSchema(pa.DataFrameModel): 10 | class Config: 11 | coerce = True 12 | 13 | a: np.int64 14 | 15 | 16 | ExampleDataFrame = DataFrame[ExampleSchema] 17 | validated_dataframe = ExampleDataFrame(pd.DataFrame([], columns=["a"])) 18 | -------------------------------------------------------------------------------- /tests/pandas/test__pandas_deprecated__test_model.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position,wildcard-import,unused-wildcard-import 2 | """Unit tests for the deprecated top-level pandera DataFrameModel class. 3 | 4 | Delete this file once the top-level pandera._pandas_deprecated module is 5 | removed. 
6 | """ 7 | 8 | import pytest 9 | from pandera._pandas_deprecated import DataFrameModel as _DataFrameModel 10 | 11 | 12 | @pytest.fixture(autouse=True) 13 | def monkeypatch_dataframe_model(monkeypatch): 14 | """Monkeypatch DataFrameModel before importing test_schemas""" 15 | monkeypatch.setattr( 16 | "tests.pandas.test_schemas.DataFrameModel", _DataFrameModel 17 | ) 18 | 19 | 20 | from tests.pandas.test_schemas import * 21 | -------------------------------------------------------------------------------- /tests/pandas/test__pandas_deprecated__test_schemas.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position,wildcard-import,unused-wildcard-import 2 | """Unit tests for the deprecated top-level pandera DataFrameSchema class. 3 | 4 | Delete this file once the top-level pandera._pandas_deprecated module is 5 | removed. 6 | """ 7 | 8 | import pytest 9 | from pandera._pandas_deprecated import DataFrameSchema as _DataFrameSchema 10 | 11 | 12 | @pytest.fixture(autouse=True) 13 | def monkeypatch_dataframe_schema(monkeypatch): 14 | """Monkeypatch DataFrameSchema before importing test_schemas""" 15 | monkeypatch.setattr( 16 | "tests.pandas.test_schemas.DataFrameSchema", _DataFrameSchema 17 | ) 18 | 19 | 20 | from tests.pandas.test_schemas import * 21 | -------------------------------------------------------------------------------- /tests/pandas/test_docs_setting_column_widths.py: -------------------------------------------------------------------------------- 1 | """Some of the doctest examples only work if the terminal is the correct width 2 | because of the way __str__/__repr__ works in pandas. This checks that 3 | conditions necessary for the doctests to pass properly exist on the host 4 | system.""" 5 | 6 | import pandas as pd 7 | 8 | from docs.source import conf 9 | 10 | 11 | def test_sphinx_doctest_setting_global_pandas_conditions() -> None: 12 | """Checks that no limit is set on the height/width of the __repr__/__str__ 13 | print of a pd.DataFrame to ensure doctest performs consistently across 14 | different Operating Systems.""" 15 | # pylint: disable=W0122 16 | exec(conf.doctest_global_setup) 17 | 18 | max_cols_after_being_set = pd.options.display.max_columns 19 | max_rows_after_being_set = pd.options.display.max_rows 20 | assert max_cols_after_being_set is None 21 | assert max_rows_after_being_set is None 22 | -------------------------------------------------------------------------------- /tests/pandas/test_engine_utils.py: -------------------------------------------------------------------------------- 1 | """Unit tests for engine module utility functions.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from pandera.engines import utils 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "data_container, data_type, expected_failure_cases", 12 | [ 13 | [pd.Series(list("ab1cd3")), int, [False, False, True] * 2], 14 | [pd.Series(list("12345")), int, [True] * 5], 15 | [pd.Series([1, 2, "foo", "bar"]), float, [True, True, False, False]], 16 | ], 17 | ) 18 | def test_numpy_pandas_coercible( 19 | data_container, data_type, expected_failure_cases 20 | ): 21 | """Test that the correct boolean Series outputs are returned.""" 22 | assert ( 23 | expected_failure_cases 24 | == utils.numpy_pandas_coercible(data_container, data_type).tolist() 25 | ) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | "data_container", 30 | [ 31 | pd.Series([1, 2, 3, 4]), 32 | np.array([1, 2, 3, 4]), 33 | pd.DataFrame({0: [1, 2, 
3, 4]}), 34 | np.array([[1], [2], [3], [4]]), 35 | ], 36 | ) 37 | def test_numpy_pandas_coerce_failure_cases(data_container): 38 | """ 39 | Test that different data container types can be checked for coerce failure 40 | cases. 41 | """ 42 | failure_cases = utils.numpy_pandas_coerce_failure_cases( 43 | data_container, int 44 | ) 45 | assert failure_cases is None 46 | 47 | 48 | @pytest.mark.parametrize( 49 | "invalid_data_container, exception_type", 50 | [ 51 | [1, TypeError], 52 | [5.1, TypeError], 53 | ["foobar", TypeError], 54 | [[1, 2, 3], TypeError], 55 | [{0: 1}, TypeError], 56 | # pylint: disable=too-many-function-args 57 | [np.array([1]).reshape(1, 1, 1), ValueError], 58 | ], 59 | ) 60 | def test_numpy_pandas_coerce_failure_cases_exceptions( 61 | invalid_data_container, exception_type 62 | ): 63 | """ 64 | Test exceptions of trying to get failure cases for invalid input types. 65 | """ 66 | error_msg = { 67 | TypeError: "type of data_container .+ not understood", 68 | ValueError: "only numpy arrays of 1 or 2 dimensions are supported", 69 | }[exception_type] 70 | with pytest.raises(exception_type, match=error_msg): 71 | utils.numpy_pandas_coerce_failure_cases(invalid_data_container, int) 72 | -------------------------------------------------------------------------------- /tests/pandas/test_extension_modules.py: -------------------------------------------------------------------------------- 1 | """Tests for extension module imports.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pandera.api.hypotheses import Hypothesis 7 | 8 | 9 | try: 10 | from scipy import stats # pylint: disable=unused-import 11 | except ImportError: # pragma: no cover 12 | SCIPY_INSTALLED = False 13 | else: 14 | SCIPY_INSTALLED = True 15 | 16 | 17 | def test_hypotheses_module_import() -> None: 18 | """Test that Hypothesis built-in methods raise import error.""" 19 | data = pd.Series([1, 2, 3]) 20 | if not SCIPY_INSTALLED: 21 | for fn, check_args in [ 22 | ( 23 | lambda: Hypothesis.two_sample_ttest("sample1", "sample2"), 24 | pd.DataFrame({"sample1": data, "sample2": data}), 25 | ), 26 | (lambda: Hypothesis.one_sample_ttest(popmean=10), data), 27 | ]: 28 | with pytest.raises(ImportError): 29 | check = fn() 30 | check(check_args) 31 | -------------------------------------------------------------------------------- /tests/pandas/test_model_components.py: -------------------------------------------------------------------------------- 1 | """Tests individual model components.""" 2 | 3 | from typing import Any 4 | 5 | import pytest 6 | 7 | import pandera.pandas as pa 8 | from pandera.engines.pandas_engine import Engine 9 | 10 | 11 | def test_field_to_column() -> None: 12 | """Test that Field outputs the correct column options.""" 13 | for flag in ["nullable", "unique", "coerce", "regex"]: 14 | for value in [True, False]: 15 | col_kwargs = pa.Field(**{flag: value}).column_properties( # type: ignore[arg-type] 16 | pa.DateTime, required=value 17 | ) 18 | col = pa.Column(**col_kwargs) 19 | assert col.dtype == Engine.dtype(pa.DateTime) 20 | assert col.properties[flag] == value 21 | assert col.required == value 22 | 23 | 24 | def test_field_to_index() -> None: 25 | """Test that Field outputs the correct index options.""" 26 | for flag in ["nullable", "unique"]: 27 | for value in [True, False]: 28 | index_kwargs = pa.Field(**{flag: value}).index_properties( # type: ignore[arg-type] 29 | pa.DateTime 30 | ) 31 | index = pa.Index(**index_kwargs) 32 | assert index.dtype == Engine.dtype(pa.DateTime) 33 | assert 
getattr(index, flag) == value 34 | 35 | 36 | def test_field_no_checks() -> None: 37 | """Test Field without checks.""" 38 | assert not pa.Field().column_properties(str)["checks"] 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "arg,value,expected", 43 | [ 44 | ("eq", 9, pa.Check.equal_to(9)), 45 | ("ne", 9, pa.Check.not_equal_to(9)), 46 | ("gt", 9, pa.Check.greater_than(9)), 47 | ("ge", 9, pa.Check.greater_than_or_equal_to(9)), 48 | ("lt", 9, pa.Check.less_than(9)), 49 | ("le", 9, pa.Check.less_than_or_equal_to(9)), 50 | ( 51 | "in_range", 52 | {"min_value": 1, "max_value": 9}, 53 | pa.Check.in_range(1, 9), 54 | ), 55 | ("isin", [9, "a"], pa.Check.isin([9, "a"])), 56 | ("notin", [9, "a"], pa.Check.notin([9, "a"])), 57 | ("str_contains", "a", pa.Check.str_contains("a")), 58 | ("str_endswith", "a", pa.Check.str_endswith("a")), 59 | ("str_matches", "a", pa.Check.str_matches("a")), 60 | ( 61 | "str_length", 62 | {"min_value": 1, "max_value": 9}, 63 | pa.Check.str_length(1, 9), 64 | ), 65 | ("str_startswith", "a", pa.Check.str_startswith("a")), 66 | ], 67 | ) 68 | def test_field_checks(arg: str, value: Any, expected: pa.Check) -> None: 69 | """Test that all built-in checks are available in a Field.""" 70 | checks = pa.Field(**{arg: value}).column_properties(str)["checks"] 71 | assert len(checks) == 1 72 | assert checks[0] == expected 73 | -------------------------------------------------------------------------------- /tests/pandas/test_multithreaded.py: -------------------------------------------------------------------------------- 1 | """Test that pandera schemas are thread safe.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from joblib import Parallel, delayed 6 | 7 | import pandera.pandas as pa 8 | 9 | 10 | class Model(pa.DataFrameModel): 11 | time: pa.typing.Series[np.float32] = pa.Field(coerce=True) 12 | 13 | 14 | def validate_df(df): 15 | validated_df = Model.to_schema().validate(df) 16 | assert validated_df.dtypes["time"] == np.float32 17 | return validated_df 18 | 19 | 20 | def test_single_thread(): 21 | df = pd.DataFrame({"time": np.array([1.0, 2.0, 3.0], dtype=np.float64)}) 22 | validate_df(df) 23 | 24 | 25 | def test_multithreading(): 26 | df = pd.DataFrame({"time": np.array([1.0, 2.0, 3.0], dtype=np.float64)}) 27 | n_tries = 10 28 | total = 8 29 | n_jobs = 4 30 | 31 | for _ in range(n_tries): 32 | results = Parallel(n_jobs=n_jobs, prefer="threads")( 33 | delayed(validate_df)(df) for _ in range(total) 34 | ) 35 | for res in results: 36 | assert res.dtypes["time"] == np.float32 37 | -------------------------------------------------------------------------------- /tests/pandas/test_numpy_engine.py: -------------------------------------------------------------------------------- 1 | """Test numpy engine.""" 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from pandera.engines import numpy_engine 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "data_type", list(numpy_engine.Engine.get_registered_dtypes()) 11 | ) 12 | def test_numpy_data_type(data_type): 13 | """Test base numpy engine DataType.""" 14 | numpy_engine.Engine.dtype(data_type) 15 | numpy_engine.Engine.dtype(data_type.type) 16 | numpy_engine.Engine.dtype(str(data_type.type)) 17 | with pytest.warns(UserWarning): 18 | np_dtype = numpy_engine.DataType(data_type.type) 19 | with pytest.warns(UserWarning): 20 | np_dtype_from_str = numpy_engine.DataType(str(data_type.type)) 21 | assert np_dtype == np_dtype_from_str 22 | 23 | 24 | @pytest.mark.parametrize("data_type", ["foo", "bar", 1, 2, 3.14, np.void]) 25 | def 
test_numpy_engine_dtype_exceptions(data_type): 26 | """Test invalid inputs to numpy data-types.""" 27 | if data_type != np.void: 28 | with pytest.raises( 29 | TypeError, match="data type '.+' not understood by" 30 | ): 31 | numpy_engine.Engine.dtype(data_type) 32 | else: 33 | numpy_engine.Engine._registered_dtypes = set() 34 | numpy_engine.Engine.dtype(data_type) 35 | 36 | 37 | def test_numpy_string(): 38 | """Test numpy engine String data type coercion.""" 39 | # pylint: disable=no-value-for-parameter 40 | string_type = numpy_engine.String() 41 | assert ( 42 | string_type.coerce(np.array([1, 2, 3, 4, 5], dtype=int)) 43 | == np.array(list("12345")) 44 | ).all() 45 | assert string_type.check(numpy_engine.String()) 46 | -------------------------------------------------------------------------------- /tests/pandas/test_pandas_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests for pandas_accessor module.""" 2 | 3 | from typing import Union 4 | from unittest.mock import patch 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | import pandera.pandas as pa 10 | import pandera.api.pandas.container 11 | from pandera.errors import BackendNotFoundError 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "schema1, schema2, data, invalid_data", 16 | [ 17 | [ 18 | pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True), 19 | pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True), 20 | pd.DataFrame({"col": [1, 2, 3]}), 21 | pd.Series([1, 2, 3]), 22 | ], 23 | [ 24 | pa.SeriesSchema(int, coerce=True), 25 | pa.SeriesSchema(float, coerce=True), 26 | pd.Series([1, 2, 3]), 27 | pd.DataFrame({"col": [1, 2, 3]}), 28 | ], 29 | ], 30 | ) 31 | @pytest.mark.parametrize("inplace", [False, True]) 32 | def test_dataframe_series_add_schema( 33 | schema1: Union[pa.DataFrameSchema, pa.SeriesSchema], 34 | schema2: Union[pa.DataFrameSchema, pa.SeriesSchema], 35 | data: Union[pd.DataFrame, pd.Series], 36 | invalid_data: Union[pd.DataFrame, pd.Series], 37 | inplace: bool, 38 | ) -> None: 39 | """ 40 | Test that pandas object contains schema metadata after pandera validation. 
41 | """ 42 | validated_data_1 = schema1(data, inplace=inplace) # type: ignore 43 | if inplace: 44 | assert data.pandera.schema == schema1 45 | else: 46 | assert data.pandera.schema is None 47 | assert validated_data_1.pandera.schema == schema1 48 | 49 | validated_data_2 = schema2(validated_data_1, inplace=inplace) # type: ignore 50 | if inplace: 51 | assert validated_data_1.pandera.schema == schema2 52 | else: 53 | assert validated_data_1.pandera.schema == schema1 54 | assert validated_data_2.pandera.schema == schema2 55 | 56 | with pytest.raises((BackendNotFoundError, TypeError)): 57 | schema1(invalid_data) # type: ignore 58 | 59 | with pytest.raises((BackendNotFoundError, TypeError)): 60 | schema2(invalid_data) # type: ignore 61 | 62 | with patch.object( 63 | pandera.backends.pandas.container, 64 | "is_table", 65 | return_value=True, 66 | ): 67 | with patch.object( 68 | pandera.api.pandas.array, 69 | "is_field", 70 | return_value=True, 71 | ): 72 | with pytest.raises(BackendNotFoundError): 73 | schema1(invalid_data) # type: ignore 74 | 75 | with pytest.raises(BackendNotFoundError): 76 | schema2(invalid_data) # type: ignore 77 | -------------------------------------------------------------------------------- /tests/pandas/test_pandas_config.py: -------------------------------------------------------------------------------- 1 | """This module is to test the behaviour change based on defined config in pandera""" 2 | 3 | # pylint:disable=import-outside-toplevel,abstract-method,redefined-outer-name 4 | 5 | from dataclasses import asdict 6 | 7 | import pandas as pd 8 | import pytest 9 | 10 | import pandera.pandas as pa 11 | from pandera.pandas import DataFrameModel, DataFrameSchema, SeriesSchema 12 | from pandera.config import ValidationDepth, config_context, get_config_context 13 | 14 | 15 | @pytest.fixture(autouse=True, scope="function") 16 | def disable_validation(): 17 | """Fixture to disable validation and clean up after the test is finished""" 18 | with config_context(validation_enabled=False): 19 | yield 20 | 21 | 22 | class TestPandasDataFrameConfig: 23 | """Class to test all the different configs types""" 24 | 25 | sample_data = pd.DataFrame( 26 | (("Bread", 9), ("Cutter", 15)), columns=["product", "price_val"] 27 | ) 28 | 29 | # pylint: disable=unused-argument 30 | def test_disable_validation(self): 31 | """This function validates that a none object is loaded if validation is disabled""" 32 | 33 | pandera_schema = DataFrameSchema( 34 | { 35 | "product": pa.Column( 36 | str, pa.Check(lambda s: s.str.startswith("B")) 37 | ), 38 | "price_val": pa.Column(int), 39 | } 40 | ) 41 | 42 | class TestSchema(DataFrameModel): 43 | """Test Schema class""" 44 | 45 | product: str = pa.Field(str_startswith="B") 46 | price_val: int = pa.Field() 47 | 48 | expected = { 49 | "cache_dataframe": False, 50 | "keep_cached_dataframe": False, 51 | "validation_enabled": False, 52 | "validation_depth": ValidationDepth.SCHEMA_AND_DATA, 53 | } 54 | 55 | assert asdict(get_config_context()) == expected 56 | assert pandera_schema.validate(self.sample_data) is self.sample_data 57 | assert TestSchema.validate(self.sample_data) is self.sample_data 58 | 59 | 60 | class TestPandasSeriesConfig: 61 | """Class to test all the different configs types""" 62 | 63 | sample_data = pd.Series([1, 1, 2, 2, 3, 3]) 64 | 65 | # pylint: disable=unused-argument 66 | def test_disable_validation(self): 67 | """This function validates that a none object is loaded if validation is disabled""" 68 | expected = { 69 | "cache_dataframe": 
False, 70 | "keep_cached_dataframe": False, 71 | "validation_enabled": False, 72 | "validation_depth": ValidationDepth.SCHEMA_AND_DATA, 73 | } 74 | pandera_schema = SeriesSchema( 75 | int, pa.Check(lambda s: s.value_counts() == 2, element_wise=False) 76 | ) 77 | assert asdict(get_config_context()) == expected 78 | assert pandera_schema.validate(self.sample_data) is self.sample_data 79 | -------------------------------------------------------------------------------- /tests/pandas/test_pandas_parallel.py: -------------------------------------------------------------------------------- 1 | """Test parallelization with pandas using joblib.""" 2 | 3 | import pandas as pd 4 | from joblib import Parallel, delayed 5 | 6 | from pandera.pandas import Column, DataFrameSchema 7 | 8 | schema = DataFrameSchema({"a": Column("int64")}, coerce=True) 9 | 10 | 11 | def test_polars_parallel(): 12 | def fn(): 13 | return schema.validate(pd.DataFrame({"a": [1]})) 14 | 15 | results = Parallel(2)([delayed(fn)() for _ in range(10)]) 16 | assert len(results) == 10 17 | for result in results: 18 | assert result.dtypes["a"] == "int64" 19 | -------------------------------------------------------------------------------- /tests/pandas/test_pydantic_dtype.py: -------------------------------------------------------------------------------- 1 | """Unit tests for pydantic datatype.""" 2 | 3 | from typing import Type 4 | 5 | import pandas as pd 6 | import pytest 7 | from pydantic import BaseModel 8 | 9 | import pandera.pandas as pa 10 | from pandera.api.pandas.array import ArraySchema 11 | from pandera.engines.pandas_engine import PydanticModel 12 | 13 | 14 | class Record(BaseModel): 15 | """Pydantic record model.""" 16 | 17 | name: str 18 | xcoord: int 19 | ycoord: int 20 | 21 | 22 | class PydanticSchema(pa.DataFrameModel): 23 | """Pandera schema using the pydantic model.""" 24 | 25 | class Config: 26 | """Config with dataframe-level data type.""" 27 | 28 | dtype = PydanticModel(Record) 29 | 30 | 31 | class PanderaSchema(pa.DataFrameModel): 32 | """Pandera schema that's equivalent to PydanticSchema.""" 33 | 34 | name: pa.typing.Series[str] 35 | xcoord: pa.typing.Series[int] 36 | ycoord: pa.typing.Series[int] 37 | 38 | 39 | def test_pydantic_model(): 40 | """Test that pydantic model correctly validates data.""" 41 | 42 | @pa.check_types 43 | def func(df: pa.typing.DataFrame[PydanticSchema]): 44 | return df 45 | 46 | valid_df = pd.DataFrame( 47 | { 48 | "name": ["foo", "bar", "baz"], 49 | "xcoord": [1.0, 2, 3], 50 | "ycoord": [4, 5.0, 6], 51 | } 52 | ) 53 | 54 | invalid_df = pd.DataFrame( 55 | { 56 | "name": ["foo", "bar", "baz"], 57 | "xcoord": [1, 2, "c"], 58 | "ycoord": [4, 5, "d"], 59 | } 60 | ) 61 | 62 | validated = func(valid_df) 63 | PanderaSchema.validate(validated) 64 | 65 | expected_failure_cases = pd.DataFrame( 66 | {"index": [2], "failure_case": ["{'xcoord': 'c', 'ycoord': 'd'}"]} 67 | ) 68 | 69 | try: 70 | func(invalid_df) 71 | except pa.errors.SchemaError as exc: 72 | pd.testing.assert_frame_equal( 73 | exc.failure_cases, expected_failure_cases 74 | ) 75 | 76 | 77 | @pytest.mark.parametrize("series_type", [pa.SeriesSchema, pa.Column, pa.Index]) 78 | def test_pydantic_model_init_errors(series_type: Type[ArraySchema]): 79 | """ 80 | Should raise SchemaInitError with PydanticModel as `SeriesSchemaBase.dtype` 81 | """ 82 | with pytest.raises(pa.errors.SchemaInitError): 83 | series_type(dtype=PydanticModel(Record)) 84 | 85 | 86 | @pytest.mark.parametrize("coerce", [True, False]) 87 | def 
test_pydantic_model_coerce(coerce: bool): 88 | """Test that DataFrameSchema.coerce is always True with pydantic model""" 89 | 90 | dataframe_schema = pa.DataFrameSchema( 91 | dtype=PydanticModel(Record), coerce=coerce 92 | ) 93 | assert dataframe_schema.coerce is True 94 | -------------------------------------------------------------------------------- /tests/pandas/test_validation_depth.py: -------------------------------------------------------------------------------- 1 | """Unit tests for granular control based on validation depth.""" 2 | 3 | import pytest 4 | 5 | from pandera.backends.base import CoreCheckResult 6 | from pandera.config import ValidationDepth, ValidationScope, config_context 7 | from pandera.validation_depth import validate_scope 8 | 9 | 10 | def custom_backend(): 11 | class CustomBackend: 12 | 13 | # pylint: disable=unused-argument 14 | @validate_scope(ValidationScope.SCHEMA) 15 | def check_schema(self, check_obj): 16 | # core check result is passed as True when validation scope doesn't 17 | # include schema checks 18 | return CoreCheckResult(passed=False) 19 | 20 | # pylint: disable=unused-argument 21 | @validate_scope(ValidationScope.DATA) 22 | def check_data(self, check_obj): 23 | # core check result is passed as True when validation scope doesn't 24 | # include data checks 25 | return CoreCheckResult(passed=False) 26 | 27 | return CustomBackend() 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "validation_depth,expected", 32 | [ 33 | [ValidationDepth.SCHEMA_ONLY, [False, True]], 34 | [ValidationDepth.DATA_ONLY, [True, False]], 35 | [ValidationDepth.SCHEMA_AND_DATA, [False, False]], 36 | [None, [False, False]], 37 | ], 38 | ) 39 | def test_validate_scope(validation_depth, expected): 40 | 41 | with config_context(validation_depth=validation_depth): 42 | backend = custom_backend() 43 | schema_result = backend.check_schema("foo") 44 | data_result = backend.check_data("foo") 45 | results = [schema_result.passed, data_result.passed] 46 | assert results == expected 47 | -------------------------------------------------------------------------------- /tests/polars/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/polars/__init__.py -------------------------------------------------------------------------------- /tests/polars/conftest.py: -------------------------------------------------------------------------------- 1 | """Polars unit test-specific configuration.""" 2 | 3 | import pytest 4 | 5 | from pandera.config import CONFIG, ValidationDepth, reset_config_context 6 | 7 | 8 | @pytest.fixture(scope="function", autouse=True) 9 | def validation_depth_schema_and_data(): 10 | """ 11 | These tests ensure that the validation depth is set to SCHEMA_AND_DATA 12 | for unit tests. 
13 | """ 14 | _validation_depth = CONFIG.validation_depth 15 | CONFIG.validation_depth = ValidationDepth.SCHEMA_AND_DATA 16 | try: 17 | yield 18 | finally: 19 | CONFIG.validation_depth = _validation_depth 20 | reset_config_context() 21 | -------------------------------------------------------------------------------- /tests/polars/test_polars_dataframe_generic.py: -------------------------------------------------------------------------------- 1 | """Unit tests for polars LazyFrame generic.""" 2 | 3 | import polars as pl 4 | import pytest 5 | 6 | import pandera.polars as pa 7 | from pandera.typing.polars import LazyFrame, Series 8 | 9 | 10 | def test_series_annotation(): 11 | class Model(pa.DataFrameModel): 12 | col1: Series[pl.Int64] 13 | 14 | data = pl.LazyFrame( 15 | { 16 | "col1": [1, 2, 3], 17 | } 18 | ) 19 | 20 | assert data.collect().equals(Model.validate(data).collect()) 21 | 22 | invalid_data = data.cast({"col1": pl.Float64}) 23 | with pytest.raises(pa.errors.SchemaError): 24 | Model.validate(invalid_data).collect() 25 | 26 | 27 | def test_lazyframe_generic_simple(): 28 | class Model(pa.DataFrameModel): 29 | col1: pl.Int64 30 | col2: pl.Utf8 31 | col3: pl.Float64 32 | 33 | @pa.check_types 34 | def fn(lf: LazyFrame[Model]) -> LazyFrame[Model]: 35 | return lf 36 | 37 | data = pl.LazyFrame( 38 | { 39 | "col1": [1, 2, 3], 40 | "col2": [*"abc"], 41 | "col3": [1.0, 2.0, 3.0], 42 | } 43 | ) 44 | 45 | assert data.collect().equals(fn(data).collect()) 46 | 47 | invalid_data = data.cast({"col3": pl.Int64}) 48 | with pytest.raises(pa.errors.SchemaError): 49 | fn(invalid_data).collect() 50 | 51 | 52 | def test_lazyframe_generic_transform(): 53 | class Input(pa.DataFrameModel): 54 | col1: pl.Int64 55 | col2: pl.Utf8 56 | 57 | class Output(Input): 58 | col3: pl.Float64 59 | 60 | @pa.check_types 61 | def fn(lf: LazyFrame[Input]) -> LazyFrame[Output]: 62 | return lf.with_columns(col3=pl.lit(3.0)) # type: ignore 63 | 64 | @pa.check_types 65 | def invalid_fn(lf: LazyFrame[Input]) -> LazyFrame[Output]: 66 | return lf # type: ignore 67 | 68 | data = pl.LazyFrame( 69 | { 70 | "col1": [1, 2, 3], 71 | "col2": [*"abc"], 72 | } 73 | ) 74 | 75 | assert isinstance(fn(data).collect(), pl.DataFrame) 76 | 77 | with pytest.raises(pa.errors.SchemaError): 78 | invalid_fn(data).collect() 79 | -------------------------------------------------------------------------------- /tests/polars/test_polars_decorators.py: -------------------------------------------------------------------------------- 1 | """Unit tests for using schemas with polars and function decorators.""" 2 | 3 | import polars as pl 4 | import pytest 5 | 6 | import pandera.polars as pa 7 | import pandera.typing.polars as pa_typing 8 | 9 | 10 | @pytest.fixture 11 | def data() -> pl.DataFrame: 12 | return pl.DataFrame({"a": [1, 2, 3]}) 13 | 14 | 15 | @pytest.fixture 16 | def invalid_data(data) -> pl.DataFrame: 17 | return data.rename({"a": "b"}) 18 | 19 | 20 | def test_polars_dataframe_check_io(data, invalid_data): 21 | # pylint: disable=unused-argument 22 | 23 | schema = pa.DataFrameSchema({"a": pa.Column(int)}) 24 | 25 | @pa.check_input(schema) 26 | def fn_check_input(x): ... 
27 | 28 | @pa.check_output(schema) 29 | def fn_check_output(x): 30 | return x 31 | 32 | @pa.check_io(x=schema, out=schema) 33 | def fn_check_io(x): 34 | return x 35 | 36 | @pa.check_io(x=schema, out=schema) 37 | def fn_check_io_invalid(x): 38 | return x.rename({"a": "b"}) 39 | 40 | # valid data should pass 41 | fn_check_input(data) 42 | fn_check_output(data) 43 | fn_check_io(data) 44 | 45 | # invalid data or invalid function should not pass 46 | with pytest.raises(pa.errors.SchemaError): 47 | fn_check_input(invalid_data) 48 | 49 | with pytest.raises(pa.errors.SchemaError): 50 | fn_check_output(invalid_data) 51 | 52 | with pytest.raises(pa.errors.SchemaError): 53 | fn_check_io_invalid(data) 54 | 55 | 56 | def test_polars_dataframe_check_types(data, invalid_data): 57 | # pylint: disable=unused-argument 58 | 59 | class Model(pa.DataFrameModel): 60 | a: int 61 | 62 | @pa.check_types 63 | def fn_check_input(x: pa_typing.DataFrame[Model]): ... 64 | 65 | @pa.check_types 66 | def fn_check_output(x) -> pa_typing.DataFrame[Model]: 67 | return x 68 | 69 | @pa.check_types 70 | def fn_check_io( 71 | x: pa_typing.DataFrame[Model], 72 | ) -> pa_typing.DataFrame[Model]: 73 | return x 74 | 75 | @pa.check_types 76 | def fn_check_io_invalid( 77 | x: pa_typing.DataFrame[Model], 78 | ) -> pa_typing.DataFrame[Model]: 79 | return x.rename({"a": "b"}) # type: ignore 80 | 81 | # valid data should pass 82 | fn_check_input(data) 83 | fn_check_output(data) 84 | fn_check_io(data) 85 | 86 | # invalid data or invalid function should not pass 87 | with pytest.raises(pa.errors.SchemaError): 88 | fn_check_input(invalid_data) 89 | 90 | with pytest.raises(pa.errors.SchemaError): 91 | fn_check_output(invalid_data) 92 | 93 | with pytest.raises(pa.errors.SchemaError): 94 | fn_check_io_invalid(data) 95 | -------------------------------------------------------------------------------- /tests/polars/test_polars_parallel.py: -------------------------------------------------------------------------------- 1 | """Test parallelization with polars using joblib.""" 2 | 3 | import polars as pl 4 | from joblib import Parallel, delayed 5 | 6 | from pandera.polars import Column, DataFrameSchema 7 | 8 | schema = DataFrameSchema({"a": Column(pl.Int32)}, coerce=True) 9 | 10 | 11 | def test_polars_parallel(): 12 | def fn(): 13 | return schema.validate(pl.DataFrame({"a": [1]})) 14 | 15 | results = Parallel(2)([delayed(fn)() for _ in range(10)]) 16 | assert len(results) == 10 17 | for result in results: 18 | assert result.schema["a"] == pl.Int32 19 | -------------------------------------------------------------------------------- /tests/polars/test_polars_strategies.py: -------------------------------------------------------------------------------- 1 | """Unit tests for polars strategy methods.""" 2 | 3 | import pytest 4 | 5 | import pandera.polars as pa 6 | 7 | 8 | def test_dataframe_schema_strategy(): 9 | schema = pa.DataFrameSchema() 10 | 11 | with pytest.raises(NotImplementedError): 12 | schema.strategy() 13 | 14 | with pytest.raises(NotImplementedError): 15 | schema.example() 16 | 17 | 18 | def test_column_schema_strategy(): 19 | column_schema = pa.Column(str) 20 | 21 | with pytest.raises(NotImplementedError): 22 | column_schema.strategy() 23 | 24 | with pytest.raises(NotImplementedError): 25 | column_schema.example() 26 | 27 | with pytest.raises(NotImplementedError): 28 | column_schema.strategy_component() 29 | -------------------------------------------------------------------------------- /tests/pyspark/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Init file for pyspark tests""" 2 | -------------------------------------------------------------------------------- /tests/pyspark/test_pyspark_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests for pyspark_accessor module.""" 2 | 3 | from typing import Union 4 | 5 | import pytest 6 | from pyspark.sql import DataFrame, SparkSession 7 | from pyspark.sql.functions import col 8 | from pyspark.sql.types import FloatType, LongType 9 | 10 | import pandera.pyspark as pa 11 | from pandera.config import PanderaConfig, ValidationDepth 12 | from pandera.pyspark import pyspark_sql_accessor 13 | 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "schema1, schema2, data, invalid_data", 19 | [ 20 | [ 21 | pa.DataFrameSchema({"col": pa.Column("long")}, coerce=True), 22 | pa.DataFrameSchema({"col": pa.Column("float")}, coerce=False), 23 | spark.createDataFrame([{"col": 1}, {"col": 2}, {"col": 3}]), 24 | spark.createDataFrame([{"col": 1}, {"col": 2}, {"col": 3}]), 25 | ], 26 | ], 27 | ) 28 | def test_dataframe_add_schema( 29 | schema1: pa.DataFrameSchema, 30 | schema2: pa.DataFrameSchema, 31 | data: Union[DataFrame, col], 32 | invalid_data: Union[DataFrame, col], 33 | config_params: PanderaConfig, 34 | ) -> None: 35 | """ 36 | Test that pyspark object contains schema metadata after pandera validation. 37 | """ 38 | schema1(data) # type: ignore[arg-type] 39 | 40 | assert data.pandera.schema == schema1 41 | assert isinstance(schema1.validate(data), DataFrame) 42 | assert isinstance(schema1(data), DataFrame) 43 | if config_params.validation_depth != ValidationDepth.DATA_ONLY: 44 | assert dict(schema2(invalid_data).pandera.errors["SCHEMA"]) == { 45 | "WRONG_DATATYPE": [ 46 | { 47 | "schema": None, 48 | "column": "col", 49 | "check": f"dtype('{str(FloatType())}')", 50 | "error": f"expected column 'col' to have type {str(FloatType())}, got {str(LongType())}", 51 | } 52 | ] 53 | } # type: ignore[arg-type] 54 | 55 | 56 | class CustomAccessor: 57 | """Mock accessor class""" 58 | 59 | def __init__(self, obj): 60 | self._obj = obj 61 | 62 | 63 | def test_modin_accessor_warning(): 64 | """Test that modin accessor raises warning when name already exists.""" 65 | pyspark_sql_accessor.register_dataframe_accessor("foo")(CustomAccessor) 66 | with pytest.warns(UserWarning): 67 | pyspark_sql_accessor.register_dataframe_accessor("foo")(CustomAccessor) 68 | -------------------------------------------------------------------------------- /tests/pyspark/test_pyspark_engine.py: -------------------------------------------------------------------------------- 1 | """Tests Engine subclassing and registering DataTypes.Test pyspark engine.""" 2 | 3 | # pylint:disable=redefined-outer-name,unused-argument 4 | 5 | import pytest 6 | 7 | from pandera.engines import pyspark_engine 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "data_type", 12 | list( 13 | pyspark_engine.Engine.get_registered_dtypes() 14 | ), # pylint:disable=no-value-for-parameter 15 | ) 16 | def test_pyspark_data_type(data_type): 17 | """Test pyspark engine DataType base class.""" 18 | if data_type.type is None: 19 | # don't test data types that require parameters e.g. 
Category 20 | return 21 | parameterized_datatypes = ["decimal", "array", "map"] 22 | 23 | pyspark_engine.Engine.dtype( 24 | data_type 25 | ) # pylint:disable=no-value-for-parameter 26 | pyspark_engine.Engine.dtype( 27 | data_type.type 28 | ) # pylint:disable=no-value-for-parameter 29 | if data_type.type.typeName() not in parameterized_datatypes: 30 | pyspark_engine.Engine.dtype( 31 | str(data_type.type) 32 | ) # pylint:disable=no-value-for-parameter 33 | 34 | with pytest.warns(UserWarning): 35 | pd_dtype = pyspark_engine.DataType(data_type.type) 36 | if data_type.type.typeName() not in parameterized_datatypes: 37 | with pytest.warns(UserWarning): 38 | pd_dtype_from_str = pyspark_engine.DataType(str(data_type.type)) 39 | assert pd_dtype == pd_dtype_from_str 40 | -------------------------------------------------------------------------------- /tests/strategies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/strategies/__init__.py -------------------------------------------------------------------------------- /tests/test_inspection_utils.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring,missing-module-docstring 2 | # pylint: disable=missing-class-docstring,bad-mcs-classmethod-argument 3 | from pandera.inspection_utils import ( 4 | is_classmethod_from_meta, 5 | is_decorated_classmethod, 6 | ) 7 | 8 | 9 | class SomeMeta(type): 10 | def __new__(mcs, *args, **kwargs): 11 | return super().__new__(mcs, *args, **kwargs) 12 | 13 | def __init__(cls, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | 16 | def regular_method_meta(cls): 17 | return cls 18 | 19 | @classmethod 20 | def class_method_meta(mcs): 21 | return mcs 22 | 23 | @staticmethod 24 | def static_method_meta(): 25 | return 1 26 | 27 | 28 | class SomeClass(metaclass=SomeMeta): 29 | def regular_method(self): 30 | return self 31 | 32 | @classmethod 33 | def class_method(cls): 34 | return cls 35 | 36 | @staticmethod 37 | def static_method(): 38 | return 2 39 | 40 | 41 | class SomeChild(SomeClass): 42 | def regular_method_child(self): 43 | return self 44 | 45 | @classmethod 46 | def class_method_child(cls): 47 | return cls 48 | 49 | @staticmethod 50 | def static_method_child(): 51 | return 3 52 | 53 | 54 | def test_is_decorated_classmethod() -> None: 55 | some_instance = SomeClass() 56 | some_child = SomeChild() 57 | 58 | cls_methods_with_deco = { 59 | SomeMeta.class_method_meta, 60 | SomeClass.class_method_meta, 61 | SomeClass.class_method, 62 | SomeChild.class_method_meta, 63 | SomeChild.class_method, 64 | SomeChild.class_method_child, 65 | } 66 | 67 | cls_methods_from_meta = { 68 | SomeClass.regular_method_meta, 69 | SomeChild.regular_method_meta, 70 | } 71 | 72 | all_methods = { 73 | # from meta 74 | SomeMeta.class_method_meta, 75 | SomeMeta.static_method_meta, 76 | # from parent 77 | SomeClass.class_method_meta, 78 | SomeClass.regular_method_meta, 79 | SomeClass.static_method_meta, 80 | SomeClass.class_method, 81 | some_instance.regular_method, 82 | SomeClass.static_method, 83 | # from child 84 | SomeChild.class_method_meta, 85 | SomeChild.regular_method_meta, 86 | SomeChild.static_method_meta, 87 | SomeChild.class_method, 88 | some_child.regular_method, 89 | SomeChild.static_method, 90 | SomeChild.class_method_child, 91 | some_child.regular_method_child, 92 | SomeChild.static_method_child, 93 | } 94 
| 95 | for method in cls_methods_with_deco: 96 | assert is_decorated_classmethod(method), f"{method} is decorated" 97 | for method in all_methods - cls_methods_with_deco: 98 | assert not is_decorated_classmethod( 99 | method 100 | ), f"{method} is not decorated" 101 | for method in cls_methods_from_meta: 102 | assert is_classmethod_from_meta(method), f"{method} comes from meta" 103 | for method in all_methods - cls_methods_from_meta: 104 | assert not is_classmethod_from_meta( 105 | method 106 | ), f"{method} does not come from meta" 107 | --------------------------------------------------------------------------------
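The test file above partitions methods into `cls_methods_with_deco` and `cls_methods_from_meta` but never states the distinction the two helpers draw. A minimal usage sketch of that behaviour, assuming only the `pandera.inspection_utils` functions imported in the test (the `Meta`/`Model` names below are illustrative, not from the repository):

from pandera.inspection_utils import (
    is_classmethod_from_meta,
    is_decorated_classmethod,
)

class Meta(type):
    def resolve(cls):
        # plain method defined on the metaclass: callable from the class,
        # so it acts like a classmethod but has no @classmethod decorator
        return cls

class Model(metaclass=Meta):
    @classmethod
    def build(cls):
        # explicitly @classmethod-decorated method
        return cls

assert is_decorated_classmethod(Model.build)        # decorated -> True
assert not is_decorated_classmethod(Model.resolve)  # no decorator -> False
assert is_classmethod_from_meta(Model.resolve)      # bound via the metaclass -> True
assert not is_classmethod_from_meta(Model.build)    # ordinary classmethod -> False

The asserts mirror the membership checks in `test_is_decorated_classmethod` above: explicitly decorated classmethods satisfy the first predicate, while undecorated metaclass methods accessed through the class satisfy the second.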