├── .coveragerc ├── .github ├── CONTRIBUTING.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation-improvement.md │ ├── feature_request.md │ └── submit-question.md ├── config.yml ├── dependabot.yml └── workflows │ ├── ci-tests.yml │ └── publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── Makefile ├── README.md ├── asv_bench ├── README.md └── benchmarks │ ├── __init__.py │ ├── dataframe_schema.py │ └── series_schema.py ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ ├── custom.js │ ├── default.css │ ├── docsearch_config.js_t │ ├── pandera-banner.png │ ├── pandera-favicon.png │ └── pandera-logo.png │ ├── _templates │ ├── class.rst │ ├── dtype.rst │ ├── model_component_class.rst │ ├── module.rst │ ├── page.html │ ├── sidebar │ │ └── search.html │ └── strategies_module.rst │ ├── checks.md │ ├── conf.py │ ├── configuration.md │ ├── dask.md │ ├── data_format_conversion.md │ ├── data_synthesis_strategies.md │ ├── dataframe_models.md │ ├── dataframe_schemas.md │ ├── decorators.md │ ├── drop_invalid_rows.md │ ├── dtype_validation.md │ ├── dtypes.md │ ├── error_report.md │ ├── extensions.md │ ├── fastapi.md │ ├── frictionless.md │ ├── fugue.md │ ├── geopandas.md │ ├── hypothesis.md │ ├── index.md │ ├── integrations.md │ ├── jupyterlite_config.json │ ├── lazy_validation.md │ ├── modin.md │ ├── mypy_integration.md │ ├── notebooks │ └── try_pandera.ipynb │ ├── parsers.md │ ├── polars.md │ ├── pydantic_integration.md │ ├── pyspark.md │ ├── pyspark_sql.md │ ├── reference │ ├── core.rst │ ├── dataframe_models.rst │ ├── decorators.rst │ ├── dtypes.rst │ ├── errors.rst │ ├── extensions.rst │ ├── index.md │ ├── io.rst │ ├── schema_inference.rst │ └── strategies.rst │ ├── schema_inference.md │ ├── series_schemas.md │ └── supported_libraries.md ├── environment.yml ├── mypy.ini ├── noxfile.py ├── pandera ├── __init__.py ├── _pandas_deprecated.py ├── _patch_numpy2.py ├── accessors │ ├── __init__.py │ ├── dask_accessor.py │ ├── modin_accessor.py │ ├── pandas_accessor.py │ ├── polars_accessor.py │ ├── pyspark_accessor.py │ └── pyspark_sql_accessor.py ├── api │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── checks.py │ │ ├── error_handler.py │ │ ├── model.py │ │ ├── model_components.py │ │ ├── model_config.py │ │ ├── parsers.py │ │ ├── schema.py │ │ └── types.py │ ├── checks.py │ ├── dataframe │ │ ├── __init__.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_components.py │ │ └── model_config.py │ ├── extensions.py │ ├── function_dispatch.py │ ├── hypotheses.py │ ├── pandas │ │ ├── __init__.py │ │ ├── array.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_config.py │ │ └── types.py │ ├── parsers.py │ ├── polars │ │ ├── __init__.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_config.py │ │ ├── types.py │ │ └── utils.py │ └── pyspark │ │ ├── __init__.py │ │ ├── column_schema.py │ │ ├── components.py │ │ ├── container.py │ │ ├── model.py │ │ ├── model_components.py │ │ ├── model_config.py │ │ └── types.py ├── backends │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── builtin_checks.py │ │ └── builtin_hypotheses.py │ ├── pandas │ │ ├── __init__.py │ │ ├── array.py │ │ ├── base.py │ │ ├── builtin_checks.py │ │ ├── builtin_hypotheses.py │ │ ├── checks.py │ │ ├── components.py │ │ ├── container.py │ │ ├── error_formatters.py │ │ ├── hypotheses.py │ │ ├── parsers.py │ │ └── register.py │ ├── polars │ │ ├── 
__init__.py │ │ ├── base.py │ │ ├── builtin_checks.py │ │ ├── checks.py │ │ ├── components.py │ │ ├── container.py │ │ ├── error_formatters.py │ │ └── register.py │ ├── pyspark │ │ ├── __init__.py │ │ ├── base.py │ │ ├── builtin_checks.py │ │ ├── checks.py │ │ ├── column.py │ │ ├── components.py │ │ ├── container.py │ │ ├── decorators.py │ │ ├── error_formatters.py │ │ ├── register.py │ │ └── utils.py │ └── utils.py ├── config.py ├── constants.py ├── decorators.py ├── dtypes.py ├── engines │ ├── __init__.py │ ├── engine.py │ ├── geopandas_engine.py │ ├── numpy_engine.py │ ├── pandas_engine.py │ ├── polars_engine.py │ ├── pyarrow_engine.py │ ├── pyspark_engine.py │ ├── type_aliases.py │ └── utils.py ├── errors.py ├── extensions.py ├── external_config.py ├── import_utils.py ├── inspection_utils.py ├── io │ ├── __init__.py │ └── pandas_io.py ├── mypy.py ├── pandas.py ├── polars.py ├── py.typed ├── pyspark.py ├── schema_inference │ ├── __init__.py │ └── pandas.py ├── schema_statistics │ ├── __init__.py │ └── pandas.py ├── strategies │ ├── __init__.py │ ├── base_strategies.py │ └── pandas_strategies.py ├── system.py ├── typing │ ├── __init__.py │ ├── common.py │ ├── dask.py │ ├── fastapi.py │ ├── formats.py │ ├── geopandas.py │ ├── modin.py │ ├── pandas.py │ ├── polars.py │ ├── pyspark.py │ └── pyspark_sql.py ├── utils.py └── validation_depth.py ├── pyproject.toml ├── requirements.txt ├── scripts └── generate_pip_deps_from_conda.py ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── base └── test_base_schema.py ├── conftest.py ├── dask ├── __init__.py ├── test_dask.py ├── test_dask_accessor.py └── test_dask_not_installed.py ├── fastapi ├── __init__.py ├── app.py ├── models.py └── test_app.py ├── geopandas ├── test_engine.py ├── test_from_to_format_conversions.py ├── test_geopandas.py └── test_pydantic.py ├── hypotheses ├── __init__.py └── test_hypotheses.py ├── io ├── __init__.py └── test_pandas_io.py ├── modin ├── __init__.py ├── conftest.py ├── test_logical_dtypes.py ├── test_modin_accessor.py └── test_schemas_on_modin.py ├── mypy ├── config │ ├── no_plugin.ini │ └── plugin_mypy.ini ├── pandas_modules │ ├── pandas_concat.py │ ├── pandas_dataframe.py │ ├── pandas_index.py │ ├── pandas_series.py │ ├── pandas_time.py │ ├── pandera_inheritance.py │ ├── pandera_types.py │ └── python_slice.py └── test_pandas_static_type_checking.py ├── pandas ├── __init__.py ├── checks_fixtures.py ├── conftest.py ├── modules │ ├── __init__.py │ └── validate_on_init.py ├── test__pandas_deprecated__test_model.py ├── test__pandas_deprecated__test_schemas.py ├── test_checks.py ├── test_checks_builtin.py ├── test_config.py ├── test_decorators.py ├── test_docs_setting_column_widths.py ├── test_dtypes.py ├── test_engine.py ├── test_engine_utils.py ├── test_errors.py ├── test_extension_modules.py ├── test_extensions.py ├── test_from_to_format_conversions.py ├── test_logical_dtypes.py ├── test_model.py ├── test_model_components.py ├── test_multithreaded.py ├── test_numpy_engine.py ├── test_pandas_accessor.py ├── test_pandas_config.py ├── test_pandas_engine.py ├── test_pandas_parallel.py ├── test_parsers.py ├── test_pydantic.py ├── test_pydantic_dtype.py ├── test_schema_components.py ├── test_schema_inference.py ├── test_schema_statistics.py ├── test_schemas.py ├── test_typing.py └── test_validation_depth.py ├── polars ├── __init__.py ├── conftest.py ├── test_polars_builtin_checks.py ├── test_polars_check.py ├── test_polars_components.py ├── test_polars_config.py ├── test_polars_container.py ├── 
test_polars_dataframe_generic.py ├── test_polars_decorators.py ├── test_polars_dtypes.py ├── test_polars_model.py ├── test_polars_parallel.py ├── test_polars_pydantic.py ├── test_polars_strategies.py └── test_polars_typing.py ├── pyspark ├── __init__.py ├── conftest.py ├── test_pyspark_accessor.py ├── test_pyspark_check.py ├── test_pyspark_config.py ├── test_pyspark_container.py ├── test_pyspark_decorators.py ├── test_pyspark_dtypes.py ├── test_pyspark_engine.py ├── test_pyspark_error.py ├── test_pyspark_model.py └── test_schemas_on_pyspark_pandas.py ├── strategies ├── __init__.py └── test_strategies.py └── test_inspection_utils.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = pandera 3 | 4 | [report] 5 | exclude_lines = 6 | if self.debug: 7 | pragma: no cover 8 | raise NotImplementedError 9 | if __name__ == .__main__.: 10 | ignore_errors = True 11 | omit = 12 | tests/* 13 | pandera/mypy.py 14 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [cosmicBboy] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | - [ ] I have checked that this issue has not already been reported. 14 | - [ ] I have confirmed this bug exists on the latest version of pandera. 15 | - [ ] (optional) I have confirmed this bug exists on the main branch of pandera. 16 | 17 | **Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug. 18 | 19 | #### Code Sample, a copy-pastable example 20 | 21 | ```python 22 | # Your code here 23 | 24 | ``` 25 | 26 | #### Expected behavior 27 | A clear and concise description of what you expected to happen. 28 | 29 | #### Desktop (please complete the following information): 30 | 31 | - OS: [e.g. iOS] 32 | - Browser: [e.g. chrome, safari] 33 | - Version: [e.g. 22] 34 | 35 | #### Screenshots 36 | If applicable, add screenshots to help explain your problem. 37 | 38 | #### Additional context 39 | Add any other context about the problem here. 
40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Improvement 3 | about: Report wrong or missing documentation 4 | title: '' 5 | labels: docs 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Location of the documentation 11 | 12 | [this should provide the location of the documentation, e.g. "pandera.api.pandas.container.DataFrameSchema" or the URL of the documentation, e.g. "https://pandera.readthedocs.io/en/stable/dataframe_schemas.html#column-validation"] 13 | 14 | **Note**: You can check the latest versions of the docs on `master` [here](https://pandera.readthedocs.io/en/latest/). 15 | 16 | #### Documentation problem 17 | 18 | [this should provide a description of what documentation you believe needs to be fixed/improved] 19 | 20 | #### Suggested fix for documentation 21 | 22 | [this should explain the suggested fix and **why** it's better than the existing documentation] 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/submit-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Submit Question 3 | about: Ask a general question about pandera 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Question about pandera 11 | 12 | **Note**: If you'd still like to submit a question, please read [this guide]( 13 | https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question. 14 | 15 | ```python 16 | # Your code here, if applicable 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /.github/config.yml: -------------------------------------------------------------------------------- 1 | # Comment to be posted on PRs from first-time contributors in your repository 2 | newPRWelcomeComment: | 3 | Thank you for opening this pull request! 🙌 4 | 5 | These tips will help get your PR across the finish line: 6 | 7 | - If you haven't already, check out the [Contributing Guide](https://pandera.readthedocs.io/en/stable/CONTRIBUTING.html) 8 | - Sign off your commits (Reference: [DCO Guide](https://github.com/src-d/guide/blob/master/developer-community/fix-DCO.md)). 9 | 10 | # Comment to be posted to on pull requests merged by a first time user 11 | firstPRMergeComment: > 12 | Congrats on merging your first pull request! 
🎉 13 | 14 | # Comment to be posted on first-time issues 15 | newIssueWelcomeComment: > 16 | Thank you for opening your first issue here! 🛠 17 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build_wheel_and_sdist: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | with: 13 | fetch-depth: "0" 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.x" 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install build twine 22 | - name: Build wheel and sdist 23 | run: python -m build 24 | shell: bash 25 | - uses: actions/upload-artifact@v4 26 | with: 27 | name: pandera-artifact 28 | path: ./dist 29 | 30 | pypi-publish: 31 | name: Upload release to PyPI 32 | needs: [build_wheel_and_sdist] 33 | runs-on: ubuntu-latest 34 | permissions: 35 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 36 | environment: release 37 | steps: 38 | - uses: actions/download-artifact@v4 39 | with: 40 | name: pandera-artifact 41 | path: dist 42 | - run: ls dist 43 | - name: Publish package distributions to PyPI 44 | uses: pypa/gh-action-pypi-publish@release/v1 45 | with: 46 | attestations: false 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pandera/_version.py 2 | uv.lock 3 | *.db 4 | .vscode 5 | dask-worker-space 6 | spark-warehouse 7 | docs/source/_contents 8 | docs/jupyter_execute 9 | **.DS_Store 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | 116 | # Pycharm settings 117 | .idea 118 | 119 | # Airspeed Velocity Benchmarks 120 | /asv_bench/html/ 121 | /asv_bench/results/ 122 | 123 | # Docs 124 | docs/source/reference/generated 125 | 126 | # Nox 127 | .nox 128 | .nox-* 129 | 130 | # ignore markdown files copied from .github 131 | docs/source/CONTRIBUTING.md 132 | .aider* 133 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: (^asv_bench|setup.py|requirements-dev.txt) 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.1.0 6 | hooks: 7 | - id: check-ast 8 | description: Simply check whether files parse as valid python 9 | - id: check-case-conflict 10 | description: Check for files that would conflict in case-insensitive filesystems 11 | - id: check-merge-conflict 12 | description: Check for files that contain merge conflict strings 13 | - id: check-yaml 14 | description: Attempts to load all yaml files to verify syntax 15 | - id: debug-statements 16 | description: Check for debugger imports and py37+ breakpoint() calls in python source 17 | - id: end-of-file-fixer 18 | description: Makes sure files end in a newline and only a newline 19 | - id: trailing-whitespace 20 | description: Trims trailing whitespace 21 | - id: mixed-line-ending 22 | description: Replaces or checks mixed line ending 23 | 24 | - repo: https://github.com/pre-commit/mirrors-isort 25 | rev: v5.10.1 26 | hooks: 27 | - id: isort 28 | args: ["--line-length=79", "--skip=docs/source/conf.py", "--diff"] 29 | 30 | - repo: https://github.com/ikamensh/flynt 31 | rev: "0.76" 32 | hooks: 33 | - id: flynt 34 | 35 | - repo: https://github.com/psf/black 36 | rev: 24.4.2 37 | hooks: 38 | - id: black 39 | 40 | - repo: https://github.com/asottile/pyupgrade 41 | rev: v3.19.1 42 | hooks: 43 | - id: pyupgrade 44 | args: [--py38-plus, --keep-runtime-typing] 45 | 46 | - repo: https://github.com/pycqa/pylint 47 | rev: v3.3.6 48 | hooks: 49 | - id: pylint 50 | args: ["--disable=import-error"] 51 | exclude: (^docs/|^scripts) 52 | 53 | - repo: https://github.com/pre-commit/mirrors-mypy 54 | rev: v1.10.0 55 | hooks: 56 | - id: mypy 57 | additional_dependencies: 58 | - types-click 59 | - types-pytz 60 | - types-pyyaml 61 | - types-requests 62 | - types-setuptools 63 | - polars 64 
| args: ["pandera", "tests", "scripts"] 65 | exclude: (^docs/|^tests/mypy/modules/) 66 | pass_filenames: false 67 | require_serial: true 68 | verbose: true 69 | 70 | - repo: https://github.com/codespell-project/codespell 71 | rev: v2.4.1 72 | hooks: 73 | - id: codespell 74 | additional_dependencies: 75 | - tomli -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [BASIC] 2 | ignore=mypy.py,noxfile.py,pandera/accessors/pyspark_sql_accessor.py,pandera/engines/pyspark_engine.py,pandera/pyspark.py,pandera/typing/pyspark_sql.py, 3 | ignore-patterns=pandera/api/pyspark/*,tests/pyspark/* 4 | good-names= 5 | T, 6 | F, 7 | logger, 8 | df, 9 | fn, 10 | i, 11 | e, 12 | x, 13 | f, 14 | k, 15 | v, 16 | fp, 17 | bar, 18 | eq, 19 | ne, 20 | gt, 21 | ge, 22 | lt, 23 | le, 24 | dt, 25 | tz, 26 | TBaseModel, 27 | TArraySchemaBase, 28 | TDataFrameModel, 29 | _DataType 30 | 31 | [MESSAGES CONTROL] 32 | disable= 33 | # C0330 conflicts with black: https://github.com/psf/black/issues/48 34 | R0913, 35 | duplicate-code, 36 | too-many-instance-attributes, 37 | no-else-return, 38 | inconsistent-return-statements, 39 | protected-access, 40 | too-many-ancestors, 41 | too-many-lines, 42 | too-few-public-methods, 43 | line-too-long, 44 | ungrouped-imports, 45 | function-redefined, 46 | arguments-differ, 47 | unnecessary-dunder-call, 48 | use-dict-literal, 49 | invalid-name, 50 | import-outside-toplevel, 51 | missing-class-docstring, 52 | missing-function-docstring, 53 | fixme, 54 | too-many-locals, 55 | redefined-outer-name, 56 | logging-fstring-interpolation, 57 | multiple-statements, 58 | cyclic-import, 59 | too-many-positional-arguments, 60 | too-many-function-args, 61 | # Due to custom `immutable` decorator replacing `dataclasses.dataclass` 62 | invalid-field-call 63 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-24.04 10 | apt_packages: 11 | # Install OpenJDK as Java backend to run PySpark examples. 
12 | - openjdk-11-jre-headless 13 | tools: 14 | python: "3.11" 15 | jobs: 16 | post_install: 17 | - pip install uv 18 | - UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --all-extras --all-groups --link-mode=copy 19 | 20 | sphinx: 21 | configuration: docs/source/conf.py 22 | 23 | 24 | # Optionally build your docs in additional formats such as PDF and ePub 25 | formats: [] 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Niels Bantilan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: tests clean clean-pyc upload-pypi-test upload-pypi requirements docs \ 2 | code-cov docs-clean requirements-dev.txt 3 | 4 | clean: 5 | python setup.py clean 6 | 7 | clean-pyc: 8 | find . 
-name '*.pyc' -exec rm {} \; 9 | 10 | upload-pypi-test: 11 | python setup.py sdist bdist_wheel && \ 12 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* && \ 13 | rm -rf dist 14 | 15 | upload-pypi: 16 | python setup.py sdist bdist_wheel && \ 17 | twine upload dist/* && \ 18 | rm -rf dist 19 | 20 | .PHONY: install-uv 21 | install-uv: 22 | pip install uv 23 | 24 | setup: install-uv 25 | uv sync --all-extras 26 | 27 | setup-macos: install-uv 28 | uv sync --all-extras 29 | uv pip install polars-lts-cpu 30 | 31 | docs-clean: 32 | rm -rf docs/source/reference/generated docs/**/generated docs/**/methods docs/_build docs/source/_contents 33 | 34 | docs: docs-clean 35 | python -m sphinx -W -E "docs/source" "docs/_build" && make -C docs doctest 36 | 37 | quick-docs: 38 | python -m sphinx -E "docs/source" "docs/_build" && make -C docs doctest 39 | 40 | code-cov: 41 | pytest --cov-report=html --cov=pandera tests/ 42 | 43 | NOX_FLAGS ?= "-r" 44 | 45 | deps-from-environment.yml: 46 | python scripts/generate_pip_deps_from_conda.py 47 | 48 | unit-tests: 49 | pytest tests/core 50 | 51 | nox-tests: 52 | nox -db uv -s tests ${NOX_FLAGS} 53 | -------------------------------------------------------------------------------- /asv_bench/README.md: -------------------------------------------------------------------------------- 1 | # Airspeed Velocity 2 | 3 | `pandera`'s performance benchmarks over time can be [viewed on this airspeed-velocity dashboard](https://pandera-dev.github.io/pandera-asv-logs/). 4 | 5 | The [config](https://github.com/pandera-dev/pandera-asv-logs/tree/master/asv_bench/asv.conf.json) and [results files](https://github.com/pandera-dev/pandera-asv-logs/tree/master/results) files are tracked in the [pandera-asv-logs](https://github.com/pandera-dev/pandera-asv-logs) repo to avoid build files in the main repo. 6 | 7 | The [benchmarks](https://github.com/pandera-dev/pandera/tree/master/benchmarks/) are tracked in the main [pandera repo](https://github.com/pandera-dev/pandera). 8 | 9 | ## Running `asv` 10 | 11 | Ensure both the `pandera` and `pandera-asv-logs` repos are checked out to the same parent directory. 12 | 13 | From the `pandera-asv-logs` repo, run: 14 | ``` 15 | asv run ALL --config asv_bench/asv.conf.json 16 | ``` 17 | 18 | ## Publishing results: 19 | 20 | To build the html and preview the results: 21 | ``` 22 | asv publish --config asv_bench/asv.conf.json 23 | asv preview --config asv_bench/asv.conf.json 24 | ``` 25 | 26 | The `.json` results files are committed or PR'd into the master branch of `pandera-asv-logs`. 27 | 28 | The published html is pushed directly to the gh-pages branch of `pandera-asv-logs` by running: 29 | 30 | ``` 31 | asv gh-pages --rewrite --config asv_bench/asv.conf.json 32 | ``` 33 | 34 | The `--rewrite` flag overwrites the existing `gh-pages`, avoiding duplication of data. 35 | 36 | The `asv` docs are [here](https://asv.readthedocs.io/en/stable/index.html). 
37 | -------------------------------------------------------------------------------- /asv_bench/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/asv_bench/benchmarks/__init__.py -------------------------------------------------------------------------------- /asv_bench/benchmarks/dataframe_schema.py: -------------------------------------------------------------------------------- 1 | # Airspeed Velocity Benchmarks for pandera 2 | import pandas as pd 3 | 4 | from pandera.pandas import ( 5 | Column, 6 | DataFrameSchema, 7 | Bool, 8 | Category, 9 | Check, 10 | DateTime, 11 | Float, 12 | Int, 13 | Object, 14 | String, 15 | Timedelta, 16 | check_input, 17 | check_output, 18 | ) 19 | 20 | 21 | class Validate: 22 | """ 23 | Benchmarking schema.validate 24 | """ 25 | 26 | def setup(self): 27 | self.schema = DataFrameSchema( 28 | { 29 | "a": Column(Int), 30 | "b": Column(Float), 31 | "c": Column(String), 32 | "d": Column(Bool), 33 | "e": Column(Category), 34 | "f": Column(Object), 35 | "g": Column(DateTime), 36 | "i": Column(Timedelta), 37 | }, 38 | ) 39 | self.df = pd.DataFrame( 40 | { 41 | "a": [1, 2, 3], 42 | "b": [1.1, 2.5, 9.9], 43 | "c": ["z", "y", "x"], 44 | "d": [True, True, False], 45 | "e": pd.Series(["c2", "c1", "c3"], dtype="category"), 46 | "f": [(3,), (2,), (1,)], 47 | "g": [ 48 | pd.Timestamp("2015-02-01"), 49 | pd.Timestamp("2015-02-02"), 50 | pd.Timestamp("2015-02-03"), 51 | ], 52 | "i": [ 53 | pd.Timedelta(1, unit="D"), 54 | pd.Timedelta(5, unit="D"), 55 | pd.Timedelta(9, unit="D"), 56 | ], 57 | } 58 | ) 59 | 60 | def time_df_schema(self): 61 | self.schema.validate(self.df) 62 | 63 | def mem_df_schema(self): 64 | self.schema.validate(self.df) 65 | 66 | def peakmem_df_schema(self): 67 | self.schema.validate(self.df) 68 | 69 | 70 | class Decorators: 71 | """ 72 | Benchmarking input and output decorator performance. 
73 | """ 74 | 75 | def transformer(df): 76 | return df.assign(column2=[1, 2, 3]) 77 | 78 | def setup(self): 79 | self.in_schema = DataFrameSchema({"column1": Column(String)}) 80 | self.out_schema = DataFrameSchema({"column2": Column(Int)}) 81 | self.df = pd.DataFrame({"column1": ["a", "b", "c"]}) 82 | 83 | def time_check_input(self): 84 | @check_input(self.in_schema) 85 | def transform_first_arg(self): 86 | return Decorators.transformer(self.df) 87 | 88 | def mem_check_input(self): 89 | @check_input(self.in_schema) 90 | def transform_first_arg(self): 91 | return Decorators.transformer(self.df) 92 | 93 | def peakmem_check_input(self): 94 | @check_input(self.in_schema) 95 | def transform_first_arg(self): 96 | return Decorators.transformer(self.df) 97 | 98 | def time_check_output(self): 99 | @check_output(self.out_schema) 100 | def transform_first_arg(self): 101 | return Decorators.transformer(self.df) 102 | 103 | def mem_check_output(self): 104 | @check_output(self.out_schema) 105 | def transform_first_arg(self): 106 | return Decorators.transformer(self.df) 107 | 108 | def peakmem_check_output(self): 109 | @check_output(self.out_schema) 110 | def transform_first_arg(self): 111 | return Decorators.transformer(self.df) 112 | -------------------------------------------------------------------------------- /asv_bench/benchmarks/series_schema.py: -------------------------------------------------------------------------------- 1 | # Airspeed Velocity Benchmarks for pandera 2 | import pandas as pd 3 | 4 | from pandera.pandas import ( 5 | Column, 6 | DataFrameSchema, 7 | SeriesSchema, 8 | Bool, 9 | Category, 10 | Check, 11 | DateTime, 12 | Float, 13 | Int, 14 | Object, 15 | String, 16 | Timedelta, 17 | String, 18 | ) 19 | 20 | 21 | class Validate: 22 | """ 23 | Benchmarking Series schema.validate 24 | """ 25 | 26 | def setup(self): 27 | self.schema = SeriesSchema( 28 | String, 29 | checks=[ 30 | Check(lambda s: s.str.startswith("foo")), 31 | Check(lambda s: s.str.endswith("bar")), 32 | Check(lambda x: len(x) > 3, element_wise=True), 33 | ], 34 | nullable=False, 35 | unique=False, 36 | name="my_series", 37 | ) 38 | self.series = pd.Series(["foobar", "foobar", "foobar"], name="my_series") 39 | 40 | def time_series_schema(self): 41 | self.schema.validate(self.series) 42 | 43 | def mem_series_schema(self): 44 | self.schema.validate(self.series) 45 | 46 | def peakmem_series_schema(self): 47 | self.schema.validate(self.series) 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/custom.js: -------------------------------------------------------------------------------- 1 | // Add event listener for DOMContentLoaded event 2 | window.addEventListener("DOMContentLoaded", function() { 3 | // Select all elements with class "external" 4 | var externalLinks = document.querySelectorAll("a.external"); 5 | 6 | // Loop through each element with class "external" 7 | externalLinks.forEach(function(link) { 8 | // Set the target attribute to "_blank" 9 | link.setAttribute("target", "_blank"); 10 | }); 11 | }); 12 | 13 | 14 | function setHtmlDataTheme() { 15 | // Set theme at the root html element 16 | setTimeout(() => { 17 | const theme = document.body.dataset.theme; 18 | const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches; 19 | 20 | if (theme === "auto") { 21 | document.documentElement.dataset.theme = prefersDark ? "dark" : "light"; 22 | } else { 23 | document.documentElement.dataset.theme = theme; 24 | } 25 | }, 10) 26 | } 27 | 28 | function setupAlgoliaTheme() { 29 | // To get darkmode in the algolia search modal, we need to set the theme in 30 | // the root html element. This function propagates the theme set by furo 31 | // that's set in the body element. 32 | const buttons = document.getElementsByClassName("theme-toggle"); 33 | 34 | // set for initial document load 35 | setHtmlDataTheme(); 36 | 37 | // listen for when theme button is clicked. 38 | Array.from(buttons).forEach((btn) => { 39 | btn.addEventListener("click", setHtmlDataTheme); 40 | }); 41 | } 42 | 43 | function main() { 44 | setupAlgoliaTheme() 45 | } 46 | 47 | document.addEventListener('DOMContentLoaded', main); 48 | window.addEventListener('keydown', (event) => { 49 | if (event.code === "Escape") { 50 | // make sure to prevent default behavior with escape key so that algolia 51 | // modal can be closed properly. 
52 | event.preventDefault(); 53 | } 54 | }); 55 | -------------------------------------------------------------------------------- /docs/source/_static/docsearch_config.js_t: -------------------------------------------------------------------------------- 1 | docsearch({ 2 | container: "{{ docsearch_container|default('#docsearch') }}", 3 | appId: "{{ docsearch_app_id }}", 4 | apiKey: "{{ docsearch_api_key }}", 5 | indexName: "{{ docsearch_index_name }}", 6 | {%- if docsearch_search_parameters %} 7 | searchParameters: { 8 | {% for key, value in docsearch_search_parameters.items() %} 9 | {{ key }}: {% if value is string %}"{{ value }}"{% else %}{{ value }}{% endif %}{% if not loop.last %},{% endif %} 10 | {% endfor %} 11 | } 12 | {%- endif %} 13 | }); 14 | -------------------------------------------------------------------------------- /docs/source/_static/pandera-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-banner.png -------------------------------------------------------------------------------- /docs/source/_static/pandera-favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-favicon.png -------------------------------------------------------------------------------- /docs/source/_static/pandera-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/docs/source/_static/pandera-logo.png -------------------------------------------------------------------------------- /docs/source/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | 7 | {% block attributes %} 8 | {% if attributes %} 9 | .. rubric:: Attributes 10 | 11 | .. autosummary:: 12 | :nosignatures: 13 | 14 | {% for item in attributes %} 15 | ~{{ name }}.{{ item }} 16 | {%- endfor %} 17 | 18 | {% endif %} 19 | {% endblock %} 20 | 21 | {% block methods %} 22 | {% if methods %} 23 | .. rubric:: Methods 24 | 25 | {% for item in methods %} 26 | {%- if item not in inherited_members %} 27 | .. automethod:: {{ item }} 28 | {% endif %} 29 | {%- endfor %} 30 | 31 | {% endif %} 32 | 33 | {%- if members and '__call__' in members %} 34 | .. automethod:: __call__ 35 | {%- endif %} 36 | 37 | {% endblock %} 38 | -------------------------------------------------------------------------------- /docs/source/_templates/dtype.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | 7 | {% block attributes %} 8 | {% if attributes %} 9 | .. rubric:: Attributes 10 | 11 | .. autosummary:: 12 | :nosignatures: 13 | 14 | {% for item in attributes %} 15 | ~{{ name }}.{{ item }} 16 | {%- endfor %} 17 | 18 | {% endif %} 19 | {% endblock %} 20 | 21 | {% block methods %} 22 | {% if methods %} 23 | .. rubric:: Methods 24 | 25 | {% for item in methods %} 26 | .. automethod:: {{ item }} 27 | {%- endfor %} 28 | 29 | {%- if members and '__call__' in members %} 30 | .. 
automethod:: __call__ 31 | {%- endif %} 32 | 33 | {%- endif %} 34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /docs/source/_templates/model_component_class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :show-inheritance: 7 | :exclude-members: 8 | 9 | {% block attributes %} 10 | {% if attributes %} 11 | .. rubric:: Attributes 12 | 13 | .. autosummary:: 14 | :nosignatures: 15 | 16 | {% for item in attributes %} 17 | ~{{ name }}.{{ item }} 18 | {%- endfor %} 19 | 20 | {% endif %} 21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /docs/source/_templates/module.rst: -------------------------------------------------------------------------------- 1 | .. empty 2 | 3 | {{ fullname | escape | underline }} 4 | 5 | .. currentmodule:: {{ fullname }} 6 | 7 | .. automodule:: {{ fullname }} 8 | 9 | {% block classes %} 10 | 11 | {% for item in classes %} 12 | .. autoclass:: {{ item }} 13 | :members: 14 | :member-order: bysource 15 | :show-inheritance: 16 | :exclude-members: 17 | {%- endfor %} 18 | 19 | {% endblock %} 20 | 21 | {% block functions %} 22 | 23 | {% for item in functions %} 24 | .. autofunction:: {{ item }} 25 | {%- endfor %} 26 | 27 | {% endblock %} 28 | -------------------------------------------------------------------------------- /docs/source/_templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "!page.html" %} 2 | 3 | {% block body -%} 4 | {{ super() }} 5 | 6 | 11 | 12 | {%- endblock %} 13 | -------------------------------------------------------------------------------- /docs/source/_templates/sidebar/search.html: -------------------------------------------------------------------------------- 1 |
2 | -------------------------------------------------------------------------------- /docs/source/_templates/strategies_module.rst: -------------------------------------------------------------------------------- 1 | .. empty 2 | 3 | {{ fullname | escape | underline }} 4 | 5 | .. currentmodule:: {{ fullname }} 6 | 7 | .. automodule:: {{ fullname }} 8 | 9 | {% block functions %} 10 | 11 | {% for item in functions %} 12 | {% if item not in ["null_dataframe_masks", "null_field_masks", "set_pandas_index", "strategy_import_error"] %} 13 | .. autofunction:: {{ item }} 14 | {% endif %} 15 | {%- endfor %} 16 | 17 | {% endblock %} 18 | -------------------------------------------------------------------------------- /docs/source/configuration.md: -------------------------------------------------------------------------------- 1 | (configuration)= 2 | 3 | # Configuration 4 | 5 | *New in version 0.17.3* 6 | 7 | `pandera` provides a global config `~pandera.config.PanderaConfig`. The 8 | global configuration is available through `pandera.config.CONFIG`. It can also 9 | be modified with a configuration context `~pandera.config.config_context` and 10 | fetched with `~pandera.config.get_config_context` in custom code. 11 | 12 | This configuration can also be set using environment variables. 13 | 14 | ## Validation depth 15 | 16 | Validation depth determines whether pandera only runs schema-level validations 17 | (column names and datatypes), data-level validations (checks on actual values), 18 | or both: 19 | 20 | ``` 21 | export PANDERA_VALIDATION_ENABLED=False 22 | export PANDERA_VALIDATION_DEPTH=DATA_ONLY # SCHEMA_AND_DATA, SCHEMA_ONLY, DATA_ONLY 23 | ``` 24 | 25 | ## Enabling/disabling validation 26 | 27 | Runtime data validation incurs a performance overhead. To mitigate this in the 28 | appropriate contexts, you have the option to disable validation globally. 29 | 30 | This can be achieved by setting the environment variable 31 | `PANDERA_VALIDATION_ENABLED=False`. When validation is disabled, any 32 | `validate` call will not actually run any validation checks. 33 | -------------------------------------------------------------------------------- /docs/source/dask.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (scaling-dask)= 9 | 10 | # Data Validation with Dask 11 | 12 | *new in 0.8.0* 13 | 14 | [Dask](https://docs.dask.org/en/latest/dataframe.html) is a distributed 15 | compute framework that offers a pandas-like dataframe API. 16 | You can use pandera to validate {py:func}`~dask.dataframe.DataFrame` 17 | and {py:func}`~dask.dataframe.Series` objects directly. First, install 18 | `pandera` with the `dask` extra: 19 | 20 | ```bash 21 | pip install 'pandera[dask]' 22 | ``` 23 | 24 | Then you can use pandera schemas to validate dask dataframes. In the example 25 | below we'll use the {ref}`class-based API ` to define a 26 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation. 
27 | 28 | ```{code-cell} python 29 | import dask.dataframe as dd 30 | import pandas as pd 31 | import pandera.pandas as pa 32 | 33 | from pandera.typing.dask import DataFrame, Series 34 | 35 | 36 | class Schema(pa.DataFrameModel): 37 | state: Series[str] 38 | city: Series[str] 39 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20}) 40 | 41 | 42 | ddf = dd.from_pandas( 43 | pd.DataFrame( 44 | { 45 | 'state': ['FL','FL','FL','CA','CA','CA'], 46 | 'city': [ 47 | 'Orlando', 48 | 'Miami', 49 | 'Tampa', 50 | 'San Francisco', 51 | 'Los Angeles', 52 | 'San Diego', 53 | ], 54 | 'price': [8, 12, 10, 16, 20, 18], 55 | } 56 | ), 57 | npartitions=2 58 | ) 59 | pandera_ddf = Schema(ddf) 60 | pandera_ddf 61 | ``` 62 | 63 | As you can see, passing the dask dataframe into `Schema` will produce 64 | another dask dataframe which hasn't been evaluated yet. What this means is 65 | that pandera will only validate when the dask graph is evaluated. 66 | 67 | ```{code-cell} python 68 | pandera_ddf.compute() 69 | ``` 70 | 71 | You can also use the {py:func}`~pandera.check_types` decorator to validate 72 | dask dataframes at runtime: 73 | 74 | ```{code-cell} python 75 | @pa.check_types 76 | def function(ddf: DataFrame[Schema]) -> DataFrame[Schema]: 77 | return ddf[ddf["state"] == "CA"] 78 | 79 | function(ddf).compute() 80 | ``` 81 | 82 | And of course, you can use the object-based API to validate dask dataframes: 83 | 84 | ```{code-cell} python 85 | schema = pa.DataFrameSchema({ 86 | "state": pa.Column(str), 87 | "city": pa.Column(str), 88 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20)) 89 | }) 90 | schema(ddf).compute() 91 | ``` 92 | -------------------------------------------------------------------------------- /docs/source/drop_invalid_rows.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (drop-invalid-rows)= 9 | 10 | # Dropping Invalid Rows 11 | 12 | *New in version 0.16.0* 13 | 14 | If you wish to use the validation step to remove invalid data, you can pass the 15 | `drop_invalid_rows=True` argument to the `schema` object on creation. On `schema.validate()`, 16 | if a data-level check fails, then that row which caused the failure will be removed from the dataframe 17 | when it is returned. 18 | 19 | `drop_invalid_rows` will prevent data-level schema errors from being raised and will instead 20 | remove the rows which cause the failure. 21 | 22 | This functionality is available on `DataFrameSchema`, `SeriesSchema`, `Column`, 23 | as well as `DataFrameModel` schemas. 24 | 25 | **Note** that this functionality works by identifying the index or multi-index of the failing rows. 26 | If the index is not unique on the dataframe, this could result in incorrect rows being dropped. 
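For example, a minimal sketch of guarding against a non-unique index before dropping invalid rows (the `schema` and `df` names here are illustrative and mirror the examples that follow):

```python
import pandas as pd
import pandera.pandas as pa

# illustrative schema, mirroring the examples below
schema = pa.DataFrameSchema(
    {"counter": pa.Column(int, checks=[pa.Check(lambda x: x >= 3)])},
    drop_invalid_rows=True,
)

# duplicated index labels: dropping by index label could also remove valid rows
df = pd.DataFrame({"counter": [1, 3]}, index=[0, 0])

# resetting to a unique index first makes the row-dropping unambiguous
validated = schema.validate(df.reset_index(drop=True), lazy=True)
```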
27 | 28 | Dropping invalid rows with {class}`~pandera.api.pandas.container.DataFrameSchema`: 29 | 30 | ```{code-cell} python 31 | import pandas as pd 32 | import pandera.pandas as pa 33 | 34 | 35 | df = pd.DataFrame({"counter": [1, 2, 3]}) 36 | schema = pa.DataFrameSchema( 37 | {"counter": pa.Column(int, checks=[pa.Check(lambda x: x >= 3)])}, 38 | drop_invalid_rows=True, 39 | ) 40 | 41 | schema.validate(df, lazy=True) 42 | ``` 43 | 44 | Dropping invalid rows with {class}`~pandera.api.pandas.array.SeriesSchema`: 45 | 46 | ```{code-cell} python 47 | import pandas as pd 48 | import pandera.pandas as pa 49 | 50 | 51 | series = pd.Series([1, 2, 3]) 52 | schema = pa.SeriesSchema( 53 | int, 54 | checks=[pa.Check(lambda x: x >= 3)], 55 | drop_invalid_rows=True, 56 | ) 57 | 58 | schema.validate(series, lazy=True) 59 | ``` 60 | 61 | Dropping invalid rows with {class}`~pandera.api.pandas.components.Column`: 62 | 63 | ```{code-cell} python 64 | import pandas as pd 65 | import pandera.pandas as pa 66 | 67 | 68 | df = pd.DataFrame({"counter": [1, 2, 3]}) 69 | schema = pa.Column( 70 | int, 71 | name="counter", 72 | drop_invalid_rows=True, 73 | checks=[pa.Check(lambda x: x >= 3)] 74 | ) 75 | 76 | schema.validate(df, lazy=True) 77 | ``` 78 | 79 | Dropping invalid rows with {class}`~pandera.api.pandas.model.DataFrameModel`: 80 | 81 | ```{code-cell} python 82 | import pandas as pd 83 | import pandera.pandas as pa 84 | 85 | 86 | class MySchema(pa.DataFrameModel): 87 | counter: int = pa.Field(in_range={"min_value": 3, "max_value": 5}) 88 | 89 | class Config: 90 | drop_invalid_rows = True 91 | 92 | 93 | MySchema.validate( 94 | pd.DataFrame({"counter": [1, 2, 3, 4, 5, 6]}), lazy=True 95 | ) 96 | ``` 97 | 98 | ```{note} 99 | In order to use `drop_invalid_rows=True`, `lazy=True` must 100 | be passed to `schema.validate()`. {ref}`lazy-validation` enables all schema 101 | errors to be collected and raised together, meaning all invalid rows can be dropped together. 102 | This provides a clear API for ensuring the validated dataframe contains only valid data. 103 | ``` 104 | -------------------------------------------------------------------------------- /docs/source/error_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | (error-report)= 6 | 7 | # Error Reports 8 | 9 | *new in 0.19.0* 10 | 11 | The pandera error report is a generalised machine-readable summary of failures 12 | which occurred during schema validation. It is available for both `pyspark.sql` and 13 | `pandas` objects. 14 | 15 | By default, error reports are generated for both schema and data level validation, 16 | but more granular control over schema-only or data-only validations is available. 17 | 18 | This is achieved by introducing configurable settings using environment variables 19 | that allow you to control execution at three different levels: 20 | 21 | 1. `SCHEMA_ONLY`: perform schema validations only. It checks that data conforms 22 | to the schema definition, but does not perform any data-level validations on the dataframe. 23 | 2. `DATA_ONLY`: perform data-level validations only. It validates that data 24 | conforms to the defined `checks`, but does not validate the schema. 25 | 3. `SCHEMA_AND_DATA`: (**default**) perform both schema and data level 26 | validations. It runs the most exhaustive validation and could be compute-intensive. 
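These validation depth levels can also be set programmatically. A minimal sketch, assuming the `config_context` and `ValidationDepth` utilities referenced in the configuration docs above:

```python
import pandas as pd
import pandera.pandas as pa
from pandera.config import ValidationDepth, config_context

schema = pa.DataFrameSchema({"length": pa.Column(int, pa.Check.gt(10))})
df = pd.DataFrame({"length": [4, 11]})

# only schema-level validations (column presence and dtypes) run in this block;
# the data-level Check.gt(10) is skipped
with config_context(validation_depth=ValidationDepth.SCHEMA_ONLY):
    schema.validate(df)
```

Outside the context manager, the globally configured validation depth applies again.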
27 | 28 | You can override the default behaviour by setting an environment variable from the terminal 29 | before running the `pandera` process as: 30 | 31 | ```bash 32 | export PANDERA_VALIDATION_DEPTH=SCHEMA_ONLY 33 | ``` 34 | 35 | This will be picked up by `pandera` to enforce only schema-level validations. 36 | 37 | ## Error reports with `pandas` 38 | 39 | To create an error report with pandas, you must specify `lazy=True` to allow all errors 40 | to be aggregated and raised together as a `SchemaErrors`. 41 | 42 | ```{code-cell} python 43 | import pandas as pd 44 | import pandera.pandas as pa 45 | import json 46 | 47 | pandas_schema = pa.DataFrameSchema( 48 | { 49 | "color": pa.Column(str, pa.Check.isin(["red", "green", "blue"])), 50 | "length": pa.Column(int, pa.Check.gt(10)), 51 | } 52 | ) 53 | data = [("red", 4), ("blue", 11), ("purple", 15), ("green", 39)] 54 | 55 | df = pd.DataFrame( 56 | { 57 | "color": ["red", "blue", "purple", "green"], 58 | "length": [4, 11, 15, 39], 59 | } 60 | ) 61 | 62 | try: 63 | pandas_schema.validate(df, lazy=True) 64 | except pa.errors.SchemaErrors as e: 65 | print(json.dumps(e.message, indent=2)) 66 | ``` 67 | 68 | ## Error reports with `pyspark.sql` 69 | 70 | Accessing the error report on a validated `pyspark` dataframe can be done via the 71 | `errors` attribute on the `pandera` accessor. 72 | 73 | ```{code-cell} python 74 | import pandera.pyspark as pa 75 | import pyspark.sql.types as T 76 | import json 77 | 78 | from decimal import Decimal 79 | from pyspark.sql import SparkSession 80 | from pandera.pyspark import DataFrameModel 81 | 82 | spark = SparkSession.builder.getOrCreate() 83 | 84 | class PysparkPanderSchema(DataFrameModel): 85 | color: T.StringType() = pa.Field(isin=["red", "green", "blue"]) 86 | length: T.IntegerType() = pa.Field(gt=10) 87 | 88 | data = [("red", 4), ("blue", 11), ("purple", 15), ("green", 39)] 89 | 90 | spark_schema = T.StructType( 91 | [ 92 | T.StructField("color", T.StringType(), False), 93 | T.StructField("length", T.IntegerType(), False), 94 | ], 95 | ) 96 | 97 | df = spark.createDataFrame(data, spark_schema) 98 | df_out = PysparkPanderSchema.validate(check_obj=df) 99 | 100 | print(json.dumps(dict(df_out.pandera.errors), indent=4)) 101 | ``` 102 | -------------------------------------------------------------------------------- /docs/source/fastapi.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | .. currentmodule:: pandera 3 | ``` 4 | 5 | (fastapi-integration)= 6 | 7 | # FastAPI 8 | 9 | *new in 0.9.0* 10 | 11 | Since both FastAPI and Pandera integrate seamlessly with Pydantic, you can 12 | use the {py:class}`~pandera.api.pandas.model.DataFrameModel` types to validate incoming 13 | or outgoing data with respect to your API endpoints. 14 | 15 | ## Using DataFrameModels to Validate Endpoint Inputs and Outputs 16 | 17 | Suppose we want to process transactions, where each transaction has an 18 | `id` and `cost`. We can model this with a pandera dataframe model: 19 | 20 | ```{literalinclude} ../../tests/fastapi/models.py 21 | :language: python 22 | :lines: 1-14 23 | ``` 24 | 25 | Also suppose that we expect our endpoint to add a `name` to the transaction 26 | data: 27 | 28 | ```{literalinclude} ../../tests/fastapi/models.py 29 | :language: python 30 | :lines: 22-25 31 | ``` 32 | 33 | Let's also assume that the output of the endpoint should be a list of dictionary 34 | records containing the named transactions data. 
We can do this easily with the 35 | `to_format` option in the dataframe model {py:class}`~pandera.typing.config.BaseConfig`. 36 | 37 | ```{literalinclude} ../../tests/fastapi/models.py 38 | :language: python 39 | :lines: 34-37 40 | ``` 41 | 42 | Note that the `to_format_kwargs` is a dictionary of keyword arguments 43 | to be passed into the respective pandas `to_{format}` method. 44 | 45 | % TODO: create new page for the to/from_format config option 46 | 47 | Next we'll create a FastAPI app and define a `/transactions/` POST endpoint: 48 | 49 | ```{literalinclude} ../../tests/fastapi/app.py 50 | :language: python 51 | :lines: 2-6,14-21,28-34 52 | ``` 53 | 54 | ## Reading File Uploads 55 | 56 | Similar to the `TransactionsDictOut` example, which converts dataframes to a 57 | particular format as an endpoint response, pandera also provides a 58 | `from_format` dataframe model configuration option to read a dataframe from 59 | a particular serialization format. 60 | 61 | ```{literalinclude} ../../tests/fastapi/models.py 62 | :language: python 63 | :lines: 17-19 64 | ``` 65 | 66 | Let's also define a response model for the `/file/` upload endpoint: 67 | 68 | ```{literalinclude} ../../tests/fastapi/models.py 69 | :language: python 70 | :lines: 28-32,46-48 71 | ``` 72 | 73 | In the next example, we use the pandera 74 | {py:class}`~pandera.typing.fastapi.UploadFile` type to upload a parquet file 75 | to the `/file/` POST endpoint and return a response containing the filename 76 | and the modified data in json format. 77 | 78 | ```{literalinclude} ../../tests/fastapi/app.py 79 | :language: python 80 | :lines: 37-44 81 | ``` 82 | 83 | Pandera's {py:class}`~pandera.typing.fastapi.UploadFile` type is a subclass of FastAPI's 84 | [UploadFile](https://fastapi.tiangolo.com/tutorial/request-files/?h=uploadfile#uploadfile) 85 | but it exposes a `.data` property containing the pandera-validated dataframe. 86 | 87 | ## Takeaway 88 | 89 | With the FastAPI and Pandera integration, you can use Pandera 90 | {py:class}`~pandera.api.pandas.model.DataFrameModel` types to validate the dataframe inputs 91 | and outputs of your FastAPI endpoints. 92 | -------------------------------------------------------------------------------- /docs/source/frictionless.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | .. currentmodule:: pandera 3 | ``` 4 | 5 | (frictionless-integration)= 6 | 7 | # Reading Third-Party Schema 8 | 9 | *new in 0.7.0* 10 | 11 | Pandera now accepts schemas from other data validation frameworks. This requires 12 | a pandera installation with the `io` extension; please see the 13 | {ref}`installation` instructions for more details. 14 | 15 | ## Frictionless Data Schema 16 | 17 | :::{note} 18 | Please see the 19 | [Frictionless schema](https://specs.frictionlessdata.io/table-schema/) 20 | documentation for more information on this standard. 21 | ::: 22 | 23 | ```{eval-rst} 24 | .. autofunction:: pandera.io.from_frictionless_schema 25 | ``` 26 | 27 | Under the hood, this uses the {class}`~pandera.io.pandas_io.FrictionlessFieldParser` class 28 | to parse each frictionless field (column): 29 | 30 | ```{eval-rst} 31 | .. autoclass:: pandera.io.pandas_io.FrictionlessFieldParser 32 | :members: 33 | ``` 34 | -------------------------------------------------------------------------------- /docs/source/geopandas.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{eval-rst} 6 | .. 
currentmodule:: pandera 7 | ``` 8 | 9 | (supported-lib-geopandas)= 10 | 11 | # Data Validation with GeoPandas 12 | 13 | *new in 0.9.0* 14 | 15 | [GeoPandas](https://geopandas.org/en/stable/docs.html) is an extension of Pandas that adds 16 | support for geospatial data. You can use pandera to validate {py:func}`~geopandas.GeoDataFrame` 17 | and {py:func}`~geopandas.GeoSeries` objects directly. First, install 18 | `pandera` with the `geopandas` extra: 19 | 20 | ```bash 21 | pip install 'pandera[geopandas]' 22 | ``` 23 | 24 | Then you can use pandera schemas to validate geodataframes. In the example 25 | below we'll use the {ref}`class-based API ` to define a 26 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation. 27 | 28 | ```{code-cell} python 29 | import geopandas as gpd 30 | import pandas as pd 31 | import pandera.pandas as pa 32 | from shapely.geometry import Polygon 33 | 34 | geo_schema = pa.DataFrameSchema({ 35 | "geometry": pa.Column("geometry"), 36 | "region": pa.Column(str), 37 | }) 38 | 39 | geo_df = gpd.GeoDataFrame({ 40 | "geometry": [ 41 | Polygon(((0, 0), (0, 1), (1, 1), (1, 0))), 42 | Polygon(((0, 0), (0, -1), (-1, -1), (-1, 0))) 43 | ], 44 | "region": ["NA", "SA"] 45 | }) 46 | 47 | geo_schema.validate(geo_df) 48 | ``` 49 | 50 | You can also use the `GeometryDtype` data type in either instantiated or 51 | un-instantiated form: 52 | 53 | ```{code-cell} python 54 | geo_schema = pa.DataFrameSchema({ 55 | "geometry": pa.Column(gpd.array.GeometryDtype), 56 | # or 57 | "geometry": pa.Column(gpd.array.GeometryDtype()), 58 | }) 59 | ``` 60 | 61 | If you want to validate-on-instantiation, you can use the 62 | {py:class}`~pandera.typing.geopangas.GeoDataFrame` generic type with the 63 | dataframe model defined above: 64 | 65 | ```{code-cell} python 66 | from pandera.typing import Series 67 | from pandera.typing.geopandas import GeoDataFrame, GeoSeries 68 | 69 | 70 | class Schema(pa.DataFrameModel): 71 | geometry: GeoSeries 72 | region: Series[str] 73 | 74 | 75 | # create a geodataframe that's validated on object initialization 76 | df = GeoDataFrame[Schema]( 77 | { 78 | 'geometry': [ 79 | Polygon(((0, 0), (0, 1), (1, 1), (1, 0))), 80 | Polygon(((0, 0), (0, -1), (-1, -1), (-1, 0))) 81 | ], 82 | 'region': ['NA','SA'] 83 | } 84 | ) 85 | df 86 | ``` 87 | -------------------------------------------------------------------------------- /docs/source/integrations.md: -------------------------------------------------------------------------------- 1 | (integrations)= 2 | 3 | # Integrations 4 | 5 | Pandera ships with integrations with other tools in the Python ecosystem, with 6 | the goal of interoperating with libraries that you know and love. 7 | 8 | ```{eval-rst} 9 | .. list-table:: 10 | :widths: 25 75 11 | 12 | * - :ref:`FastAPI ` 13 | - Use pandera DataFrameModels in your FastAPI app 14 | * - :ref:`Frictionless ` 15 | - Convert frictionless schemas to pandera schemas 16 | * - :ref:`Hypothesis ` 17 | - Use the hypothesis library to generate valid data under your schema's constraints. 18 | * - :ref:`Mypy ` 19 | - Type-lint your pandas and pandera code with mypy for static type safety [experimental 🧪] 20 | * - :ref:`Pydantic ` 21 | - Use pandera DataFrameModels when defining your pydantic BaseModels 22 | ``` 23 | 24 | ```{toctree} 25 | :caption: Introduction 26 | :hidden: true 27 | :maxdepth: 1 28 | 29 | FastAPI 30 | Frictionless 31 | Hypothesis 32 | Mypy 33 | Pydantic 34 | ``` 35 | 36 | :::{note} 37 | Don't see a library that you want supported? 
Check out the 38 | [github issues](https://github.com/pandera-dev/pandera/issues) to see if 39 | that library is in the roadmap. If it isn't, open up a 40 | [new issue](https://github.com/pandera-dev/pandera/issues/new?assignees=&labels=enhancement&template=feature_request.md&title=) 41 | to add support for it! 42 | ::: 43 | -------------------------------------------------------------------------------- /docs/source/jupyterlite_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "LiteBuildConfig": { 3 | "federated_extensions": [], 4 | "ignore_sys_prefix": true, 5 | "piplite_urls": [] 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/source/lazy_validation.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (lazy-validation)= 9 | 10 | # Lazy Validation 11 | 12 | *New in version 0.4.0* 13 | 14 | By default, when you call the `validate` method on schema or schema component 15 | objects, a {class}`~pandera.errors.SchemaError` is raised as soon as one of the 16 | assumptions specified in the schema is falsified. For example, for a 17 | {class}`~pandera.api.pandas.container.DataFrameSchema` object, the following situations will raise an 18 | exception: 19 | 20 | - a column specified in the schema is not present in the dataframe. 21 | - if `strict=True`, a column in the dataframe is not specified in the schema. 22 | - the `data type` does not match. 23 | - if `coerce=True`, the dataframe column cannot be coerced into the specified 24 | `data type`. 25 | - the {class}`~pandera.api.checks.Check` specified in one of the columns returns `False` or 26 | a boolean series containing at least one `False` value. 27 | 28 | For example: 29 | 30 | ```{code-cell} python 31 | import pandas as pd 32 | import pandera.pandas as pa 33 | 34 | 35 | df = pd.DataFrame({"column": ["a", "b", "c"]}) 36 | 37 | schema = pa.DataFrameSchema({"column": pa.Column(int)}) 38 | 39 | try: 40 | schema.validate(df) 41 | except pa.errors.SchemaError as exc: 42 | print(exc) 43 | ``` 44 | 45 | For more complex cases, it is useful to see all of the errors raised during 46 | the `validate` call so that you can debug the causes of errors on different 47 | columns and checks. The `lazy` keyword argument in the `validate` method 48 | of all schemas and schema components gives you the option of doing just this: 49 | 50 | ```{code-cell} python 51 | import json 52 | 53 | import pandas as pd 54 | import pandera.pandas as pa 55 | 56 | 57 | schema = pa.DataFrameSchema( 58 | columns={ 59 | "int_column": pa.Column(int), 60 | "float_column": pa.Column(float, pa.Check.greater_than(0)), 61 | "str_column": pa.Column(str, pa.Check.equal_to("a")), 62 | "date_column": pa.Column(pa.DateTime), 63 | }, 64 | strict=True 65 | ) 66 | 67 | df = pd.DataFrame({ 68 | "int_column": ["a", "b", "c"], 69 | "float_column": [0, 1, 2], 70 | "str_column": ["a", "b", "d"], 71 | "unknown_column": None, 72 | }) 73 | 74 | try: 75 | schema.validate(df, lazy=True) 76 | except pa.errors.SchemaErrors as exc: 77 | print(json.dumps(exc.message, indent=2)) 78 | ``` 79 | 80 | As you can see from the output above, a {class}`~pandera.errors.SchemaErrors` 81 | exception is raised with a summary of the error counts and failure cases 82 | caught by the schema. This summary is called an {ref}`error-report`. 
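Since the report is a plain dictionary, you can also work with it programmatically rather than just printing it. As a minimal sketch, assuming the schema and dataframe defined above, the following lists the top-level sections of the report:

```{code-cell} python
try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    # exc.message is the same dictionary serialized with json.dumps above,
    # so its sections can be inspected programmatically as well as printed
    print(list(exc.message.keys()))
```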
83 | 84 | You can also inspect the failure cases in a more granular form: 85 | 86 | ```{code-cell} python 87 | try: 88 | schema.validate(df, lazy=True) 89 | except pa.errors.SchemaErrors as exc: 90 | print("Schema errors and failure cases:") 91 | print(exc.failure_cases) 92 | print("\nDataFrame object that failed validation:") 93 | print(exc.data) 94 | ``` 95 | -------------------------------------------------------------------------------- /docs/source/modin.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 | ``` 7 | 8 | (scaling-modin)= 9 | 10 | # Data Validation with Modin 11 | 12 | *new in 0.8.0* 13 | 14 | [Modin](https://modin.readthedocs.io/en/latest/) is a distributed 15 | compute framework that offers a pandas drop-in replacement dataframe 16 | implementation. You can use pandera to validate {py:func}`~modin.pandas.DataFrame` 17 | and {py:func}`~modin.pandas.Series` objects directly. First, install 18 | `pandera` with the `dask` extra: 19 | 20 | ```bash 21 | pip install 'pandera[modin]' # installs both ray and dask backends 22 | pip install 'pandera[modin-ray]' # only ray backend 23 | pip install 'pandera[modin-dask]' # only dask backend 24 | ``` 25 | 26 | Then you can use pandera schemas to validate modin dataframes. In the example 27 | below we'll use the {ref}`class-based API ` to define a 28 | {py:class}`~pandera.api.model.pandas.DataFrameModel` for validation. 29 | 30 | ```python 31 | import modin.pandas as pd 32 | import pandera.pandas as pa 33 | 34 | from pandera.typing.modin import DataFrame, Series 35 | 36 | 37 | class Schema(pa.DataFrameModel): 38 | state: Series[str] 39 | city: Series[str] 40 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20}) 41 | 42 | 43 | # create a modin dataframe that's validated on object initialization 44 | df = DataFrame[Schema]( 45 | { 46 | 'state': ['FL','FL','FL','CA','CA','CA'], 47 | 'city': [ 48 | 'Orlando', 49 | 'Miami', 50 | 'Tampa', 51 | 'San Francisco', 52 | 'Los Angeles', 53 | 'San Diego', 54 | ], 55 | 'price': [8, 12, 10, 16, 20, 18], 56 | } 57 | ) 58 | print(df) 59 | ``` 60 | 61 | ``` 62 | state city price 63 | 0 FL Orlando 8 64 | 1 FL Miami 12 65 | 2 FL Tampa 10 66 | 3 CA San Francisco 16 67 | 4 CA Los Angeles 20 68 | 5 CA San Diego 18 69 | ``` 70 | 71 | You can also use the {py:func}`~pandera.check_types` decorator to validate 72 | modin dataframes at runtime: 73 | 74 | ```python 75 | @pa.check_types 76 | def function(df: DataFrame[Schema]) -> DataFrame[Schema]: 77 | return df[df["state"] == "CA"] 78 | 79 | function(df) 80 | ``` 81 | 82 | ``` 83 | state city price 84 | 3 CA San Francisco 16 85 | 4 CA Los Angeles 20 86 | 5 CA San Diego 18 87 | ``` 88 | 89 | And of course, you can use the object-based API to validate modin dataframes: 90 | 91 | ```python 92 | schema = pa.DataFrameSchema({ 93 | "state": pa.Column(str), 94 | "city": pa.Column(str), 95 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20)) 96 | }) 97 | schema(df) 98 | ``` 99 | 100 | ``` 101 | state city price 102 | 0 FL Orlando 8 103 | 1 FL Miami 12 104 | 2 FL Tampa 10 105 | 3 CA San Francisco 16 106 | 4 CA Los Angeles 20 107 | 5 CA San Diego 18 108 | ``` 109 | -------------------------------------------------------------------------------- /docs/source/pyspark.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | ```{currentmodule} pandera 6 
| ``` 7 | 8 | (scaling-pyspark)= 9 | 10 | # Data Validation with Pyspark Pandas 11 | 12 | *new in 0.10.0* 13 | 14 | [Pyspark](https://spark.apache.org/docs/3.2.0/api/python/index.html) is a 15 | distributed compute framework that offers a pandas drop-in replacement dataframe 16 | implementation via the [pyspark.pandas API](https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/index.html) . 17 | You can use pandera to validate {py:func}`~pyspark.pandas.DataFrame` 18 | and {py:func}`~pyspark.pandas.Series` objects directly. First, install 19 | `pandera` with the `pyspark` extra: 20 | 21 | ```bash 22 | pip install 'pandera[pyspark]' 23 | ``` 24 | 25 | Then you can use pandera schemas to validate pyspark dataframes. In the example 26 | below we'll use the {ref}`class-based API ` to define a 27 | {py:class}`~pandera.api.pandas.model.DataFrameModel` for validation. 28 | 29 | ```{code-cell} python 30 | import pyspark.pandas as ps 31 | import pandas as pd 32 | import pandera.pandas as pa 33 | 34 | from pandera.typing.pyspark import DataFrame, Series 35 | 36 | 37 | class Schema(pa.DataFrameModel): 38 | state: Series[str] 39 | city: Series[str] 40 | price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20}) 41 | 42 | 43 | # create a pyspark.pandas dataframe that's validated on object initialization 44 | df = DataFrame[Schema]( 45 | { 46 | 'state': ['FL','FL','FL','CA','CA','CA'], 47 | 'city': [ 48 | 'Orlando', 49 | 'Miami', 50 | 'Tampa', 51 | 'San Francisco', 52 | 'Los Angeles', 53 | 'San Diego', 54 | ], 55 | 'price': [8, 12, 10, 16, 20, 18], 56 | } 57 | ) 58 | print(df) 59 | ``` 60 | 61 | You can also use the {py:func}`~pandera.check_types` decorator to validate 62 | pyspark pandas dataframes at runtime: 63 | 64 | ```{code-cell} python 65 | @pa.check_types 66 | def function(df: DataFrame[Schema]) -> DataFrame[Schema]: 67 | return df[df["state"] == "CA"] 68 | 69 | print(function(df)) 70 | ``` 71 | 72 | And of course, you can use the object-based API to validate dask dataframes: 73 | 74 | ```{code-cell} python 75 | schema = pa.DataFrameSchema({ 76 | "state": pa.Column(str), 77 | "city": pa.Column(str), 78 | "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20)) 79 | }) 80 | schema(df) 81 | ``` 82 | -------------------------------------------------------------------------------- /docs/source/reference/core.rst: -------------------------------------------------------------------------------- 1 | .. _api-core: 2 | 3 | Core 4 | ==== 5 | 6 | Schemas 7 | ------- 8 | 9 | .. autosummary:: 10 | :toctree: generated 11 | :template: class.rst 12 | :nosignatures: 13 | 14 | pandera.api.pandas.container.DataFrameSchema 15 | pandera.api.pandas.array.SeriesSchema 16 | pandera.api.polars.container.DataFrameSchema 17 | pandera.api.pyspark.container.DataFrameSchema 18 | pandera.api.dataframe.container.DataFrameSchema 19 | 20 | Schema Components 21 | ----------------- 22 | 23 | .. autosummary:: 24 | :toctree: generated 25 | :template: class.rst 26 | :nosignatures: 27 | 28 | pandera.api.pandas.components.Column 29 | pandera.api.pandas.components.Index 30 | pandera.api.pandas.components.MultiIndex 31 | pandera.api.polars.components.Column 32 | pandera.api.pyspark.components.Column 33 | pandera.api.dataframe.components.ComponentSchema 34 | 35 | Checks 36 | ------ 37 | 38 | .. 
autosummary:: 39 | :toctree: generated 40 | :template: class.rst 41 | :nosignatures: 42 | 43 | pandera.api.checks.Check 44 | pandera.api.hypotheses.Hypothesis 45 | 46 | Data Objects 47 | ------------ 48 | 49 | .. autosummary:: 50 | :toctree: generated 51 | :template: class.rst 52 | :nosignatures: 53 | 54 | pandera.api.polars.types.PolarsData 55 | pandera.api.pyspark.types.PysparkDataframeColumnObject 56 | 57 | Configuration 58 | ------------- 59 | 60 | .. autosummary:: 61 | :toctree: generated 62 | :template: class.rst 63 | :nosignatures: 64 | 65 | pandera.config.PanderaConfig 66 | pandera.config.ValidationDepth 67 | pandera.config.ValidationScope 68 | pandera.config.config_context 69 | pandera.config.get_config_context 70 | -------------------------------------------------------------------------------- /docs/source/reference/dataframe_models.rst: -------------------------------------------------------------------------------- 1 | .. _api-dataframe-models: 2 | 3 | DataFrame Models 4 | ================ 5 | 6 | DataFrame Model 7 | --------------- 8 | 9 | .. autosummary:: 10 | :toctree: generated 11 | :template: class.rst 12 | 13 | pandera.api.pandas.model.DataFrameModel 14 | pandera.api.polars.model.DataFrameModel 15 | pandera.api.pyspark.model.DataFrameModel 16 | pandera.api.dataframe.model.DataFrameModel 17 | 18 | Model Components 19 | ---------------- 20 | 21 | .. autosummary:: 22 | :toctree: generated 23 | 24 | pandera.api.dataframe.model_components.Field 25 | pandera.api.dataframe.model_components.check 26 | pandera.api.dataframe.model_components.dataframe_check 27 | pandera.api.dataframe.model_components.parser 28 | pandera.api.dataframe.model_components.dataframe_parser 29 | 30 | 31 | Config 32 | ------ 33 | 34 | .. autosummary:: 35 | :toctree: generated 36 | :template: model_component_class.rst 37 | :nosignatures: 38 | 39 | pandera.api.pandas.model_config.BaseConfig 40 | pandera.api.polars.model_config.BaseConfig 41 | pandera.api.pyspark.model_config.BaseConfig 42 | 43 | 44 | Typing 45 | ------ 46 | 47 | Pandas 48 | ****** 49 | 50 | .. autosummary:: 51 | :toctree: generated 52 | :template: class.rst 53 | 54 | pandera.typing.DataFrame 55 | pandera.typing.Series 56 | pandera.typing.Index 57 | 58 | Geopandas 59 | ********* 60 | 61 | .. autosummary:: 62 | :toctree: generated 63 | :template: class.rst 64 | 65 | pandera.typing.geopandas.GeoDataFrame 66 | pandera.typing.geopandas.GeoSeries 67 | 68 | Dask 69 | **** 70 | 71 | .. autosummary:: 72 | :toctree: generated 73 | :template: class.rst 74 | 75 | pandera.typing.dask.DataFrame 76 | pandera.typing.dask.Series 77 | pandera.typing.dask.Index 78 | 79 | Pyspark 80 | ******* 81 | 82 | .. autosummary:: 83 | :toctree: generated 84 | :template: class.rst 85 | 86 | pandera.typing.pyspark.DataFrame 87 | pandera.typing.pyspark.Series 88 | pandera.typing.pyspark.Index 89 | 90 | Modin 91 | ***** 92 | 93 | .. autosummary:: 94 | :toctree: generated 95 | :template: class.rst 96 | 97 | pandera.typing.modin.DataFrame 98 | pandera.typing.modin.Series 99 | pandera.typing.modin.Index 100 | 101 | FastAPI 102 | ******* 103 | 104 | .. autosummary:: 105 | :toctree: generated 106 | :template: class.rst 107 | 108 | pandera.typing.fastapi.UploadFile 109 | 110 | 111 | Serialization Formats 112 | ********************* 113 | 114 | .. 
autosummary:: 115 | :toctree: generated 116 | :template: class.rst 117 | 118 | pandera.typing.formats.Formats 119 | -------------------------------------------------------------------------------- /docs/source/reference/decorators.rst: -------------------------------------------------------------------------------- 1 | .. _api-decorators: 2 | 3 | Decorators 4 | ========== 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | pandera.decorators.check_input 11 | pandera.decorators.check_output 12 | pandera.decorators.check_io 13 | pandera.decorators.check_types 14 | -------------------------------------------------------------------------------- /docs/source/reference/errors.rst: -------------------------------------------------------------------------------- 1 | .. _api-errors: 2 | 3 | Errors 4 | ====== 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :template: class.rst 9 | :nosignatures: 10 | 11 | pandera.errors.SchemaError 12 | pandera.errors.SchemaErrors 13 | pandera.errors.SchemaInitError 14 | pandera.errors.SchemaDefinitionError 15 | -------------------------------------------------------------------------------- /docs/source/reference/extensions.rst: -------------------------------------------------------------------------------- 1 | .. _api-extensions: 2 | 3 | Extensions 4 | ========== 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :template: module.rst 9 | :nosignatures: 10 | 11 | pandera.extensions 12 | -------------------------------------------------------------------------------- /docs/source/reference/index.md: -------------------------------------------------------------------------------- 1 | % pandera package index documentation toctree 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: pandera 5 | ``` 6 | 7 | # API 8 | 9 | ```{eval-rst} 10 | .. list-table:: 11 | :widths: 30 70 12 | 13 | * - :ref:`Core ` 14 | - The core objects for defining pandera schemas 15 | * - :ref:`Data Types ` 16 | - Data types for type checking and coercion. 17 | * - :ref:`DataFrame Models ` 18 | - Alternative class-based API for defining types for tabular/array-like data. 19 | * - :ref:`Decorators ` 20 | - Decorators for integrating pandera schemas with python functions. 21 | * - :ref:`Schema Inference ` 22 | - Bootstrap schemas from real data 23 | * - :ref:`IO Utilities ` 24 | - Utility functions for reading/writing schemas 25 | * - :ref:`Data Synthesis Strategies ` 26 | - Module of functions for generating data from schemas. 27 | * - :ref:`Extensions ` 28 | - Utility functions for extending pandera functionality 29 | * - :ref:`Errors ` 30 | - Pandera-specific exceptions 31 | ``` 32 | 33 | ```{toctree} 34 | :hidden: true 35 | 36 | core 37 | dtypes 38 | dataframe_models 39 | decorators 40 | schema_inference 41 | io 42 | strategies 43 | extensions 44 | errors 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/source/reference/io.rst: -------------------------------------------------------------------------------- 1 | .. _api-io-utils: 2 | 3 | IO Utilities 4 | ============ 5 | 6 | The ``io`` module and built-in ``Hypothesis`` checks require a pandera 7 | installation with the corresponding extension, see the 8 | :ref:`installation` instructions for more details. 9 | 10 | .. 
autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | 14 | pandera.io.from_yaml 15 | pandera.io.to_yaml 16 | pandera.io.to_script 17 | -------------------------------------------------------------------------------- /docs/source/reference/schema_inference.rst: -------------------------------------------------------------------------------- 1 | .. _api-schema-inference: 2 | 3 | Schema Inference 4 | ================ 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | pandera.schema_inference.pandas.infer_schema 11 | -------------------------------------------------------------------------------- /docs/source/reference/strategies.rst: -------------------------------------------------------------------------------- 1 | .. _api-strategies: 2 | 3 | Data Synthesis Strategies 4 | ========================= 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :template: strategies_module.rst 9 | :nosignatures: 10 | 11 | pandera.strategies.pandas_strategies 12 | -------------------------------------------------------------------------------- /docs/source/series_schemas.md: -------------------------------------------------------------------------------- 1 | --- 2 | file_format: mystnb 3 | --- 4 | 5 | % pandera documentation for seriesschemas 6 | 7 | ```{currentmodule} pandera 8 | ``` 9 | 10 | (seriesschemas)= 11 | 12 | # Series Schemas 13 | 14 | The {class}`~pandera.api.pandas.array.SeriesSchema` class allows for the validation of pandas 15 | `Series` objects, and are very similar to {ref}`columns` and 16 | {ref}`indexes` described in {ref}`DataFrameSchemas`. 17 | 18 | ```{code-cell} python 19 | import pandas as pd 20 | import pandera.pandas as pa 21 | 22 | schema = pa.SeriesSchema( 23 | str, 24 | checks=[ 25 | pa.Check(lambda s: s.str.startswith("foo")), 26 | pa.Check(lambda s: s.str.endswith("bar")), 27 | pa.Check(lambda x: len(x) > 3, element_wise=True) 28 | ], 29 | nullable=False, 30 | unique=False, 31 | name="my_series") 32 | 33 | validated_series = schema.validate( 34 | pd.Series(["foobar", "foobar", "foobar"], name="my_series") 35 | ) 36 | 37 | validated_series 38 | ``` 39 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pandera-dev 2 | channels: 3 | - conda-forge 4 | 5 | dependencies: 6 | # environment management 7 | - pip 8 | 9 | # pandera dependencies 10 | - packaging >= 20.0 11 | - typing_extensions 12 | - hypothesis >= 6.92.7 13 | - pyyaml >= 5.1 14 | - typing_inspect >= 0.6.0 15 | - frictionless <= 4.40.8 # v5.* introduces breaking changes 16 | - pyarrow 17 | - pydantic 18 | 19 | # hypotheses extra 20 | - scipy 21 | 22 | # mypy extra 23 | - pandas-stubs 24 | 25 | # pyspark extra 26 | - pyspark[connect] >= 3.2.0, < 4.0.0 27 | 28 | # polars extra 29 | - polars >= 0.20.0 30 | 31 | # modin extra 32 | - modin 33 | - protobuf 34 | 35 | # geopandas extra 36 | - geopandas 37 | - shapely 38 | 39 | # fastapi extra 40 | - fastapi 41 | 42 | # testing and dependencies 43 | - black >= 24.0 44 | 45 | # testing 46 | - numpy >= 1.24.4 47 | - pandas >= 2.1.1 48 | - isort >= 5.7.0 49 | - joblib 50 | - mypy = 1.10.0 51 | - pylint < 3.3 52 | - pytest 53 | - pytest-cov 54 | - pytest-xdist 55 | - pytest-asyncio 56 | - pytz 57 | - xdoctest 58 | - nox 59 | - uv 60 | - setuptools # required in noxfile and not automatically provided by python >= 3.12 61 | 62 | # fastapi testing 63 | - uvicorn 64 | - python-multipart 65 | 66 | # 
documentation 67 | - sphinx 68 | - sphinx-design 69 | - sphinx-autodoc-typehints <= 1.14.1 70 | - sphinx-copybutton 71 | - recommonmark 72 | - myst-nb 73 | 74 | # packaging 75 | - twine 76 | 77 | # performance testing 78 | - asv >= 0.5.1 79 | 80 | # optional 81 | - pre_commit 82 | 83 | - pip: 84 | # dask extra 85 | - dask[dataframe] 86 | - distributed 87 | 88 | # docs 89 | - furo 90 | - sphinx-docsearch 91 | - grpcio 92 | - ray 93 | - typeguard 94 | - types-click 95 | - types-pytz 96 | - types-pyyaml 97 | - types-requests 98 | - types-setuptools 99 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | disable_error_code =annotation-unchecked 3 | ignore_missing_imports = True 4 | follow_imports = normal 5 | allow_redefinition = True 6 | warn_return_any = False 7 | warn_unused_configs = True 8 | show_error_codes = True 9 | exclude=(?x)( 10 | ^tests/mypy/pandas_modules 11 | | ^pandera/engines/pyspark_engine 12 | | ^pandera/api/pyspark 13 | | ^pandera/backends/pyspark 14 | | ^tests/pyspark 15 | ) 16 | [mypy-pandera.api.pyspark.*] 17 | follow_imports = skip 18 | 19 | [mypy-docs.*] 20 | follow_imports = skip 21 | -------------------------------------------------------------------------------- /pandera/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position 2 | """A flexible and expressive dataframe validation library.""" 3 | 4 | from pandera._version import __version__ 5 | 6 | 7 | _warning_msg = """Pandas and numpy have been removed from the base pandera 8 | dependencies. Please install pandas as part of your environment's 9 | dependencies or install the pandas extra with: 10 | 11 | ```bash 12 | pip install pandas pandera 13 | 14 | # or 15 | pip install 'pandera[pandas]' 16 | ``` 17 | """ 18 | 19 | 20 | try: 21 | # Only add pandas to the top-level pandera namespace 22 | # if pandas and numpy are installed 23 | import pandas as pd 24 | import numpy as np 25 | 26 | from pandera._pandas_deprecated import * 27 | from pandera._pandas_deprecated import __all__ as _pandas_deprecated_all 28 | from pandera import dtypes 29 | from pandera import typing 30 | 31 | __all__ = [ 32 | "__version__", 33 | *_pandas_deprecated_all, 34 | ] 35 | 36 | except ImportError as err: 37 | import warnings 38 | 39 | if "pandas" in str(err) or "numpy" in str(err): 40 | warnings.warn(_warning_msg, UserWarning) 41 | else: 42 | raise # Re-raise any other `ImportError` exceptions 43 | 44 | from pandera import dtypes 45 | from pandera import typing 46 | from pandera.api.checks import Check 47 | from pandera.api.dataframe.model_components import ( 48 | Field, 49 | check, 50 | dataframe_check, 51 | dataframe_parser, 52 | parser, 53 | ) 54 | 55 | __all__ = [ 56 | "__version__", 57 | "Check", 58 | "Field", 59 | "check", 60 | "dataframe_check", 61 | "dataframe_parser", 62 | "parser", 63 | "dtypes", 64 | "typing", 65 | ] 66 | -------------------------------------------------------------------------------- /pandera/_patch_numpy2.py: -------------------------------------------------------------------------------- 1 | """Patch numpy 2 to prevent errors.""" 2 | 3 | from functools import lru_cache 4 | 5 | 6 | @lru_cache 7 | def _patch_numpy2(): 8 | """This is a temporary fix for numpy 2. 9 | 10 | pyspark uses np.NaN, which is deprecated in numpy 2. 
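    The patch below restores the removed aliases (``NaN``, ``string_``, ``float_``,
    ``unicode_``) by mapping them to their numpy 2 equivalents, and it only does so
    when numpy exposes the ``_expired_attrs_2_0`` machinery.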
11 | """ 12 | import numpy as np 13 | 14 | expired_attrs = getattr(np, "_expired_attrs_2_0", None) 15 | 16 | if expired_attrs: 17 | attrs_replacement = { 18 | "NaN": np.nan, 19 | "string_": np.bytes_, 20 | "float_": np.float64, 21 | "unicode_": np.str_, 22 | } 23 | for attr, replacement in attrs_replacement.items(): 24 | has_attr = expired_attrs.__expired_attributes__.pop(attr, None) 25 | if has_attr: 26 | setattr(np, attr, replacement) 27 | -------------------------------------------------------------------------------- /pandera/accessors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/accessors/__init__.py -------------------------------------------------------------------------------- /pandera/accessors/dask_accessor.py: -------------------------------------------------------------------------------- 1 | """Register dask accessor for pandera schema metadata.""" 2 | 3 | from dask.dataframe.extensions import ( 4 | register_dataframe_accessor, 5 | register_series_accessor, 6 | ) 7 | 8 | from pandera.accessors.pandas_accessor import ( 9 | PanderaDataFrameAccessor, 10 | PanderaSeriesAccessor, 11 | ) 12 | 13 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor) 14 | register_series_accessor("pandera")(PanderaSeriesAccessor) 15 | -------------------------------------------------------------------------------- /pandera/accessors/modin_accessor.py: -------------------------------------------------------------------------------- 1 | """Custom accessor functionality for modin. 2 | 3 | Source code adapted from pyspark.pandas implementation: 4 | https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/api/pyspark.pandas.extensions.register_dataframe_accessor.html?highlight=register_dataframe_accessor#pyspark.pandas.extensions.register_dataframe_accessor 5 | """ 6 | 7 | import warnings 8 | 9 | from pandera.accessors.pandas_accessor import ( 10 | PanderaDataFrameAccessor, 11 | PanderaSeriesAccessor, 12 | ) 13 | 14 | 15 | # pylint: disable=too-few-public-methods 16 | class CachedAccessor: 17 | """ 18 | Custom property-like object. 19 | 20 | A descriptor for caching accessors: 21 | 22 | :param name: Namespace that accessor's methods, properties, etc will be 23 | accessed under, e.g. "foo" for a dataframe accessor yields the accessor 24 | ``df.foo`` 25 | :param cls: Class with the extension methods. 26 | 27 | For accessor, the class's __init__ method assumes that you are registering 28 | an accessor for one of ``Series``, ``DataFrame``, or ``Index``. 29 | """ 30 | 31 | def __init__(self, name, accessor): 32 | self._name = name 33 | self._accessor = accessor 34 | 35 | def __get__(self, obj, cls): 36 | if obj is None: # pragma: no cover 37 | return self._accessor 38 | accessor_obj = self._accessor(obj) 39 | object.__setattr__(obj, self._name, accessor_obj) 40 | return accessor_obj 41 | 42 | 43 | def _register_accessor(name, cls): 44 | """ 45 | Register a custom accessor on {class} objects. 46 | 47 | :param name: Name under which the accessor should be registered. A warning 48 | is issued if this name conflicts with a preexisting attribute. 49 | :returns: A class decorator callable. 50 | """ 51 | 52 | def decorator(accessor): 53 | if hasattr(cls, name): 54 | msg = ( 55 | f"registration of accessor {accessor} under name '{name}' for " 56 | "type {cls.__name__} is overriding a preexisting attribute " 57 | "with the same name." 
58 | ) 59 | 60 | warnings.warn( 61 | msg, 62 | UserWarning, 63 | stacklevel=2, 64 | ) 65 | setattr(cls, name, CachedAccessor(name, accessor)) 66 | return accessor 67 | 68 | return decorator 69 | 70 | 71 | def register_dataframe_accessor(name): 72 | """ 73 | Register a custom accessor with a DataFrame 74 | 75 | :param name: name used when calling the accessor after its registered 76 | :returns: a class decorator callable. 77 | """ 78 | # pylint: disable=import-outside-toplevel 79 | from modin.pandas import DataFrame 80 | 81 | return _register_accessor(name, DataFrame) 82 | 83 | 84 | def register_series_accessor(name): 85 | """ 86 | Register a custom accessor with a Series object 87 | 88 | :param name: name used when calling the accessor after its registered 89 | :returns: a callable class decorator 90 | """ 91 | # pylint: disable=import-outside-toplevel 92 | from modin.pandas import Series 93 | 94 | return _register_accessor(name, Series) 95 | 96 | 97 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor) 98 | register_series_accessor("pandera")(PanderaSeriesAccessor) 99 | -------------------------------------------------------------------------------- /pandera/accessors/pandas_accessor.py: -------------------------------------------------------------------------------- 1 | """Register pandas accessor for pandera schema metadata.""" 2 | 3 | from typing import Optional, Union 4 | 5 | import pandas as pd 6 | 7 | from pandera.api.pandas.array import SeriesSchema 8 | from pandera.api.pandas.container import DataFrameSchema 9 | 10 | Schemas = Union[DataFrameSchema, SeriesSchema] 11 | 12 | 13 | class PanderaAccessor: 14 | """Pandera accessor for pandas object.""" 15 | 16 | def __init__(self, pandas_obj): 17 | """Initialize the pandera accessor.""" 18 | self._pandas_obj = pandas_obj 19 | self._schema: Optional[Schemas] = None 20 | 21 | @staticmethod 22 | def check_schema_type(schema: Schemas): 23 | """Abstract method for checking the schema type.""" 24 | raise NotImplementedError 25 | 26 | def add_schema(self, schema): 27 | """Add a schema to the pandas object.""" 28 | self.check_schema_type(schema) 29 | self._schema = schema 30 | return self._pandas_obj 31 | 32 | @property 33 | def schema(self) -> Optional[Schemas]: 34 | """Access schema metadata.""" 35 | return self._schema 36 | 37 | 38 | @pd.api.extensions.register_dataframe_accessor("pandera") 39 | class PanderaDataFrameAccessor(PanderaAccessor): 40 | """Pandera accessor for pandas DataFrame.""" 41 | 42 | @staticmethod 43 | def check_schema_type(schema): 44 | if not isinstance(schema, DataFrameSchema): 45 | raise TypeError( 46 | f"schema arg must be a {DataFrameSchema}, found {type(schema)}" 47 | ) 48 | 49 | 50 | @pd.api.extensions.register_series_accessor("pandera") 51 | class PanderaSeriesAccessor(PanderaAccessor): 52 | """Pandera accessor for pandas Series.""" 53 | 54 | @staticmethod 55 | def check_schema_type(schema): 56 | if not isinstance(schema, SeriesSchema): 57 | raise TypeError( 58 | f"schema arg must be a {SeriesSchema}, found {type(schema)}" 59 | ) 60 | -------------------------------------------------------------------------------- /pandera/accessors/polars_accessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/accessors/polars_accessor.py -------------------------------------------------------------------------------- /pandera/accessors/pyspark_accessor.py: 
-------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | # NOTE: skip file since py=3.10 yields these errors: 3 | # https://github.com/pandera-dev/pandera/runs/4998710717?check_suite_focus=true 4 | """Register pyspark accessor for pandera schema metadata.""" 5 | 6 | from pyspark.pandas.extensions import ( 7 | register_dataframe_accessor, 8 | register_series_accessor, 9 | ) 10 | 11 | from pandera.accessors.pandas_accessor import ( 12 | PanderaDataFrameAccessor, 13 | PanderaSeriesAccessor, 14 | ) 15 | 16 | register_dataframe_accessor("pandera")(PanderaDataFrameAccessor) 17 | register_series_accessor("pandera")(PanderaSeriesAccessor) 18 | -------------------------------------------------------------------------------- /pandera/api/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandera api package. 2 | 3 | This package contains the public-facing api schema specifications for all 4 | supported data objects. 5 | """ 6 | -------------------------------------------------------------------------------- /pandera/api/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/base/__init__.py -------------------------------------------------------------------------------- /pandera/api/base/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration.""" 2 | 3 | from typing import Any, Optional 4 | 5 | 6 | class BaseModelConfig: # pylint:disable=R0903 7 | """Model configuration base class.""" 8 | 9 | #: datatype of the data container. This overrides the data types specified 10 | #: in any of the fields. 
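    #: Backend-specific subclasses narrow this annotation to their own dtype
    #: input types (e.g. ``PandasDtypeInputTypes``, ``PolarsDtypeInputTypes``,
    #: ``PySparkDtypeInputTypes``).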
11 | dtype: Optional[Any] = None 12 | 13 | name: Optional[str] = None #: name of schema 14 | title: Optional[str] = None #: human-readable label for schema 15 | description: Optional[str] = None #: arbitrary textual description 16 | coerce: bool = False #: coerce types of all schema components 17 | -------------------------------------------------------------------------------- /pandera/api/base/parsers.py: -------------------------------------------------------------------------------- 1 | """Data validation base parse.""" 2 | 3 | import inspect 4 | from typing import Any, Dict, NamedTuple, Optional, Tuple, Type 5 | 6 | from pandera.backends.base import BaseParserBackend 7 | 8 | 9 | class ParserResult(NamedTuple): 10 | """Parser result for user-defined parsers.""" 11 | 12 | parser_output: Any 13 | parsed_object: Any 14 | 15 | 16 | class MetaParser(type): 17 | """Parser metaclass.""" 18 | 19 | BACKEND_REGISTRY: Dict[Tuple[Type, Type], Type[BaseParserBackend]] = {} 20 | """Registry of parser backends implemented for specific data objects.""" 21 | 22 | 23 | class BaseParser(metaclass=MetaParser): 24 | """Parser base class.""" 25 | 26 | def __init__(self, name: Optional[str] = None): 27 | self.name = name 28 | 29 | @classmethod 30 | def register_backend(cls, type_: Type, backend: Type[BaseParserBackend]): 31 | """Register a backend for the specified type.""" 32 | cls.BACKEND_REGISTRY[(cls, type_)] = backend 33 | 34 | @classmethod 35 | def get_backend(cls, parse_obj: Any) -> Type[BaseParserBackend]: 36 | """Get the backend associated with the type of ``parse_obj`` .""" 37 | 38 | parse_obj_cls = type(parse_obj) 39 | classes = inspect.getmro(parse_obj_cls) 40 | for _class in classes: 41 | try: 42 | return cls.BACKEND_REGISTRY[(cls, _class)] 43 | except KeyError: 44 | pass 45 | raise KeyError( 46 | f"Backend not found for class: {parse_obj_cls}. 
Looked up the " 47 | f"following base classes: {classes}" 48 | ) 49 | 50 | def __eq__(self, other: object) -> bool: 51 | if not isinstance(other, type(self)): 52 | return NotImplemented 53 | 54 | are_parser_fn_objects_equal = ( 55 | self._get_parser_fn_code() == other._get_parser_fn_code() 56 | ) 57 | 58 | are_all_other_parser_attributes_equal = { 59 | k: v for k, v in self.__dict__.items() if k != "_parser_fn" 60 | } == {k: v for k, v in other.__dict__.items() if k != "_parser_fn"} 61 | 62 | return ( 63 | are_parser_fn_objects_equal 64 | and are_all_other_parser_attributes_equal 65 | ) 66 | 67 | def _get_parser_fn_code(self): 68 | parser_fn = self.__dict__["_parser_fn"] 69 | code = parser_fn.__code__.co_code 70 | 71 | return code 72 | 73 | def __repr__(self) -> str: 74 | return f"" 75 | -------------------------------------------------------------------------------- /pandera/api/base/types.py: -------------------------------------------------------------------------------- 1 | """Base type definitions for pandera.""" 2 | 3 | from typing import List, Union 4 | 5 | from pandera.api.checks import Check 6 | from pandera.api.hypotheses import Hypothesis 7 | from pandera.api.parsers import Parser 8 | 9 | try: 10 | # python 3.8+ 11 | from typing import Literal # type: ignore[attr-defined] 12 | except ImportError: # pragma: no cover 13 | from typing_extensions import Literal # type: ignore[assignment] 14 | 15 | 16 | StrictType = Union[bool, Literal["filter"]] 17 | CheckList = Union[Check, List[Union[Check, Hypothesis]]] 18 | ParserList = Union[Parser, List[Parser]] 19 | -------------------------------------------------------------------------------- /pandera/api/dataframe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/dataframe/__init__.py -------------------------------------------------------------------------------- /pandera/api/function_dispatch.py: -------------------------------------------------------------------------------- 1 | """Multidispatcher implementation.""" 2 | 3 | from inspect import signature 4 | from typing import Callable, Dict, Tuple, Type, Union 5 | import typing_inspect 6 | 7 | 8 | class Dispatcher: 9 | """Dispatch implementation.""" 10 | 11 | def __init__(self): 12 | self._function_registry: Dict[Type, Callable] = {} 13 | self._name = None 14 | 15 | def register(self, fn): 16 | # Get function signature 17 | self._name = fn.__name__ 18 | data_types = get_first_arg_type(fn) 19 | for data_type in data_types: 20 | self._function_registry[data_type] = fn 21 | 22 | def __call__(self, *args, **kwargs): 23 | input_data_type = type(args[0]) 24 | fn = self._function_registry[input_data_type] 25 | return fn(*args, **kwargs) 26 | 27 | @property 28 | def co_code(self): 29 | """Method for getting bytecode of all the registered functions.""" 30 | _code = b"" 31 | for fn in self._function_registry.values(): 32 | _code += fn.__code__.co_code 33 | return _code 34 | 35 | @property 36 | def __name__(self): 37 | return f"{self._name}" 38 | 39 | def __str__(self): 40 | return f"{self._name}" 41 | 42 | def __repr__(self): 43 | return f"{self._name}" 44 | 45 | 46 | def get_first_arg_type(fn): 47 | fn_sig = signature(fn) 48 | 49 | # register the check strategy for this particular check, identified 50 | # by the check `name`, and the data type of the check function. This 51 | # supports Union types. 
Also assume that the data type of the data 52 | # object to validate is the first argument. 53 | data_type = [*fn_sig.parameters.values()][0].annotation 54 | 55 | if typing_inspect.get_origin(data_type) in (tuple, Tuple): 56 | data_type, *_ = typing_inspect.get_args(data_type) 57 | 58 | if typing_inspect.get_origin(data_type) is Union: 59 | data_types = typing_inspect.get_args(data_type) 60 | else: 61 | data_types = (data_type,) 62 | 63 | return data_types 64 | -------------------------------------------------------------------------------- /pandera/api/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandas core.""" 2 | -------------------------------------------------------------------------------- /pandera/api/pandas/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration for pandas.""" 2 | 3 | from typing import Optional 4 | 5 | from pandera.api.dataframe.model_config import BaseConfig as _BaseConfig 6 | from pandera.api.pandas.types import PandasDtypeInputTypes 7 | 8 | 9 | class BaseConfig(_BaseConfig): # pylint:disable=R0903 10 | """Define pandas DataFrameSchema-wide options.""" 11 | 12 | #: datatype of the dataframe. This overrides the data types specified in 13 | #: any of the fields. 14 | dtype: Optional[PandasDtypeInputTypes] = None 15 | -------------------------------------------------------------------------------- /pandera/api/polars/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/api/polars/__init__.py -------------------------------------------------------------------------------- /pandera/api/polars/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration for pandas.""" 2 | 3 | from typing import Optional 4 | 5 | from pandera.api.dataframe.model_config import BaseConfig as _BaseConfig 6 | from pandera.api.polars.types import PolarsDtypeInputTypes 7 | 8 | 9 | class BaseConfig(_BaseConfig): # pylint:disable=R0903 10 | """Define polars DataFrameSchema-wide options.""" 11 | 12 | #: datatype of the dataframe. This overrides the data types specified in 13 | #: any of the fields. 
14 | dtype: Optional[PolarsDtypeInputTypes] = None 15 | -------------------------------------------------------------------------------- /pandera/api/polars/types.py: -------------------------------------------------------------------------------- 1 | """Polars types.""" 2 | 3 | from typing import NamedTuple, Union, TypeVar 4 | 5 | import polars as pl 6 | 7 | 8 | class PolarsData(NamedTuple): 9 | lazyframe: pl.LazyFrame 10 | key: str = "*" 11 | 12 | 13 | class CheckResult(NamedTuple): 14 | """Check result for user-defined checks.""" 15 | 16 | check_output: pl.LazyFrame 17 | check_passed: pl.LazyFrame 18 | checked_object: pl.LazyFrame 19 | failure_cases: pl.LazyFrame 20 | 21 | 22 | PolarsCheckObjects = Union[pl.LazyFrame, pl.DataFrame] 23 | PolarsFrame = TypeVar("PolarsFrame", pl.LazyFrame, pl.DataFrame) 24 | 25 | PolarsDtypeInputTypes = Union[ 26 | str, 27 | type, 28 | pl.datatypes.classes.DataTypeClass, 29 | ] 30 | -------------------------------------------------------------------------------- /pandera/api/polars/utils.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=cyclic-import 2 | """Polars validation engine utilities.""" 3 | 4 | from typing import Dict, List 5 | 6 | import polars as pl 7 | 8 | from pandera.api.polars.types import PolarsCheckObjects 9 | from pandera.engines.polars_engine import polars_version 10 | from pandera.config import ( 11 | ValidationDepth, 12 | get_config_context, 13 | get_config_global, 14 | ) 15 | 16 | 17 | def get_lazyframe_schema(lf: pl.LazyFrame) -> Dict[str, pl.DataType]: 18 | """Get a dict of column names and dtypes from a polars LazyFrame.""" 19 | if polars_version().release >= (1, 0, 0): 20 | return lf.collect_schema() 21 | return lf.schema 22 | 23 | 24 | def get_lazyframe_column_dtypes(lf: pl.LazyFrame) -> List[pl.DataType]: 25 | """Get a list of column dtypes from a polars LazyFrame.""" 26 | if polars_version().release >= (1, 0, 0): 27 | return lf.collect_schema().dtypes() 28 | return [*lf.schema.values()] 29 | 30 | 31 | def get_lazyframe_column_names(lf: pl.LazyFrame) -> List[str]: 32 | """Get a list of column names from a polars LazyFrame.""" 33 | if polars_version().release >= (1, 0, 0): 34 | return lf.collect_schema().names() 35 | return lf.columns 36 | 37 | 38 | def get_validation_depth(check_obj: PolarsCheckObjects) -> ValidationDepth: 39 | """Get validation depth for a given polars check object.""" 40 | is_dataframe = isinstance(check_obj, pl.DataFrame) 41 | 42 | config_global = get_config_global() 43 | config_ctx = get_config_context(validation_depth_default=None) 44 | 45 | if config_ctx.validation_depth is not None: 46 | # use context configuration if specified 47 | return config_ctx.validation_depth 48 | 49 | if config_global.validation_depth is not None: 50 | # use global configuration if specified 51 | return config_global.validation_depth 52 | 53 | if ( 54 | isinstance(check_obj, pl.LazyFrame) 55 | and config_global.validation_depth is None 56 | ): 57 | # if global validation depth is not set, use schema only validation 58 | # when validating LazyFrames 59 | validation_depth = ValidationDepth.SCHEMA_ONLY 60 | elif is_dataframe and ( 61 | config_ctx.validation_depth is None 62 | or config_ctx.validation_depth is None 63 | ): 64 | # if context validation depth is not set, use schema and data validation 65 | # when validating DataFrames 66 | validation_depth = ValidationDepth.SCHEMA_AND_DATA 67 | else: 68 | validation_depth = ValidationDepth.SCHEMA_ONLY 69 | 70 | return 
validation_depth 71 | -------------------------------------------------------------------------------- /pandera/api/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | """PySpark native core.""" 2 | 3 | from pandera.api.pyspark.components import Column 4 | from pandera.api.pyspark.container import DataFrameSchema 5 | -------------------------------------------------------------------------------- /pandera/api/pyspark/model_config.py: -------------------------------------------------------------------------------- 1 | """Class-based dataframe model API configuration for pyspark.""" 2 | 3 | from typing import Any, Callable, Dict, List, Optional, Union 4 | 5 | from pandera.api.base.model_config import BaseModelConfig 6 | from pandera.api.base.types import StrictType 7 | from pandera.api.pyspark.types import PySparkDtypeInputTypes 8 | from pandera.typing.formats import Format 9 | 10 | 11 | class BaseConfig(BaseModelConfig): # pylint:disable=R0903 12 | """Define DataFrameSchema-wide options. 13 | 14 | *new in 0.16.0* 15 | """ 16 | 17 | #: datatype of the dataframe. This overrides the data types specified in 18 | #: any of the fields. 19 | dtype: Optional[PySparkDtypeInputTypes] = None 20 | 21 | name: Optional[str] = None #: name of schema 22 | title: Optional[str] = None #: human-readable label for schema 23 | description: Optional[str] = None #: arbitrary textual description 24 | coerce: bool = False #: coerce types of all schema components 25 | 26 | #: make sure certain column combinations are unique 27 | unique: Optional[Union[str, List[str]]] = None 28 | 29 | #: make sure all specified columns are in the validated dataframe - 30 | #: if ``"filter"``, removes columns not specified in the schema 31 | strict: StrictType = False 32 | 33 | ordered: bool = False #: validate columns order 34 | 35 | #: make sure dataframe column names are unique 36 | unique_column_names: bool = False 37 | 38 | #: data format before validation. This option only applies to 39 | #: schemas used in the context of the pandera type constructor 40 | #: ``pa.typing.DataFrame[Schema](data)``. If None, assumes a data structure 41 | #: compatible with the ``pyspark.sql.DataFrame`` constructor. 42 | from_format: Optional[Union[Format, Callable]] = None 43 | 44 | #: a dictionary keyword arguments to pass into the reader function that 45 | #: converts the object of type ``from_format`` to a pandera-validate-able 46 | #: data structure. The reader function is implemented in the pandera.typing 47 | #: generic types via the ``from_format`` and ``to_format`` methods. 48 | from_format_kwargs: Optional[Dict[str, Any]] = None 49 | 50 | #: data format to serialize into after validation. This option only applies 51 | #: to schemas used in the context of the pandera type constructor 52 | #: ``pa.typing.DataFrame[Schema](data)``. If None, returns a dataframe. 53 | to_format: Optional[Union[Format, Callable]] = None 54 | 55 | #: Buffer to be provided when to_format is a custom callable. See docs for 56 | #: example of how to implement an example of a to format function. 57 | to_format_buffer: Optional[Union[str, Callable]] = None 58 | 59 | #: a dictionary keyword arguments to pass into the writer function that 60 | #: converts the pandera-validate-able object to type ``to_format``. 61 | #: The writer function is implemented in the pandera.typing 62 | #: generic types via the ``from_format`` and ``to_format`` methods. 
63 | to_format_kwargs: Optional[Dict[str, Any]] = None 64 | 65 | #: a dictionary object to store key-value data at schema level 66 | metadata: Optional[dict] = None 67 | -------------------------------------------------------------------------------- /pandera/api/pyspark/types.py: -------------------------------------------------------------------------------- 1 | """Utility functions for pyspark validation.""" 2 | 3 | from functools import lru_cache 4 | from typing import List, NamedTuple, Tuple, Type, Union 5 | from numpy import bool_ as np_bool 6 | from packaging import version 7 | 8 | import pyspark.sql.types as pst 9 | from pyspark.sql import DataFrame 10 | 11 | import pyspark 12 | from pandera.api.checks import Check 13 | from pandera.dtypes import DataType 14 | 15 | # pylint: disable=reimported 16 | # Handles optional Spark Connect imports for pyspark>=3.4 (if available) 17 | if version.parse(pyspark.__version__) >= version.parse("3.4"): 18 | from pyspark.sql.connect.dataframe import DataFrame as psc_DataFrame 19 | from pyspark.sql.connect.group import GroupedData 20 | else: 21 | from pyspark.sql import ( 22 | DataFrame as psc_DataFrame, 23 | ) 24 | from pyspark.sql.group import GroupedData 25 | 26 | DataFrameTypes = Union[DataFrame, psc_DataFrame] 27 | GroupbyObject = GroupedData 28 | 29 | CheckList = Union[Check, List[Check]] 30 | 31 | PysparkDefaultTypes = Union[ 32 | pst.BooleanType, 33 | pst.StringType, 34 | pst.IntegerType, 35 | pst.DecimalType, 36 | pst.FloatType, 37 | pst.DateType, 38 | pst.TimestampType, 39 | pst.DoubleType, 40 | pst.ShortType, 41 | pst.ByteType, 42 | pst.LongType, 43 | pst.BinaryType, 44 | ] 45 | 46 | PySparkDtypeInputTypes = Union[ 47 | str, 48 | int, 49 | float, 50 | bool, 51 | type, 52 | DataType, 53 | Type, 54 | pst.BooleanType, 55 | pst.StringType, 56 | pst.IntegerType, 57 | pst.DecimalType, 58 | pst.FloatType, 59 | pst.DateType, 60 | pst.TimestampType, 61 | pst.DoubleType, 62 | pst.ShortType, 63 | pst.ByteType, 64 | pst.LongType, 65 | pst.BinaryType, 66 | ] 67 | 68 | 69 | class SupportedTypes(NamedTuple): 70 | table_types: Tuple[type, ...] 71 | 72 | 73 | class PysparkDataframeColumnObject(NamedTuple): 74 | """Pyspark Object which holds dataframe and column value in a named tuble""" 75 | 76 | dataframe: DataFrameTypes 77 | column_name: str 78 | 79 | 80 | @lru_cache(maxsize=None) 81 | def supported_types() -> SupportedTypes: 82 | """Get the types supported by pandera schemas.""" 83 | # pylint: disable=import-outside-toplevel 84 | table_types = [DataFrame] 85 | 86 | try: 87 | table_types.append(DataFrame) 88 | table_types.append(psc_DataFrame) 89 | 90 | except ImportError: # pragma: no cover 91 | pass 92 | 93 | return SupportedTypes( 94 | tuple(table_types), 95 | ) 96 | 97 | 98 | def is_table(obj): 99 | """Verifies whether an object is table-like. 100 | 101 | Where a table is a 2-dimensional data matrix of rows and columns, which 102 | can be indexed in multiple different ways. 
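    For this pyspark backend, "table-like" currently means a native
    ``pyspark.sql.DataFrame`` or, when Spark Connect is available, a
    ``pyspark.sql.connect.dataframe.DataFrame`` (see ``supported_types``).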
103 | """ 104 | return isinstance(obj, supported_types().table_types) 105 | 106 | 107 | def is_bool(x): 108 | """Verifies whether an object is a boolean type.""" 109 | return isinstance(x, (bool, type(pst.BooleanType()), np_bool)) 110 | -------------------------------------------------------------------------------- /pandera/backends/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandera backends.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/base/builtin_checks.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring 2 | """Built-in check functions base implementation. 3 | 4 | This module contains check function abstract definitions that correspond to 5 | the pandera.api.base.checks.Check methods. These functions do not actually 6 | implement any validation logic and serve as the entrypoint for dispatching 7 | specific implementations based on the data object type, e.g. 8 | `pandas.DataFrame`s. 9 | """ 10 | 11 | import re 12 | from typing import Any, Iterable, Optional, TypeVar, Union 13 | 14 | from pandera.api.checks import Check 15 | 16 | T = TypeVar("T") 17 | 18 | 19 | @Check.register_builtin_check_fn 20 | def equal_to(data: Any, value: Any) -> Any: 21 | raise NotImplementedError 22 | 23 | 24 | @Check.register_builtin_check_fn 25 | def not_equal_to(data: Any, value: Any) -> Any: 26 | raise NotImplementedError 27 | 28 | 29 | @Check.register_builtin_check_fn 30 | def greater_than(data: Any, min_value: Any) -> Any: 31 | raise NotImplementedError 32 | 33 | 34 | @Check.register_builtin_check_fn 35 | def greater_than_or_equal_to(data: Any, min_value: Any) -> Any: 36 | raise NotImplementedError 37 | 38 | 39 | @Check.register_builtin_check_fn 40 | def less_than(data: Any, max_value: Any) -> Any: 41 | raise NotImplementedError 42 | 43 | 44 | @Check.register_builtin_check_fn 45 | def less_than_or_equal_to(data: Any, max_value: Any) -> Any: 46 | raise NotImplementedError 47 | 48 | 49 | @Check.register_builtin_check_fn 50 | def in_range( 51 | data: Any, 52 | min_value: T, 53 | max_value: T, 54 | include_min: bool = True, 55 | include_max: bool = True, 56 | ) -> Any: 57 | raise NotImplementedError 58 | 59 | 60 | @Check.register_builtin_check_fn 61 | def isin(data: Any, allowed_values: Iterable) -> Any: 62 | raise NotImplementedError 63 | 64 | 65 | @Check.register_builtin_check_fn 66 | def notin(data: Any, forbidden_values: Iterable) -> Any: 67 | raise NotImplementedError 68 | 69 | 70 | @Check.register_builtin_check_fn 71 | def str_matches(data: Any, pattern: Union[str, re.Pattern]) -> Any: 72 | raise NotImplementedError 73 | 74 | 75 | @Check.register_builtin_check_fn 76 | def str_contains(data: Any, pattern: Union[str, re.Pattern]) -> Any: 77 | raise NotImplementedError 78 | 79 | 80 | @Check.register_builtin_check_fn 81 | def str_startswith(data: Any, string: str) -> Any: 82 | raise NotImplementedError 83 | 84 | 85 | @Check.register_builtin_check_fn 86 | def str_endswith(data: Any, string: str) -> Any: 87 | raise NotImplementedError 88 | 89 | 90 | @Check.register_builtin_check_fn 91 | def str_length( 92 | data: Any, 93 | min_value: Optional[int] = None, 94 | max_value: Optional[int] = None, 95 | ) -> Any: 96 | raise NotImplementedError 97 | 98 | 99 | @Check.register_builtin_check_fn 100 | def unique_values_eq(data: Any, values: Iterable) -> Any: 101 | raise NotImplementedError 102 | 
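# Note: the stubs in this module only declare the built-in check signatures.
# Concrete backends supply the actual implementations for their data
# containers (e.g. ``pandera/backends/pandas/builtin_checks.py`` for pandas
# objects), analogous to how ``backends/pandas/builtin_hypotheses.py``
# registers hypothesis implementations with ``register_builtin_hypothesis``.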
-------------------------------------------------------------------------------- /pandera/backends/base/builtin_hypotheses.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring 2 | """Built-in hypothesis functions base implementation. 3 | 4 | This module contains hypothesis function abstract definitions that 5 | correspond to the pandera.api.base.checks.Check methods. These functions do not 6 | actually implement any validation logic and serve as the entrypoint for 7 | dispatching specific implementations based on the data object type, e.g. 8 | `pandas.DataFrame`s. 9 | """ 10 | 11 | from typing import Any, Tuple 12 | 13 | from pandera.api.hypotheses import Hypothesis 14 | 15 | 16 | @Hypothesis.register_builtin_check_fn 17 | def two_sample_ttest( 18 | *samples: Tuple[Any, ...], 19 | equal_var: bool = True, 20 | nan_policy: str = "propagate", 21 | ): 22 | raise NotImplementedError 23 | 24 | 25 | @Hypothesis.register_builtin_check_fn 26 | def one_sample_ttest( 27 | *samples: Tuple[Any, ...], 28 | popmean: float, 29 | nan_policy: str = "propagate", 30 | ): 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /pandera/backends/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandas backend implementation for schemas and checks.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/pandas/builtin_hypotheses.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring 2 | """Pandas implementation of built-in hypotheses.""" 3 | 4 | from typing import Tuple 5 | 6 | from pandera.api.extensions import register_builtin_hypothesis 7 | from pandera.backends.pandas.builtin_checks import PandasData 8 | 9 | 10 | @register_builtin_hypothesis( 11 | error="failed two sample ttest between '{sample1}' and '{sample2}'", 12 | samples_kwtypes={"sample1": str, "sample2": str}, 13 | ) 14 | def two_sample_ttest( 15 | *samples: Tuple[PandasData, ...], 16 | equal_var: bool = True, 17 | nan_policy: str = "propagate", 18 | ) -> Tuple[float, float]: 19 | from scipy import stats # pylint: disable=import-outside-toplevel 20 | 21 | assert ( 22 | len(samples) == 2 23 | ), "Expected two sample ttest data to contain exactly two samples" 24 | return stats.ttest_ind( 25 | samples[0], 26 | samples[1], 27 | equal_var=equal_var, 28 | nan_policy=nan_policy, 29 | ) 30 | 31 | 32 | @register_builtin_hypothesis( 33 | error="failed one sample ttest for column '{sample}'", 34 | samples_kwtypes={"sample": str}, 35 | ) 36 | def one_sample_ttest( 37 | *samples: Tuple[PandasData, ...], 38 | popmean: float, 39 | nan_policy: str = "propagate", 40 | ) -> Tuple[float, float]: 41 | from scipy import stats # pylint: disable=import-outside-toplevel 42 | 43 | assert ( 44 | len(samples) == 1 45 | ), "Expected one sample ttest data to contain only one sample" 46 | return stats.ttest_1samp( 47 | samples[0], popmean=popmean, nan_policy=nan_policy 48 | ) 49 | -------------------------------------------------------------------------------- /pandera/backends/pandas/parsers.py: -------------------------------------------------------------------------------- 1 | """Parser backend for pandas""" 2 | 3 | from functools import partial 4 | from typing import Dict, Optional, Union 5 | 6 | import pandas as pd 7 | 8 | from 
pandera.api.base.parsers import ParserResult 9 | from pandera.api.pandas.types import is_field, is_table 10 | from pandera.api.parsers import Parser 11 | from pandera.backends.base import BaseParserBackend 12 | 13 | 14 | class PandasParserBackend(BaseParserBackend): 15 | """Parser backend of pandas.""" 16 | 17 | def __init__(self, parser: Parser): 18 | """Initializes a parser backend object.""" 19 | super().__init__(parser) 20 | assert parser._parser_fn is not None, "Parser._parser_fn must be set." 21 | self.parser = parser 22 | self.parser_fn = partial(parser._parser_fn, **parser._parser_kwargs) 23 | 24 | def preprocess( 25 | self, parse_obj, key 26 | ) -> pd.Series: # pylint:disable=unused-argument 27 | """Preprocesses a parser object before applying the parse function.""" 28 | if is_table(parse_obj) and key is not None: 29 | return self.preprocess_table_with_key(parse_obj, key) 30 | elif is_table(parse_obj) and key is None: 31 | return self.preprocess_table(parse_obj) 32 | else: 33 | return parse_obj 34 | 35 | def preprocess_table_with_key( 36 | self, 37 | parse_obj, 38 | key, 39 | ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: 40 | return parse_obj[key] 41 | 42 | def preprocess_table( 43 | self, parse_obj 44 | ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: 45 | return parse_obj 46 | 47 | def apply(self, parse_obj): 48 | """Apply the parse function to a parser object.""" 49 | if is_field(parse_obj): 50 | return self.apply_field(parse_obj) 51 | elif is_table(parse_obj): 52 | return self.apply_table(parse_obj) 53 | else: 54 | raise NotImplementedError 55 | 56 | def apply_field(self, parse_obj): 57 | if self.parser.element_wise: 58 | return parse_obj.map(self.parser_fn) 59 | return self.parser_fn(parse_obj) 60 | 61 | def apply_table(self, parse_obj): 62 | if self.parser.element_wise: 63 | return getattr(parse_obj, "map", parse_obj.applymap)( 64 | self.parser_fn 65 | ) 66 | return self.parser_fn(parse_obj) 67 | 68 | def postprocess( 69 | self, 70 | parse_obj, 71 | parser_output, 72 | ) -> ParserResult: 73 | """Postprocesses the result of applying the parser function.""" 74 | return ParserResult( 75 | parser_output=parser_output, parsed_object=parse_obj 76 | ) 77 | 78 | def __call__( 79 | self, 80 | parse_obj: Union[pd.Series, pd.DataFrame], 81 | key: Optional[str] = None, 82 | ): 83 | parse_obj = self.preprocess(parse_obj, key) 84 | parser_output = self.apply(parse_obj) 85 | return self.postprocess(parse_obj, parser_output) 86 | -------------------------------------------------------------------------------- /pandera/backends/pandas/register.py: -------------------------------------------------------------------------------- 1 | """Register pandas backends.""" 2 | 3 | from functools import lru_cache 4 | from typing import Optional 5 | 6 | from pandera.backends.pandas.array import SeriesSchemaBackend 7 | from pandera.backends.pandas.checks import PandasCheckBackend 8 | from pandera.backends.pandas.components import ( 9 | ColumnBackend, 10 | IndexBackend, 11 | MultiIndexBackend, 12 | ) 13 | from pandera.backends.pandas.container import DataFrameSchemaBackend 14 | from pandera.backends.pandas.hypotheses import PandasHypothesisBackend 15 | from pandera.backends.pandas.parsers import PandasParserBackend 16 | 17 | 18 | @lru_cache 19 | def register_pandas_backends( 20 | check_cls_fqn: Optional[str] = None, 21 | ): # pylint: disable=unused-argument 22 | """Register pandas backends. 23 | 24 | This function is called at schema initialization in the _register_*_backends 25 | method. 
26 | 27 | :param framework_name: name of the framework to register backends for. 28 | Allowable types are "pandas", "dask", "modin", "pyspark", and 29 | "geopandas". 30 | """ 31 | 32 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import 33 | from pandera._patch_numpy2 import _patch_numpy2 34 | 35 | _patch_numpy2() 36 | 37 | from pandera.api.checks import Check 38 | from pandera.api.hypotheses import Hypothesis 39 | from pandera.api.pandas.array import SeriesSchema 40 | from pandera.api.pandas.components import Column, Index, MultiIndex 41 | from pandera.api.pandas.container import DataFrameSchema 42 | from pandera.api.parsers import Parser 43 | from pandera.api.pandas.types import get_backend_types 44 | 45 | # NOTE: This registers the deprecated DataFrameSchema class. Remove this 46 | # once the deprecated class is removed. 47 | from pandera._pandas_deprecated import ( 48 | DataFrameSchema as _DataFrameSchemaDeprecated, 49 | ) 50 | 51 | assert check_cls_fqn is not None, ( 52 | "pandas backend registration requires passing in the fully qualified " 53 | "check class name" 54 | ) 55 | backend_types = get_backend_types(check_cls_fqn) 56 | 57 | from pandera.backends.pandas import builtin_checks, builtin_hypotheses 58 | 59 | for t in backend_types.check_backend_types: 60 | Check.register_backend(t, PandasCheckBackend) 61 | Hypothesis.register_backend(t, PandasHypothesisBackend) 62 | Parser.register_backend(t, PandasParserBackend) 63 | 64 | for t in backend_types.dataframe_datatypes: 65 | DataFrameSchema.register_backend(t, DataFrameSchemaBackend) 66 | _DataFrameSchemaDeprecated.register_backend(t, DataFrameSchemaBackend) 67 | Column.register_backend(t, ColumnBackend) 68 | MultiIndex.register_backend(t, MultiIndexBackend) 69 | Index.register_backend(t, IndexBackend) 70 | 71 | for t in backend_types.series_datatypes: 72 | SeriesSchema.register_backend(t, SeriesSchemaBackend) 73 | Column.register_backend(t, ColumnBackend) 74 | MultiIndex.register_backend(t, MultiIndexBackend) 75 | Index.register_backend(t, IndexBackend) 76 | 77 | for t in backend_types.index_datatypes: 78 | Index.register_backend(t, IndexBackend) 79 | 80 | for t in backend_types.multiindex_datatypes: 81 | MultiIndex.register_backend(t, MultiIndexBackend) 82 | -------------------------------------------------------------------------------- /pandera/backends/polars/__init__.py: -------------------------------------------------------------------------------- 1 | """Polars backend implementation for schemas and checks.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/polars/error_formatters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/backends/polars/error_formatters.py -------------------------------------------------------------------------------- /pandera/backends/polars/register.py: -------------------------------------------------------------------------------- 1 | """Register polars backends.""" 2 | 3 | from functools import lru_cache 4 | from typing import Optional 5 | 6 | import polars as pl 7 | 8 | 9 | @lru_cache 10 | def register_polars_backends( 11 | check_cls_fqn: Optional[str] = None, 12 | ): # pylint: disable=unused-argument 13 | """Register polars backends. 14 | 15 | This function is called at schema initialization in the _register_*_backends 16 | method. 
17 | """ 18 | 19 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import 20 | from pandera.api.checks import Check 21 | from pandera.api.polars.components import Column 22 | from pandera.api.polars.container import DataFrameSchema 23 | from pandera.backends.polars import builtin_checks 24 | from pandera.backends.polars.checks import PolarsCheckBackend 25 | from pandera.backends.polars.components import ColumnBackend 26 | from pandera.backends.polars.container import DataFrameSchemaBackend 27 | 28 | DataFrameSchema.register_backend(pl.LazyFrame, DataFrameSchemaBackend) 29 | DataFrameSchema.register_backend(pl.DataFrame, DataFrameSchemaBackend) 30 | Column.register_backend(pl.LazyFrame, ColumnBackend) 31 | Check.register_backend(pl.LazyFrame, PolarsCheckBackend) 32 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | """PySpark native backend implementation for schemas and checks.""" 2 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/checks.py: -------------------------------------------------------------------------------- 1 | """Check backend for pyspark.""" 2 | 3 | from functools import partial 4 | from typing import Dict, List, Optional, Union 5 | 6 | from pandera.api.base.checks import CheckResult 7 | from pandera.api.checks import Check 8 | from pandera.api.pyspark.types import ( 9 | PysparkDataframeColumnObject, 10 | is_bool, 11 | is_table, 12 | GroupbyObject, 13 | ) 14 | from pandera.backends.base import BaseCheckBackend 15 | from pandera.api.pyspark.types import DataFrameTypes 16 | 17 | 18 | class PySparkCheckBackend(BaseCheckBackend): 19 | """Check backend for PySpark.""" 20 | 21 | def __init__(self, check: Check): 22 | """Initializes a check backend object.""" 23 | super().__init__(check) 24 | assert check._check_fn is not None, "Check._check_fn must be set." 25 | self.check = check 26 | self.check_fn = partial(check._check_fn, **check._check_kwargs) 27 | 28 | def groupby(self, check_obj: DataFrameTypes): # pragma: no cover 29 | """Implements groupby behavior for check object.""" 30 | assert self.check.groupby is not None, "Check.groupby must be set." 
31 | if isinstance(self.check.groupby, (str, list)): 32 | return check_obj.groupby(self.check.groupby) 33 | return self.check.groupby(check_obj) 34 | 35 | def query(self, check_obj): 36 | """Implements querying behavior to produce subset of check object.""" 37 | raise NotImplementedError 38 | 39 | def aggregate(self, check_obj): 40 | """Implements aggregation behavior for check object.""" 41 | raise NotImplementedError 42 | 43 | @staticmethod 44 | def _format_groupby_input( 45 | groupby_obj: GroupbyObject, 46 | groups: Optional[List[str]], 47 | ) -> Dict[str, DataFrameTypes]: # pragma: no cover 48 | raise NotImplementedError 49 | 50 | def preprocess( 51 | self, 52 | check_obj: DataFrameTypes, 53 | key: str, # type: ignore [valid-type] 54 | ) -> DataFrameTypes: 55 | return check_obj 56 | 57 | def apply( 58 | self, 59 | check_obj: Union[DataFrameTypes, is_table], 60 | column_name: str = None, 61 | kwargs: dict = None, 62 | ): 63 | if column_name and kwargs: 64 | check_obj_and_col_name = PysparkDataframeColumnObject( 65 | check_obj, column_name 66 | ) 67 | return self.check._check_fn(check_obj_and_col_name, **kwargs) 68 | 69 | else: 70 | return self.check_fn(check_obj) # pragma: no cover 71 | 72 | def postprocess( 73 | self, 74 | check_obj: DataFrameTypes, 75 | check_output: is_bool, # type: ignore [valid-type] 76 | ) -> CheckResult: 77 | """Postprocesses the result of applying the check function.""" 78 | return CheckResult( 79 | check_output=check_output, 80 | check_passed=check_output, 81 | checked_object=check_obj, 82 | failure_cases=None, 83 | ) 84 | 85 | def __call__( 86 | self, 87 | check_obj: DataFrameTypes, 88 | key: Optional[str] = None, 89 | ) -> CheckResult: 90 | check_obj = self.preprocess(check_obj, key) 91 | 92 | check_output = self.apply( # pylint:disable=too-many-function-args 93 | check_obj, key, self.check._check_kwargs 94 | ) 95 | 96 | return self.postprocess(check_obj, check_output) 97 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/error_formatters.py: -------------------------------------------------------------------------------- 1 | """Make schema error messages human-friendly.""" 2 | 3 | 4 | def format_generic_error_message( 5 | parent_schema, 6 | check, 7 | ) -> str: 8 | """Construct an error message when a check validator fails. 9 | 10 | :param parent_schema: class of schema being validated. 11 | :param check: check that generated error. 12 | """ 13 | return f"{parent_schema} failed validation " f"{check.error}" 14 | 15 | 16 | def scalar_failure_case(x) -> dict: 17 | """Construct failure case from a scalar value. 18 | 19 | :param x: a scalar value representing failure case. 20 | :returns: Dictionary used for error reporting with ``SchemaErrors``. 
21 | """ 22 | return { 23 | "index": [None], 24 | "failure_case": [x], 25 | } 26 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/register.py: -------------------------------------------------------------------------------- 1 | """Register pyspark backends.""" 2 | 3 | from functools import lru_cache 4 | from typing import Optional 5 | from packaging import version 6 | 7 | import pyspark 8 | import pyspark.sql as ps 9 | 10 | # Handles optional Spark Connect imports for pyspark>=3.4 (if available) 11 | CURRENT_PYSPARK_VERSION = version.parse(pyspark.__version__) 12 | if CURRENT_PYSPARK_VERSION >= version.parse("3.4"): 13 | from pyspark.sql.connect import dataframe as psc 14 | 15 | 16 | @lru_cache 17 | def register_pyspark_backends( 18 | check_cls_fqn: Optional[str] = None, 19 | ): # pylint: disable=unused-argument 20 | """Register pyspark backends. 21 | 22 | This function is called at schema initialization in the _register_*_backends 23 | method. 24 | """ 25 | 26 | # pylint: disable=import-outside-toplevel,unused-import,cyclic-import 27 | from pandera._patch_numpy2 import _patch_numpy2 28 | 29 | _patch_numpy2() 30 | 31 | from pandera.api.checks import Check 32 | from pandera.api.pyspark.column_schema import ColumnSchema 33 | from pandera.api.pyspark.components import Column 34 | from pandera.api.pyspark.container import DataFrameSchema 35 | from pandera.backends.pyspark import builtin_checks 36 | from pandera.backends.pyspark.checks import PySparkCheckBackend 37 | from pandera.backends.pyspark.column import ColumnSchemaBackend 38 | from pandera.backends.pyspark.components import ColumnBackend 39 | from pandera.backends.pyspark.container import DataFrameSchemaBackend 40 | 41 | # Register classical DataFrame 42 | Check.register_backend(ps.DataFrame, PySparkCheckBackend) 43 | ColumnSchema.register_backend(ps.DataFrame, ColumnSchemaBackend) 44 | Column.register_backend(ps.DataFrame, ColumnBackend) 45 | DataFrameSchema.register_backend(ps.DataFrame, DataFrameSchemaBackend) 46 | # Register Spark Connect DataFrame, if available 47 | if CURRENT_PYSPARK_VERSION >= version.parse("3.4"): 48 | Check.register_backend(psc.DataFrame, PySparkCheckBackend) 49 | ColumnSchema.register_backend(psc.DataFrame, ColumnSchemaBackend) 50 | Column.register_backend(psc.DataFrame, ColumnBackend) 51 | DataFrameSchema.register_backend(psc.DataFrame, DataFrameSchemaBackend) 52 | -------------------------------------------------------------------------------- /pandera/backends/pyspark/utils.py: -------------------------------------------------------------------------------- 1 | """pyspark backend utilities.""" 2 | 3 | 4 | def convert_to_list(*args): 5 | """Converts arguments to a list""" 6 | converted_list = [] 7 | for arg in args: 8 | if isinstance(arg, list): 9 | converted_list.extend(arg) 10 | else: 11 | converted_list.append(arg) 12 | 13 | return converted_list 14 | -------------------------------------------------------------------------------- /pandera/backends/utils.py: -------------------------------------------------------------------------------- 1 | """Pandas backend utilities.""" 2 | 3 | from typing import Union 4 | 5 | from pandera.dtypes import UniqueSettings 6 | 7 | 8 | def convert_uniquesettings(unique: UniqueSettings) -> Union[bool, str]: 9 | """ 10 | Converts UniqueSettings object to string that can be passed onto pandas .duplicated() call 11 | """ 12 | # Default `keep` argument for pandas .duplicated() function 13 | keep_argument: Union[bool, 
str] 14 | if unique == "exclude_first": 15 | keep_argument = "first" 16 | elif unique == "exclude_last": 17 | keep_argument = "last" 18 | elif unique == "all": 19 | keep_argument = False 20 | else: 21 | raise ValueError( 22 | str(unique) + " is not a recognized report_duplicates value" 23 | ) 24 | return keep_argument 25 | -------------------------------------------------------------------------------- /pandera/constants.py: -------------------------------------------------------------------------------- 1 | """Pandera constants.""" 2 | 3 | CHECK_OUTPUT_KEY = "check_output" 4 | FAILURE_CASE_KEY = "failure_case" 5 | -------------------------------------------------------------------------------- /pandera/engines/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandera type engines.""" 2 | 3 | import pydantic 4 | from packaging import version 5 | 6 | 7 | def pydantic_version(): 8 | """Return the pydantic version.""" 9 | 10 | return version.parse(pydantic.__version__) 11 | 12 | 13 | PYDANTIC_V2 = pydantic_version().release >= (2, 0, 0) 14 | -------------------------------------------------------------------------------- /pandera/engines/type_aliases.py: -------------------------------------------------------------------------------- 1 | """Custom type aliases.""" 2 | 3 | from typing import Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | PandasObject = Union[pd.Series, pd.DataFrame] 10 | PandasExtensionType = pd.core.dtypes.base.ExtensionDtype 11 | PandasDataType = Union[pd.core.dtypes.base.ExtensionDtype, np.dtype, type] 12 | -------------------------------------------------------------------------------- /pandera/extensions.py: -------------------------------------------------------------------------------- 1 | """Extensions module, for backwards compatibility.""" 2 | 3 | # pylint: disable=unused-import 4 | from pandera.api.extensions import ( 5 | CheckType, 6 | register_builtin_check, 7 | register_builtin_hypothesis, 8 | register_check_method, 9 | register_check_statistics, 10 | ) 11 | -------------------------------------------------------------------------------- /pandera/external_config.py: -------------------------------------------------------------------------------- 1 | """Configuration for external packages.""" 2 | 3 | import os 4 | 5 | 6 | def _set_pyspark_environment_variables(): 7 | """Sets environment variables for pyspark.""" 8 | 9 | is_spark_local_ip_dirty = False 10 | is_pyarrow_ignore_timezone_dirty = False 11 | 12 | try: 13 | # try importing pyspark to see if it exists. 
This is important because the 14 | # pandera.typing module defines a Series type that inherits from 15 | # pandas.Series, and pyspark v1+ injects a __getitem__ method to pandas 16 | # Series and DataFrames to support type hinting: 17 | # https://spark.apache.org/docs/3.2.0/api/python/user_guide/pandas_on_spark/typehints.html#type-hinting-with-names 18 | # pylint: disable=unused-import 19 | if os.getenv("SPARK_LOCAL_IP") is None: 20 | is_spark_local_ip_dirty = True 21 | os.environ["SPARK_LOCAL_IP"] = "127.0.0.1" 22 | if os.getenv("PYARROW_IGNORE_TIMEZONE") is None: 23 | is_pyarrow_ignore_timezone_dirty = True 24 | # This can be overridden by the user 25 | os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" 26 | 27 | import pyspark.pandas 28 | except (ImportError, ModuleNotFoundError): 29 | pass 30 | finally: 31 | if is_spark_local_ip_dirty: 32 | os.environ.pop("SPARK_LOCAL_IP") 33 | if is_pyarrow_ignore_timezone_dirty: 34 | os.environ.pop("PYARROW_IGNORE_TIMEZONE") 35 | -------------------------------------------------------------------------------- /pandera/import_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for importing optional dependencies.""" 2 | 3 | from functools import wraps 4 | from typing import Callable, TypeVar, cast 5 | 6 | 7 | F = TypeVar("F", bound=Callable) 8 | 9 | 10 | def strategy_import_error(fn: F) -> F: 11 | """Decorator to generate input error if dependency is missing.""" 12 | 13 | @wraps(fn) 14 | def _wrapper(*args, **kwargs): 15 | 16 | try: 17 | # pylint: disable=unused-import 18 | import hypothesis 19 | except ImportError as exc: 20 | raise ImportError( 21 | 'Strategies for generating data requires "hypothesis" to be \n' 22 | "installed. You can install pandera together with the strategies \n" 23 | "dependencies with:\n" 24 | "pip install pandera[strategies]" 25 | ) from exc 26 | 27 | return fn(*args, **kwargs) 28 | 29 | return cast(F, _wrapper) 30 | -------------------------------------------------------------------------------- /pandera/inspection_utils.py: -------------------------------------------------------------------------------- 1 | """Decorators for integrating pandera into existing data pipelines.""" 2 | 3 | from inspect import ismethod 4 | from typing import Callable 5 | 6 | 7 | def _is_like_classmethod(fn: Callable) -> bool: 8 | """A regular method defined on a metaclass behaves the same way as 9 | a method decorated with @classmethod defined on a regular class. 10 | 11 | This function covers both use cases. 12 | """ 13 | is_method = ismethod(fn) 14 | return is_method and isinstance(fn.__self__, type) # type: ignore[attr-defined] 15 | 16 | 17 | def is_decorated_classmethod(fn: Callable) -> bool: 18 | """Check if fn is a classmethod declared with the @classmethod decorator. 
19 | 20 | Adapted from: 21 | https://stackoverflow.com/questions/19227724/check-if-a-function-uses-classmethod 22 | """ 23 | if not _is_like_classmethod(fn): 24 | return False 25 | bound_to = fn.__self__ # type: ignore[attr-defined] 26 | assert isinstance(bound_to, type) 27 | name = fn.__name__ 28 | for cls in bound_to.__mro__: 29 | descriptor = vars(cls).get(name) 30 | if descriptor is not None: 31 | return isinstance(descriptor, classmethod) 32 | return False 33 | 34 | 35 | def is_classmethod_from_meta(fn: Callable) -> bool: 36 | """Check if fn is a regular method defined on a metaclass 37 | (which behaves like an @classmethod method defined on a regular class).""" 38 | return not is_decorated_classmethod(fn) and _is_like_classmethod(fn) 39 | -------------------------------------------------------------------------------- /pandera/io/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage for serializing/deserializing pandera schemas to other formats.""" 2 | 3 | from pandera.io.pandas_io import ( 4 | _deserialize_check_stats, 5 | _deserialize_component_stats, 6 | _format_checks, 7 | _format_index, 8 | _format_script, 9 | _get_dtype_string_alias, 10 | _serialize_check_stats, 11 | _serialize_component_stats, 12 | _serialize_dataframe_stats, 13 | deserialize_schema, 14 | from_frictionless_schema, 15 | from_json, 16 | from_yaml, 17 | serialize_schema, 18 | to_json, 19 | to_script, 20 | to_yaml, 21 | ) 22 | -------------------------------------------------------------------------------- /pandera/polars.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | """A flexible and expressive polars validation library for Python.""" 3 | 4 | from pandera import errors 5 | from pandera.api.checks import Check 6 | from pandera.api.dataframe.model_components import ( 7 | Field, 8 | check, 9 | dataframe_check, 10 | ) 11 | from pandera.api.polars.components import Column 12 | from pandera.api.polars.container import DataFrameSchema 13 | from pandera.api.polars.model import DataFrameModel 14 | from pandera.api.polars.types import PolarsData 15 | from pandera.backends.polars.register import register_polars_backends 16 | from pandera.decorators import check_input, check_io, check_output, check_types 17 | from pandera.typing import polars as typing 18 | 19 | register_polars_backends() 20 | 21 | 22 | __all__ = [ 23 | "check_input", 24 | "check_io", 25 | "check_output", 26 | "check_types", 27 | "check", 28 | "Check", 29 | "Column", 30 | "dataframe_check", 31 | "DataFrameModel", 32 | "DataFrameSchema", 33 | "errors", 34 | "Field", 35 | "PolarsData", 36 | ] 37 | -------------------------------------------------------------------------------- /pandera/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/py.typed -------------------------------------------------------------------------------- /pandera/pyspark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import,wrong-import-position,shadowed-import,reimported 2 | """A flexible and expressive pyspark validation library.""" 3 | 4 | from pandera._patch_numpy2 import _patch_numpy2 5 | 6 | _patch_numpy2() 7 | 8 | import pandera.backends.pyspark 9 | from pandera import errors, external_config 10 | from pandera.accessors import 
pyspark_sql_accessor 11 | from pandera.api.checks import Check 12 | from pandera.api.pyspark import Column, DataFrameSchema 13 | from pandera.api.pyspark.model import DataFrameModel 14 | from pandera.api.pyspark.model_components import Field, check, dataframe_check 15 | from pandera.decorators import check_input, check_io, check_output, check_types 16 | from pandera.dtypes import ( 17 | Bool, 18 | Category, 19 | Complex, 20 | Complex64, 21 | Complex128, 22 | Complex256, 23 | DataType, 24 | Date, 25 | DateTime, 26 | Decimal, 27 | Float, 28 | Float16, 29 | Float32, 30 | Float64, 31 | Float128, 32 | Int, 33 | Int8, 34 | Int16, 35 | Int32, 36 | Int64, 37 | String, 38 | Timedelta, 39 | Timestamp, 40 | UInt, 41 | UInt8, 42 | UInt16, 43 | UInt32, 44 | UInt64, 45 | ) 46 | from pandera.errors import PysparkSchemaError, SchemaInitError 47 | from pandera.schema_inference.pandas import infer_schema 48 | from pandera.typing import pyspark_sql 49 | from pandera._version import __version__ 50 | from pandera.typing import pyspark_sql as typing 51 | 52 | 53 | external_config._set_pyspark_environment_variables() 54 | 55 | __all__ = [ 56 | # dtypes 57 | "Bool", 58 | "Category", 59 | "Complex", 60 | "Complex64", 61 | "Complex128", 62 | "Complex256", 63 | "DataType", 64 | "DateTime", 65 | "Float", 66 | "Float16", 67 | "Float32", 68 | "Float64", 69 | "Float128", 70 | "Int", 71 | "Int8", 72 | "Int16", 73 | "Int32", 74 | "Int64", 75 | "String", 76 | "Timedelta", 77 | "Timestamp", 78 | "UInt", 79 | "UInt8", 80 | "UInt16", 81 | "UInt32", 82 | "UInt64", 83 | # checks 84 | "Check", 85 | # decorators 86 | "check_input", 87 | "check_io", 88 | "check_output", 89 | "check_types", 90 | # model 91 | "DataFrameModel", 92 | # model_components 93 | "Field", 94 | "check", 95 | "dataframe_check", 96 | # schema_components 97 | "Column", 98 | # schema_inference 99 | "infer_schema", 100 | # schemas 101 | "DataFrameSchema", 102 | # version 103 | "__version__", 104 | ] 105 | -------------------------------------------------------------------------------- /pandera/schema_inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/pandera/schema_inference/__init__.py -------------------------------------------------------------------------------- /pandera/schema_inference/pandas.py: -------------------------------------------------------------------------------- 1 | """Module for inferring dataframe/series schema.""" 2 | 3 | from typing import overload 4 | 5 | import pandas as pd 6 | 7 | from pandera.api.pandas.array import SeriesSchema 8 | from pandera.api.pandas.components import Column, Index, MultiIndex 9 | from pandera.api.pandas.container import DataFrameSchema 10 | from pandera.schema_statistics.pandas import ( 11 | infer_dataframe_statistics, 12 | infer_series_statistics, 13 | parse_check_statistics, 14 | ) 15 | 16 | 17 | @overload 18 | def infer_schema( 19 | pandas_obj: pd.Series, 20 | ) -> SeriesSchema: # pragma: no cover 21 | ... 22 | 23 | 24 | @overload 25 | def infer_schema( # type: ignore[misc] 26 | pandas_obj: pd.DataFrame, 27 | ) -> DataFrameSchema: # pragma: no cover 28 | ... 29 | 30 | 31 | def infer_schema(pandas_obj): 32 | """Infer schema for pandas DataFrame or Series object. 33 | 34 | :param pandas_obj: DataFrame or Series object to infer. 35 | :returns: DataFrameSchema or SeriesSchema 36 | :raises: TypeError if pandas_obj is not expected type. 
37 | """ 38 | if isinstance(pandas_obj, pd.DataFrame): 39 | return infer_dataframe_schema(pandas_obj) 40 | elif isinstance(pandas_obj, pd.Series): 41 | return infer_series_schema(pandas_obj) 42 | else: 43 | raise TypeError( 44 | "pandas_obj type not recognized. Expected a pandas DataFrame or " 45 | f"Series, found {type(pandas_obj)}" 46 | ) 47 | 48 | 49 | def _create_index(index_statistics): 50 | index = [ 51 | Index( 52 | properties["dtype"], 53 | checks=parse_check_statistics(properties["checks"]), 54 | nullable=properties["nullable"], 55 | name=properties["name"], 56 | ) 57 | for properties in index_statistics 58 | ] 59 | if len(index) == 1: 60 | index = index[0] # type: ignore 61 | else: 62 | index = MultiIndex(index) # type: ignore 63 | 64 | return index 65 | 66 | 67 | def infer_dataframe_schema(df: pd.DataFrame) -> DataFrameSchema: 68 | """Infer a DataFrameSchema from a pandas DataFrame. 69 | 70 | :param df: DataFrame object to infer. 71 | :returns: DataFrameSchema 72 | """ 73 | df_statistics = infer_dataframe_statistics(df) 74 | schema = DataFrameSchema( 75 | columns={ 76 | colname: Column( 77 | properties["dtype"], 78 | checks=parse_check_statistics(properties["checks"]), 79 | nullable=properties["nullable"], 80 | ) 81 | for colname, properties in df_statistics["columns"].items() 82 | }, 83 | index=_create_index(df_statistics["index"]), 84 | coerce=True, 85 | ) 86 | return schema 87 | 88 | 89 | def infer_series_schema(series) -> SeriesSchema: 90 | """Infer a SeriesSchema from a pandas Series. 91 | 92 | :param series: Series object to infer. 93 | :returns: SeriesSchema 94 | """ 95 | series_statistics = infer_series_statistics(series) 96 | schema = SeriesSchema( 97 | dtype=series_statistics["dtype"], 98 | checks=parse_check_statistics(series_statistics["checks"]), 99 | nullable=series_statistics["nullable"], 100 | name=series_statistics["name"], 101 | coerce=True, 102 | ) 103 | return schema 104 | -------------------------------------------------------------------------------- /pandera/schema_statistics/__init__.py: -------------------------------------------------------------------------------- 1 | """Module to extract schema statistics from schema objects.""" 2 | 3 | from pandera.schema_statistics.pandas import ( 4 | get_dataframe_schema_statistics, 5 | get_index_schema_statistics, 6 | get_series_schema_statistics, 7 | infer_dataframe_statistics, 8 | infer_index_statistics, 9 | infer_series_statistics, 10 | parse_check_statistics, 11 | parse_checks, 12 | ) 13 | -------------------------------------------------------------------------------- /pandera/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | """Data synthesis strategies for pandera, powered by the hypothesis package.""" 3 | 4 | import warnings 5 | 6 | try: 7 | import pandas 8 | from pandera.strategies.pandas_strategies import * 9 | except ImportError: 10 | pass 11 | -------------------------------------------------------------------------------- /pandera/strategies/base_strategies.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-import 2 | """Base module for `hypothesis`-based strategies for data synthesis.""" 3 | 4 | from functools import wraps 5 | from typing import Callable, Dict, Generic, Tuple, Type, TypeVar, cast 6 | 7 | import pandera.backends.base.builtin_checks 8 | 9 | 10 | F = TypeVar("F", bound=Callable) 11 | 12 | 13 | try: 14 | # pylint:
disable=unused-import 15 | from hypothesis.strategies import SearchStrategy, composite 16 | except ImportError: # pragma: no cover 17 | T = TypeVar("T") 18 | 19 | # pylint: disable=too-few-public-methods 20 | class SearchStrategy(Generic[T]): # type: ignore 21 | """placeholder type.""" 22 | 23 | def composite(fn): # type: ignore 24 | """placeholder composite strategy.""" 25 | return fn 26 | 27 | HAS_HYPOTHESIS = False 28 | else: 29 | HAS_HYPOTHESIS = True 30 | 31 | 32 | # This strategy registry maps (check_name, data_type) -> strategy_function 33 | # For example: ("greater_than", pd.DataFrame) -> () 34 | STRATEGY_DISPATCHER: Dict[Tuple[str, Type], Callable] = {} 35 | 36 | 37 | def strategy_import_error(fn: F) -> F: 38 | """Decorator to generate input error if dependency is missing.""" 39 | 40 | @wraps(fn) 41 | def _wrapper(*args, **kwargs): 42 | if not HAS_HYPOTHESIS: # pragma: no cover 43 | raise ImportError( 44 | 'Strategies for generating data requires "hypothesis" to be \n' 45 | "installed. You can install pandera together with the strategies \n" 46 | "dependencies with:\n" 47 | "pip install pandera[strategies]" 48 | ) 49 | return fn(*args, **kwargs) 50 | 51 | return cast(F, _wrapper) 52 | -------------------------------------------------------------------------------- /pandera/system.py: -------------------------------------------------------------------------------- 1 | """Global variables relating to OS.""" 2 | 3 | import numpy as np 4 | 5 | # Windows and Mac M1 don't support floats of this precision: 6 | # https://github.com/pandera-dev/pandera/issues/623 7 | FLOAT_128_AVAILABLE = hasattr(np, "float128") 8 | -------------------------------------------------------------------------------- /pandera/typing/__init__.py: -------------------------------------------------------------------------------- 1 | """Typing module. 2 | 3 | For backwards compatibility, pandas types are exposed to the top-level scope of 4 | the typing module. 
5 | """ 6 | 7 | from functools import lru_cache 8 | from typing import Set, Type 9 | from pandera.typing.common import AnnotationInfo 10 | 11 | try: 12 | from pandera.typing.pandas import ( 13 | DataFrame, 14 | Index, 15 | Series, 16 | Bool, 17 | Category, 18 | Date, 19 | DateTime, 20 | Decimal, 21 | Float, 22 | Float16, 23 | Float32, 24 | Float64, 25 | Int, 26 | Int8, 27 | Int16, 28 | Int32, 29 | Int64, 30 | Object, 31 | String, 32 | Timedelta, 33 | UInt8, 34 | UInt16, 35 | UInt32, 36 | UInt64, 37 | INT8, 38 | INT16, 39 | INT32, 40 | INT64, 41 | UINT8, 42 | UINT16, 43 | UINT32, 44 | UINT64, 45 | STRING, 46 | ) 47 | except ImportError: 48 | pass 49 | 50 | 51 | @lru_cache 52 | def get_dataframe_types(): 53 | from pandera.typing import ( 54 | dask, 55 | geopandas, 56 | modin, 57 | pyspark, 58 | pyspark_sql, 59 | ) 60 | 61 | dataframe_types: Set[Type] = {DataFrame} 62 | if dask.DASK_INSTALLED: 63 | dataframe_types.update({dask.DataFrame}) 64 | 65 | if modin.MODIN_INSTALLED: 66 | dataframe_types.update({modin.DataFrame}) 67 | 68 | if pyspark.PYSPARK_INSTALLED: 69 | dataframe_types.update({pyspark.DataFrame}) 70 | 71 | if pyspark_sql.PYSPARK_SQL_INSTALLED: 72 | dataframe_types.update({pyspark_sql.DataFrame}) 73 | 74 | if geopandas.GEOPANDAS_INSTALLED: 75 | dataframe_types.update({geopandas.GeoDataFrame}) 76 | 77 | return dataframe_types 78 | 79 | 80 | @lru_cache 81 | def get_series_types(): 82 | from pandera.typing import ( 83 | dask, 84 | geopandas, 85 | modin, 86 | pyspark, 87 | ) 88 | 89 | series_types: Set[Type] = {Series} 90 | if dask.DASK_INSTALLED: 91 | series_types.update({dask.Series}) 92 | 93 | if modin.MODIN_INSTALLED: 94 | series_types.update({modin.Series}) 95 | 96 | if pyspark.PYSPARK_INSTALLED: 97 | series_types.update({pyspark.Series}) 98 | 99 | if geopandas.GEOPANDAS_INSTALLED: 100 | series_types.update({geopandas.GeoSeries}) 101 | 102 | return series_types 103 | 104 | 105 | @lru_cache 106 | def get_index_types(): 107 | from pandera.typing import dask, modin, pyspark 108 | 109 | index_types: Set[Type] = {Index} 110 | if dask.DASK_INSTALLED: 111 | index_types.update({dask.Index}) 112 | 113 | if modin.MODIN_INSTALLED: 114 | index_types.update({modin.Index}) 115 | 116 | if pyspark.PYSPARK_INSTALLED: 117 | index_types.update({pyspark.Index}) # type: ignore [arg-type] 118 | 119 | return index_types 120 | 121 | 122 | __all__ = [ 123 | "AnnotationInfo", 124 | "DataFrame", 125 | "Series", 126 | "Index", 127 | "get_dataframe_types", 128 | "get_index_types", 129 | "get_series_types", 130 | ] 131 | -------------------------------------------------------------------------------- /pandera/typing/dask.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Dask.""" 2 | 3 | from typing import TYPE_CHECKING, Generic, TypeVar 4 | 5 | from pandera.typing.common import DataFrameBase, IndexBase, SeriesBase 6 | from pandera.typing.pandas import DataFrameModel, GenericDtype 7 | 8 | try: 9 | import dask.dataframe as dd 10 | 11 | DASK_INSTALLED = True 12 | except ImportError: 13 | DASK_INSTALLED = False 14 | 15 | 16 | # pylint:disable=invalid-name 17 | if TYPE_CHECKING: 18 | T = TypeVar("T") # pragma: no cover 19 | else: 20 | T = DataFrameModel 21 | 22 | 23 | if DASK_INSTALLED: 24 | # pylint: disable=too-few-public-methods,abstract-method 25 | class DataFrame(DataFrameBase, dd.DataFrame, Generic[T]): 26 | """ 27 | Representation of dask.dataframe.DataFrame, only used for type 28 | annotation. 
29 | 30 | *new in 0.8.0* 31 | """ 32 | 33 | # pylint:disable=too-few-public-methods 34 | class Series(SeriesBase, dd.Series, Generic[GenericDtype]): # type: ignore 35 | """Representation of pandas.Series, only used for type annotation. 36 | 37 | *new in 0.8.0* 38 | """ 39 | 40 | # pylint:disable=too-few-public-methods 41 | class Index(IndexBase, dd.Index, Generic[GenericDtype]): 42 | """Representation of pandas.Index, only used for type annotation. 43 | 44 | *new in 0.8.0* 45 | """ 46 | -------------------------------------------------------------------------------- /pandera/typing/formats.py: -------------------------------------------------------------------------------- 1 | """Serialization formats for dataframes.""" 2 | 3 | from enum import Enum 4 | from typing import Union 5 | 6 | try: 7 | # python 3.8+ 8 | from typing import Literal # type: ignore[attr-defined] 9 | except ImportError: # pragma: no cover 10 | from typing_extensions import Literal # type: ignore[assignment] 11 | 12 | 13 | class Formats(Enum): 14 | """Data container serialization formats. 15 | 16 | The values of this enum specify the valid values taken by the ``to_format`` 17 | and ``from_format`` attributes in 18 | :py:class:`~pandera.typing.config.BaseConfig` when specifying a 19 | :py:class:`~pandera.api.pandas.model.DataFrameModel`. 20 | """ 21 | 22 | # pylint: disable=invalid-name 23 | 24 | #: comma-separated values file 25 | csv = "csv" 26 | 27 | #: python dictionary 28 | dict = "dict" 29 | 30 | #: json file 31 | json = "json" 32 | 33 | #: feather file format. See 34 | #: `here `__ for more 35 | #: details 36 | feather = "feather" 37 | 38 | #: parquet file format. See `here `__ for more 39 | #: details 40 | parquet = "parquet" 41 | 42 | #: python pickle file format 43 | pickle = "pickle" 44 | 45 | #: python json_normalize 46 | json_normalize = "json_normalize" 47 | 48 | 49 | Format = Union[ 50 | Literal[Formats.csv], 51 | Literal[Formats.dict], 52 | Literal[Formats.json], 53 | Literal[Formats.feather], 54 | Literal[Formats.parquet], 55 | Literal[Formats.pickle], 56 | Literal[Formats.json_normalize], 57 | ] 58 | -------------------------------------------------------------------------------- /pandera/typing/modin.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Modin.""" 2 | 3 | from typing import TYPE_CHECKING, Generic, TypeVar 4 | 5 | from packaging import version 6 | 7 | from pandera.typing.common import DataFrameBase, IndexBase, SeriesBase 8 | from pandera.typing.pandas import DataFrameModel, GenericDtype 9 | 10 | try: 11 | import modin 12 | import modin.pandas as mpd 13 | 14 | MODIN_INSTALLED = True 15 | except ImportError: 16 | MODIN_INSTALLED = False 17 | 18 | 19 | def modin_version(): 20 | """Return the modin version.""" 21 | return version.parse(modin.__version__) 22 | 23 | 24 | # pylint:disable=invalid-name 25 | if TYPE_CHECKING: 26 | T = TypeVar("T") # pragma: no cover 27 | else: 28 | T = DataFrameModel 29 | 30 | 31 | if MODIN_INSTALLED: 32 | # pylint: disable=too-few-public-methods 33 | class DataFrame(DataFrameBase, mpd.DataFrame, Generic[T]): 34 | """ 35 | Representation of dask.dataframe.DataFrame, only used for type 36 | annotation. 37 | 38 | *new in 0.8.0* 39 | """ 40 | 41 | # pylint:disable=too-few-public-methods,abstract-method 42 | class Series(SeriesBase, mpd.Series, Generic[GenericDtype]): 43 | """Representation of pandas.Series, only used for type annotation. 
44 | 45 | *new in 0.8.0* 46 | """ 47 | 48 | # pylint:disable=too-few-public-methods,abstract-method 49 | class Index(IndexBase, mpd.Index, Generic[GenericDtype]): 50 | """Representation of pandas.Index, only used for type annotation. 51 | 52 | *new in 0.8.0* 53 | """ 54 | -------------------------------------------------------------------------------- /pandera/typing/pyspark.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Pyspark Pandas.""" 2 | 3 | from typing import TYPE_CHECKING, Generic, TypeVar 4 | 5 | from pandera.typing.common import ( 6 | DataFrameBase, 7 | GenericDtype, 8 | IndexBase, 9 | SeriesBase, 10 | ) 11 | from pandera.typing.pandas import DataFrameModel, _GenericAlias 12 | 13 | try: 14 | import pyspark.pandas as ps 15 | 16 | PYSPARK_INSTALLED = True 17 | except ImportError: # pragma: no cover 18 | PYSPARK_INSTALLED = False 19 | 20 | 21 | # pylint:disable=invalid-name 22 | if TYPE_CHECKING: 23 | T = TypeVar("T") # pragma: no cover 24 | else: 25 | T = DataFrameModel 26 | 27 | 28 | if PYSPARK_INSTALLED: 29 | # pylint: disable=too-few-public-methods,arguments-renamed 30 | class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]): 31 | """ 32 | Representation of dask.dataframe.DataFrame, only used for type 33 | annotation. 34 | 35 | *new in 0.8.0* 36 | """ 37 | 38 | def __class_getitem__(cls, item): 39 | """Define this to override's pyspark.pandas generic type.""" 40 | return _GenericAlias(cls, item) 41 | 42 | # pylint:disable=too-few-public-methods,arguments-renamed 43 | class Series(SeriesBase, ps.Series, Generic[GenericDtype]): # type: ignore [misc] # noqa 44 | """Representation of pandas.Series, only used for type annotation. 45 | 46 | *new in 0.8.0* 47 | """ 48 | 49 | def __class_getitem__(cls, item): 50 | """Define this to override pyspark.pandas generic type""" 51 | return _GenericAlias(cls, item) 52 | 53 | # pylint:disable=too-few-public-methods 54 | class Index(IndexBase, ps.Index, Generic[GenericDtype]): 55 | """Representation of pandas.Index, only used for type annotation. 
56 | 57 | *new in 0.8.0* 58 | """ 59 | -------------------------------------------------------------------------------- /pandera/typing/pyspark_sql.py: -------------------------------------------------------------------------------- 1 | """Pandera type annotations for Pyspark.""" 2 | 3 | from typing import TypeVar, Union 4 | 5 | from pandera.typing.common import DataFrameBase 6 | from pandera.typing.pandas import DataFrameModel, _GenericAlias 7 | 8 | try: 9 | import pyspark.sql as ps 10 | 11 | PYSPARK_SQL_INSTALLED = True 12 | except ImportError: # pragma: no cover 13 | PYSPARK_SQL_INSTALLED = False 14 | 15 | if PYSPARK_SQL_INSTALLED: 16 | from pandera.engines import pyspark_engine 17 | 18 | PysparkString = pyspark_engine.String 19 | PysparkInt = pyspark_engine.Int 20 | PysparkLongInt = pyspark_engine.BigInt 21 | PysparkShortInt = pyspark_engine.ShortInt 22 | PysparkByteInt = pyspark_engine.ByteInt 23 | PysparkDouble = pyspark_engine.Double 24 | PysparkFloat = pyspark_engine.Float 25 | PysparkDecimal = pyspark_engine.Decimal 26 | PysparkDate = pyspark_engine.Date 27 | PysparkTimestamp = pyspark_engine.Timestamp 28 | PysparkBinary = pyspark_engine.Binary 29 | 30 | PysparkDType = TypeVar( # type: ignore 31 | "PysparkDType", 32 | bound=Union[ 33 | PysparkString, # type: ignore 34 | PysparkInt, # type: ignore 35 | PysparkLongInt, # type: ignore 36 | PysparkShortInt, # type: ignore 37 | PysparkByteInt, # type: ignore 38 | PysparkDouble, # type: ignore 39 | PysparkFloat, # type: ignore 40 | PysparkDecimal, # type: ignore 41 | PysparkDate, # type: ignore 42 | PysparkTimestamp, # type: ignore 43 | PysparkBinary, # type: ignore 44 | ], 45 | ) 46 | from typing import TYPE_CHECKING, Generic 47 | 48 | # pylint:disable=invalid-name 49 | if TYPE_CHECKING: 50 | T = TypeVar("T") # pragma: no cover 51 | else: 52 | T = DataFrameModel 53 | 54 | if PYSPARK_SQL_INSTALLED: 55 | # pylint: disable=too-few-public-methods,arguments-renamed 56 | class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]): 57 | """ 58 | Representation of dask.dataframe.DataFrame, only used for type 59 | annotation. 60 | 61 | *new in 0.8.0* 62 | """ 63 | 64 | def __class_getitem__(cls, item): 65 | """Define this to override's pyspark.pandas generic type.""" 66 | return _GenericAlias(cls, item) # pragma: no cover 67 | -------------------------------------------------------------------------------- /pandera/utils.py: -------------------------------------------------------------------------------- 1 | """General utility functions""" 2 | 3 | from typing import Any, Callable, TypeVar 4 | 5 | F = TypeVar("F", bound=Callable) 6 | 7 | 8 | def docstring_substitution(*args: Any, **kwargs: Any) -> Callable[[F], F]: 9 | """Typed wrapper around pandas.util.Substitution.""" 10 | 11 | def decorator(func: F) -> F: 12 | # handle case when pandera is run in optimized mode: 13 | # https://docs.python.org/3/using/cmdline.html#cmdoption-OO 14 | if func.__doc__ is None: 15 | return func 16 | 17 | if args: 18 | _doc = func.__doc__ % tuple(args) # type: ignore[operator] 19 | elif kwargs: 20 | _doc = func.__doc__ % kwargs # type: ignore[operator] 21 | func.__doc__ = _doc # pylint:disable=possibly-used-before-assignment 22 | return func 23 | 24 | return decorator 25 | 26 | 27 | def is_regex(name: str): 28 | """ 29 | Checks whether a string is a regex pattern, as defined as starting with 30 | '^' and ending with '$'. 
31 | """ 32 | return name.startswith("^") and name.endswith("$") 33 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file is auto-generated from environment.yml, do not modify. 2 | # See that file for comments about the need/usage of each dependency. 3 | 4 | pip 5 | packaging >= 20.0 6 | typing_extensions 7 | hypothesis >= 6.92.7 8 | pyyaml >= 5.1 9 | typing_inspect >= 0.6.0 10 | frictionless <= 4.40.8 11 | pyarrow 12 | pydantic 13 | scipy 14 | pandas-stubs 15 | pyspark[connect] >= 3.2.0, < 4.0.0 16 | polars >= 0.20.0 17 | modin 18 | protobuf 19 | geopandas 20 | shapely 21 | fastapi 22 | black >= 24.0 23 | numpy >= 1.24.4 24 | pandas >= 2.1.1 25 | isort >= 5.7.0 26 | joblib 27 | mypy == 1.10.0 28 | pylint < 3.3 29 | pytest 30 | pytest-cov 31 | pytest-xdist 32 | pytest-asyncio 33 | pytz 34 | xdoctest 35 | nox 36 | uv 37 | setuptools 38 | uvicorn 39 | python-multipart 40 | sphinx 41 | sphinx-design 42 | sphinx-autodoc-typehints <= 1.14.1 43 | sphinx-copybutton 44 | recommonmark 45 | myst-nb 46 | twine 47 | asv >= 0.5.1 48 | pre_commit 49 | dask[dataframe] 50 | distributed 51 | furo 52 | sphinx-docsearch 53 | grpcio 54 | ray 55 | typeguard 56 | types-click 57 | types-pytz 58 | types-pyyaml 59 | types-requests 60 | types-setuptools 61 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | float_to_top = true 3 | profile = black 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/__init__.py -------------------------------------------------------------------------------- /tests/base/test_base_schema.py: -------------------------------------------------------------------------------- 1 | """Base schema unit tests.""" 2 | 3 | import pytest 4 | 5 | from pandera.api.base.schema import BaseSchema 6 | from pandera.backends.base import BaseSchemaBackend 7 | 8 | 9 | class MockSchema(BaseSchema): 10 | """Mock schema""" 11 | 12 | 13 | class MockSchemaBackend(BaseSchemaBackend): 14 | """Mock schema backend""" 15 | 16 | 17 | def test_get_backend_error(): 18 | """Raise value error when no arguments are passed.""" 19 | 20 | schema = MockSchema() 21 | with pytest.raises(ValueError): 22 | schema.get_backend() 23 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Pytest configuration.""" 2 | 3 | import os 4 | 5 | try: 6 | # pylint: disable=unused-import 7 | import hypothesis # noqa F401 8 | from hypothesis import settings 9 | except ImportError: 10 | HAS_HYPOTHESIS = False 11 | else: 12 | HAS_HYPOTHESIS = True 13 | 14 | # ignore test files associated with hypothesis strategies 15 | collect_ignore = [] 16 | 17 | if not HAS_HYPOTHESIS: 18 | collect_ignore.append("test_strategies.py") 19 | else: 20 | suppressed_health_checks = [ 21 | 
hypothesis.HealthCheck.data_too_large, 22 | hypothesis.HealthCheck.too_slow, 23 | hypothesis.HealthCheck.filter_too_much, 24 | ] 25 | 26 | settings.register_profile( 27 | "ci", 28 | max_examples=10, 29 | deadline=None, 30 | suppress_health_check=suppressed_health_checks, 31 | ) 32 | settings.register_profile( 33 | "dev", 34 | max_examples=30, 35 | deadline=None, 36 | suppress_health_check=suppressed_health_checks, 37 | ) 38 | settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "dev")) 39 | -------------------------------------------------------------------------------- /tests/dask/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/dask/__init__.py -------------------------------------------------------------------------------- /tests/dask/test_dask_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests for dask_accessor module.""" 2 | 3 | from typing import Union 4 | 5 | import dask.dataframe as dd 6 | import pandas as pd 7 | import pytest 8 | 9 | import pandera.pandas as pa 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "schema1, schema2, data, invalid_data", 14 | [ 15 | [ 16 | pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True), 17 | pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True), 18 | dd.from_pandas(pd.DataFrame({"col": [1, 2, 3]}), npartitions=1), 19 | dd.from_pandas(pd.Series([1, 2, 3]), npartitions=1), 20 | ], 21 | [ 22 | pa.SeriesSchema(int, coerce=True), 23 | pa.SeriesSchema(float, coerce=True), 24 | dd.from_pandas(pd.Series([1, 2, 3]), npartitions=1), 25 | dd.from_pandas(pd.DataFrame({"col": [1, 2, 3]}), npartitions=1), 26 | ], 27 | ], 28 | ) 29 | @pytest.mark.parametrize("inplace", [False, True]) 30 | def test_dataframe_series_add_schema( 31 | schema1: Union[pa.DataFrameSchema, pa.SeriesSchema], 32 | schema2: Union[pa.DataFrameSchema, pa.SeriesSchema], 33 | data: Union[pd.DataFrame, pd.Series], 34 | invalid_data: Union[pd.DataFrame, pd.Series], 35 | inplace: bool, 36 | ) -> None: 37 | """ 38 | Test that pandas object contains schema metadata after pandera validation. 39 | """ 40 | validated_data_1 = schema1(data, inplace=inplace) # type: ignore[arg-type] 41 | if inplace: 42 | assert data.pandera.schema == schema1 43 | else: 44 | assert data.pandera.schema is None 45 | assert validated_data_1.pandera.schema == schema1 46 | 47 | validated_data_2 = schema2(validated_data_1, inplace=inplace) # type: ignore[arg-type] 48 | if inplace: 49 | assert validated_data_1.pandera.schema == schema2 50 | else: 51 | assert validated_data_1.pandera.schema == schema1 52 | assert validated_data_2.pandera.schema == schema2 53 | 54 | with pytest.raises(TypeError): 55 | schema1(invalid_data) # type: ignore[arg-type] 56 | 57 | with pytest.raises(TypeError): 58 | schema2(invalid_data) # type: ignore[arg-type] 59 | -------------------------------------------------------------------------------- /tests/dask/test_dask_not_installed.py: -------------------------------------------------------------------------------- 1 | """Tests behavior when dask is not installed.""" 2 | 3 | import sys 4 | from unittest import mock 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | 10 | def test_dask_not_installed() -> None: 11 | """ 12 | Test that Pandera and its modules can be imported and continue to work 13 | without dask. 
14 | """ 15 | with mock.patch.dict("sys.modules", {"dask": None}): 16 | with pytest.raises(ImportError): 17 | # pylint: disable=import-outside-toplevel,unused-import 18 | import dask.dataframe 19 | 20 | for module in ["pandera", "pandera.accessors.dask_accessor"]: 21 | try: 22 | del sys.modules[module] 23 | except KeyError: 24 | ... 25 | 26 | # pylint: disable=import-outside-toplevel,unused-import 27 | import pandera 28 | 29 | assert "pandera.accessors.dask_accessor" not in sys.modules 30 | 31 | del sys.modules["pandera"] 32 | del sys.modules["pandera.api.pandas.types"] 33 | # pylint: disable=import-outside-toplevel 34 | from pandera.api.pandas.types import is_table 35 | 36 | assert not is_table(pd.Series([1])) 37 | 38 | for module in ["pandera", "pandera.typing"]: 39 | try: 40 | del sys.modules[module] 41 | except KeyError: 42 | ... 43 | 44 | # pylint: disable=import-outside-toplevel 45 | import pandera.typing 46 | 47 | annotation = pandera.typing.DataFrame[int] 48 | assert pandera.typing.AnnotationInfo(annotation).is_generic_df 49 | -------------------------------------------------------------------------------- /tests/fastapi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/fastapi/__init__.py -------------------------------------------------------------------------------- /tests/fastapi/app.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | from fastapi import Body, FastAPI, File 3 | from fastapi.responses import HTMLResponse 4 | 5 | from pandera.typing import DataFrame 6 | from pandera.typing.fastapi import UploadFile 7 | from tests.fastapi.models import ( 8 | Item, 9 | ResponseModel, 10 | Transactions, 11 | TransactionsDictOut, 12 | TransactionsParquet, 13 | ) 14 | 15 | try: 16 | from typing import Annotated # type: ignore[attr-defined] 17 | except ImportError: 18 | from typing_extensions import Annotated # type: ignore[assignment] 19 | 20 | app = FastAPI() 21 | 22 | 23 | @app.post("/items/", response_model=Item) 24 | def create_item(item: Item): 25 | return item 26 | 27 | 28 | @app.post("/transactions/", response_model=DataFrame[TransactionsDictOut]) 29 | def create_transactions( 30 | transactions: Annotated[DataFrame[Transactions], Body()], 31 | ): 32 | output = transactions.assign(name="foo") 33 | ... # do other stuff, e.g. update backend database with transactions 34 | return output 35 | 36 | 37 | @app.post("/file/", response_model=ResponseModel) 38 | def create_upload_file( 39 | file: Annotated[UploadFile[DataFrame[TransactionsParquet]], File()], 40 | ): 41 | return { 42 | "filename": file.filename, 43 | "df": file.data.assign(name="foo"), 44 | } 45 | 46 | 47 | @app.get("/") 48 | def main(): 49 | content = """ 50 | 51 |
52 | 53 | 54 |
55 | 56 | """ 57 | return HTMLResponse(content=content) 58 | -------------------------------------------------------------------------------- /tests/fastapi/models.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | from typing import Optional 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | import pandera.pandas as pa 7 | 8 | 9 | class Transactions(pa.DataFrameModel): 10 | id: pa.typing.Series[int] 11 | cost: pa.typing.Series[float] = pa.Field(ge=0, le=1000) 12 | 13 | class Config: 14 | coerce = True 15 | 16 | 17 | class TransactionsParquet(Transactions): 18 | class Config: 19 | from_format = "parquet" 20 | 21 | 22 | class TransactionsOut(Transactions): 23 | id: pa.typing.Series[int] 24 | cost: pa.typing.Series[float] 25 | name: pa.typing.Series[str] 26 | 27 | 28 | class TransactionsJsonOut(TransactionsOut): 29 | class Config: 30 | to_format = "json" 31 | to_format_kwargs = {"orient": "records"} 32 | 33 | 34 | class TransactionsDictOut(TransactionsOut): 35 | class Config: 36 | to_format = "dict" 37 | to_format_kwargs = {"orient": "records"} 38 | 39 | 40 | class Item(BaseModel): 41 | name: str 42 | value: int = Field(ge=0) 43 | description: Optional[str] = None 44 | 45 | 46 | class ResponseModel(BaseModel): 47 | filename: str 48 | df: pa.typing.DataFrame[TransactionsJsonOut] 49 | -------------------------------------------------------------------------------- /tests/fastapi/test_app.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=redefined-outer-name,unused-argument 2 | """Unit tests for using pandera types in fastapi endpoints.""" 3 | 4 | import io 5 | import subprocess 6 | import time 7 | from copy import deepcopy 8 | 9 | import pandas as pd 10 | import pytest 11 | import requests 12 | from hypothesis import given 13 | 14 | from tests.fastapi.models import Transactions, TransactionsOut 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def app(): 19 | """Transient app server for testing.""" 20 | # pylint: disable=consider-using-with 21 | process = subprocess.Popen( 22 | ["uvicorn", "tests.fastapi.app:app", "--port", "8000"], 23 | stdout=subprocess.PIPE, 24 | ) 25 | _wait_to_exist() 26 | yield process 27 | process.terminate() 28 | 29 | 30 | def _wait_to_exist(): 31 | for _ in range(20): 32 | try: 33 | requests.post("http://127.0.0.1:8000/") 34 | break 35 | except Exception: # pylint: disable=broad-except 36 | time.sleep(3.0) 37 | 38 | 39 | def test_items_endpoint(app): 40 | """Happy path test with pydantic type annotations.""" 41 | data = {"name": "Book", "value": 10, "description": "Hello"} 42 | for _ in range(10): 43 | response = requests.post("http://127.0.0.1:8000/items/", json=data) 44 | if response.status_code != 200: 45 | time.sleep(3.0) 46 | assert response.json() == data 47 | 48 | 49 | def test_transactions_endpoint(app): 50 | """Happy path test with pandera type endpoint type annotation.""" 51 | data = {"id": [1], "cost": [10.99]} 52 | response = requests.post( 53 | "http://127.0.0.1:8000/transactions/", 54 | json=data, 55 | ) 56 | expected_output = deepcopy(data) 57 | expected_output = [{"id": 1, "cost": 10.99, "name": "foo"}] 58 | assert response.json() == expected_output 59 | 60 | 61 | @given(Transactions.strategy(size=10)) 62 | def test_upload_file_endpoint(app, sample): 63 | """ 64 | Test upload file endpoint with Upload[DataFrame[DataFrameModel]] input. 
65 | """ 66 | buf = io.BytesIO() 67 | sample.to_parquet(buf) 68 | buf.seek(0) 69 | 70 | expected_result = pd.read_parquet(buf).assign(name="foo") 71 | buf.seek(0) 72 | 73 | response = requests.post( 74 | "http://127.0.0.1:8000/file/", files={"file": buf} 75 | ) 76 | output = response.json() 77 | assert output["filename"] == "file" 78 | output_df = pd.read_json(output["df"]) 79 | cost_notna = ~output_df["cost"].isna() 80 | pd.testing.assert_frame_equal( 81 | TransactionsOut.validate(output_df[cost_notna]), 82 | TransactionsOut.validate(expected_result[cost_notna]), 83 | ) 84 | -------------------------------------------------------------------------------- /tests/hypotheses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/hypotheses/__init__.py -------------------------------------------------------------------------------- /tests/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/io/__init__.py -------------------------------------------------------------------------------- /tests/modin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/modin/__init__.py -------------------------------------------------------------------------------- /tests/modin/conftest.py: -------------------------------------------------------------------------------- 1 | """Registers fixtures for core""" 2 | 3 | import os 4 | from typing import Generator 5 | 6 | import pytest 7 | from pandera.api.checks import Check 8 | 9 | # pylint: disable=unused-import 10 | ENGINES = os.getenv("CI_MODIN_ENGINES", "").split(",") 11 | if ENGINES == [""]: 12 | ENGINES = ["dask"] 13 | 14 | 15 | @pytest.fixture(scope="function") 16 | def custom_check_teardown() -> Generator[None, None, None]: 17 | """Remove all custom checks after execution of each pytest function.""" 18 | yield 19 | for check_name in list(Check.REGISTERED_CUSTOM_CHECKS): 20 | del Check.REGISTERED_CUSTOM_CHECKS[check_name] 21 | 22 | 23 | @pytest.fixture(scope="session", params=ENGINES, autouse=True) 24 | def setup_modin_engine(request): 25 | """Set up the modin engine. 26 | 27 | Eventually this will also support dask execution backend. 28 | """ 29 | engine = request.param 30 | os.environ["MODIN_ENGINE"] = engine 31 | os.environ["MODIN_STORAGE_FORMAT"] = "pandas" 32 | os.environ["MODIN_MEMORY"] = "100000000" 33 | os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1" 34 | 35 | if engine == "ray": 36 | # pylint: disable=import-outside-toplevel 37 | import ray 38 | 39 | ray.init( 40 | runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}} 41 | ) 42 | yield 43 | ray.shutdown() 44 | 45 | elif engine == "dask": 46 | # pylint: disable=import-outside-toplevel 47 | from distributed import Client 48 | 49 | client = Client() 50 | yield 51 | client.close() 52 | else: 53 | raise ValueError(f"Not supported engine: {engine}") 54 | -------------------------------------------------------------------------------- /tests/modin/test_modin_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests of modin accessor functionality. 
2 | 3 | Since modin doesn't currently support the pandas accessor extension API, 4 | pandera implements it. 5 | """ 6 | 7 | import pytest 8 | 9 | from pandera.accessors import modin_accessor 10 | 11 | 12 | # pylint: disable=too-few-public-methods 13 | class CustomAccessor: 14 | """Mock accessor class""" 15 | 16 | def __init__(self, obj): 17 | self._obj = obj 18 | 19 | 20 | def test_modin_accessor_warning(): 21 | """Test that modin accessor raises warning when name already exists.""" 22 | modin_accessor.register_dataframe_accessor("foo")(CustomAccessor) 23 | with pytest.warns(UserWarning): 24 | modin_accessor.register_dataframe_accessor("foo")(CustomAccessor) 25 | -------------------------------------------------------------------------------- /tests/mypy/config/no_plugin.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | follow_imports = silent 4 | show_error_codes = True 5 | allow_redefinition = True 6 | warn_return_any = False 7 | warn_unused_configs = True 8 | -------------------------------------------------------------------------------- /tests/mypy/config/plugin_mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = pandera.mypy 3 | ignore_missing_imports = True 4 | follow_imports = skip 5 | show_error_codes = True 6 | allow_redefinition = True 7 | warn_return_any = False 8 | warn_unused_configs = True 9 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_concat.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | df = pd.DataFrame([[1]]) 5 | sr = pd.Series([1]) 6 | 7 | 8 | df_concat = pd.concat([df, df]) 9 | sr_concat = pd.concat([sr, sr]) 10 | sr_axis1_concat = pd.concat([sr, sr], axis=1) 11 | 12 | # mypy error without pandera plugin 13 | df_generator_concat: pd.DataFrame = pd.concat(df for _ in range(3)) 14 | 15 | # mypy error without pandera plugin 16 | sr_generator_concat: pd.Series = pd.concat(sr for _ in range(3)) 17 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_dataframe.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | """Unit tests for static type checking of dataframes. 
3 | 4 | This test module uses https://github.com/davidfritzsche/pytest-mypy-testing to 5 | run statically check the functions marked pytest.mark.mypy_testing 6 | """ 7 | 8 | from typing import Optional, cast 9 | 10 | import pandas as pd 11 | 12 | import pandera.pandas as pa 13 | from pandera.typing import DataFrame, Series 14 | 15 | 16 | class Schema(pa.DataFrameModel): 17 | id: Series[int] 18 | name: Series[str] 19 | 20 | 21 | class SchemaOut(pa.DataFrameModel): 22 | age: Series[int] 23 | 24 | 25 | class AnotherSchema(pa.DataFrameModel): 26 | id: Series[int] 27 | first_name: Optional[Series[str]] 28 | 29 | 30 | def fn(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 31 | return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay 32 | 33 | 34 | def fn_pipe_incorrect_type(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 35 | return df.assign(age=30).pipe(DataFrame[AnotherSchema]) # mypy error 36 | # error: Argument 1 to "pipe" of "NDFrame" has incompatible type "Type[DataFrame[Any]]"; # noqa 37 | # expected "Union[Callable[..., DataFrame[SchemaOut]], Tuple[Callable[..., DataFrame[SchemaOut]], str]]" [arg-type] # noqa 38 | 39 | 40 | def fn_assign_copy(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 41 | return df.assign(age=30) # mypy error 42 | # error: Incompatible return value type (got "pandas.core.frame.DataFrame", 43 | # expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value] 44 | 45 | 46 | # Define a few dataframe objects 47 | schema_df = DataFrame[Schema]({"id": [1], "name": ["foo"]}) 48 | pandas_df = pd.DataFrame({"id": [1], "name": ["foo"]}) 49 | another_df = DataFrame[AnotherSchema]({"id": [1], "first_name": ["foo"]}) 50 | 51 | 52 | fn(schema_df) # mypy okay 53 | 54 | fn(pandas_df) # mypy error 55 | # error: Argument 1 to "fn" has incompatible type "pandas.core.frame.DataFrame"; # noqa 56 | # expected "pandera.typing.pandas.DataFrame[Schema]" [arg-type] 57 | 58 | fn(another_df) # mypy error 59 | # error: Argument 1 to "fn" has incompatible type "DataFrame[AnotherSchema]"; 60 | # expected "DataFrame[Schema]" [arg-type] 61 | 62 | 63 | def fn_pipe_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 64 | return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay 65 | 66 | 67 | def fn_cast_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 68 | return cast(DataFrame[SchemaOut], df.assign(age=30)) # mypy okay 69 | 70 | 71 | @pa.check_types 72 | def fn_mutate_inplace(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 73 | out = df.assign(age=30).pipe(DataFrame[SchemaOut]) 74 | out.drop(columns="age", inplace=True) 75 | return out # okay for mypy, pandera raises error 76 | 77 | 78 | @pa.check_types 79 | def fn_assign_and_get_index(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 80 | return df.assign(foo=30).iloc[:3] # mypy error 81 | # error: Incompatible return value type (got "pandas.core.frame.DataFrame", 82 | # expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value] 83 | 84 | 85 | @pa.check_types 86 | def fn_cast_dataframe_invalid(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: 87 | return cast( 88 | DataFrame[SchemaOut], df 89 | ) # okay for mypy, pandera raises error 90 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_index.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | df = pd.DataFrame({"a": [1, 2, 3]}) 5 | sr = pd.Series([1, 2, 3]) 6 | idx = pd.Index([1, 2, 3]) 7 | 8 | 
df_index_unique: bool = df.index.is_unique 9 | sr_index_unique: bool = df["a"].index.is_unique 10 | idx_unique: bool = idx.is_unique 11 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_series.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | 5 | def fn(s: pd.Series[str]) -> bool: 6 | return True 7 | 8 | 9 | fn(s=pd.Series([1.0, 1.0, 1.0], dtype=float)) # mypy okay 10 | 11 | series = pd.Series([1.0, 1.0, 1.0], dtype=float) 12 | fn(series) # mypy able to determine `series` type, raises error 13 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandas_time.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | pd.Timestamp.now() + pd.tseries.offsets.YearEnd(1) 5 | 6 | pd.Timedelta(minutes=2) 7 | pd.Timedelta(2, unit="minutes") 8 | 9 | pd.Timedelta(minutes=2, seconds=30) 10 | pd.Timedelta(2.5, unit="minutes") # mypy error 11 | pd.Timedelta(2, unit="minutes") + pd.Timedelta(30, unit="seconds") 12 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandera_inheritance.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | """With the pandera.mypy plugin, mypy ignores type overrides.""" 3 | 4 | import pandera.pandas as pa 5 | 6 | 7 | class Schema(pa.DataFrameModel): 8 | a: pa.typing.Series[int] 9 | b: pa.typing.Series[str] 10 | c: pa.typing.Series[bool] 11 | 12 | 13 | class Schema2(Schema): 14 | a: pa.typing.Series[str] 15 | b: pa.typing.Series[float] 16 | c: pa.typing.Series[int] 17 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/pandera_types.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | import pandera.pandas as pa 5 | 6 | 7 | def fn(series: pa.typing.Series[int]) -> None: 8 | pass 9 | 10 | 11 | df = pd.DataFrame({"a": [1, 2, 3]}) 12 | sr = pd.Series([1, 2, 3]) 13 | 14 | fn(sr) 15 | fn(df["a"]) 16 | -------------------------------------------------------------------------------- /tests/mypy/pandas_modules/python_slice.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import pandas as pd 3 | 4 | df = pd.DataFrame({"a": [1, 2, 3]}, index=[*"abc"]) 5 | df.loc["a":"c"] 6 | -------------------------------------------------------------------------------- /tests/pandas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/pandas/__init__.py -------------------------------------------------------------------------------- /tests/pandas/checks_fixtures.py: -------------------------------------------------------------------------------- 1 | """Pytest fixtures for testing custom checks.""" 2 | 3 | from typing import Generator 4 | from unittest import mock 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | import pandera.pandas as pa 10 | import pandera.api.extensions as pa_ext 11 | 12 | __all__ = "custom_check_teardown", "extra_registered_checks" 13 | 14 | 15 | @pytest.fixture(scope="function") 16 | def custom_check_teardown() -> 
Generator[None, None, None]: 17 | """Remove all custom checks after execution of each pytest function.""" 18 | yield 19 | for check_name in list(pa.Check.REGISTERED_CUSTOM_CHECKS): 20 | del pa.Check.REGISTERED_CUSTOM_CHECKS[check_name] 21 | 22 | 23 | @pytest.fixture(scope="function") 24 | def extra_registered_checks() -> Generator[None, None, None]: 25 | """temporarily registers custom checks onto the Check class""" 26 | # pylint: disable=unused-variable 27 | with mock.patch( 28 | "pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict 29 | ): 30 | # register custom checks here 31 | @pa_ext.register_check_method() 32 | def no_param_check(_: pd.DataFrame) -> bool: 33 | return True 34 | 35 | @pa_ext.register_check_method() 36 | def no_param_check_ellipsis(_: pd.DataFrame) -> bool: 37 | return True 38 | 39 | @pa_ext.register_check_method() 40 | def raise_an_error_check(_: pd.DataFrame) -> bool: 41 | raise TypeError("Test error in custom check") 42 | 43 | yield 44 | -------------------------------------------------------------------------------- /tests/pandas/conftest.py: -------------------------------------------------------------------------------- 1 | """Registers fixtures for core""" 2 | 3 | # pylint: disable=unused-import 4 | from tests.pandas.checks_fixtures import ( 5 | custom_check_teardown, 6 | extra_registered_checks, 7 | ) 8 | -------------------------------------------------------------------------------- /tests/pandas/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/pandas/modules/__init__.py -------------------------------------------------------------------------------- /tests/pandas/modules/validate_on_init.py: -------------------------------------------------------------------------------- 1 | """Module for unit testing validation on initialization.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandera.pandas as pa 6 | from pandera.typing import DataFrame 7 | 8 | 9 | class ExampleSchema(pa.DataFrameModel): 10 | class Config: 11 | coerce = True 12 | 13 | a: np.int64 14 | 15 | 16 | ExampleDataFrame = DataFrame[ExampleSchema] 17 | validated_dataframe = ExampleDataFrame(pd.DataFrame([], columns=["a"])) 18 | -------------------------------------------------------------------------------- /tests/pandas/test__pandas_deprecated__test_model.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position,wildcard-import,unused-wildcard-import 2 | """Unit tests for the deprecated top-level pandera DataFrameModel class. 3 | 4 | Delete this file once the top-level pandera._pandas_deprecated module is 5 | removed. 
6 | """ 7 | 8 | import pytest 9 | from pandera._pandas_deprecated import DataFrameModel as _DataFrameModel 10 | 11 | 12 | @pytest.fixture(autouse=True) 13 | def monkeypatch_dataframe_model(monkeypatch): 14 | """Monkeypatch DataFrameModel before importing test_schemas""" 15 | monkeypatch.setattr( 16 | "tests.pandas.test_schemas.DataFrameModel", _DataFrameModel 17 | ) 18 | 19 | 20 | from tests.pandas.test_schemas import * 21 | -------------------------------------------------------------------------------- /tests/pandas/test__pandas_deprecated__test_schemas.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position,wildcard-import,unused-wildcard-import 2 | """Unit tests for the deprecated top-level pandera DataFrameSchema class. 3 | 4 | Delete this file once the top-level pandera._pandas_deprecated module is 5 | removed. 6 | """ 7 | 8 | import pytest 9 | from pandera._pandas_deprecated import DataFrameSchema as _DataFrameSchema 10 | 11 | 12 | @pytest.fixture(autouse=True) 13 | def monkeypatch_dataframe_schema(monkeypatch): 14 | """Monkeypatch DataFrameSchema before importing test_schemas""" 15 | monkeypatch.setattr( 16 | "tests.pandas.test_schemas.DataFrameSchema", _DataFrameSchema 17 | ) 18 | 19 | 20 | from tests.pandas.test_schemas import * 21 | -------------------------------------------------------------------------------- /tests/pandas/test_docs_setting_column_widths.py: -------------------------------------------------------------------------------- 1 | """Some of the doctest examples only work if the terminal is the correct width 2 | because of the way __str__/__repr__ works in pandas. This checks that 3 | conditions necessary for the doctests to pass properly exist on the host 4 | system.""" 5 | 6 | import pandas as pd 7 | 8 | from docs.source import conf 9 | 10 | 11 | def test_sphinx_doctest_setting_global_pandas_conditions() -> None: 12 | """Checks that no limit is set on the height/width of the __repr__/__str__ 13 | print of a pd.DataFrame to ensure doctest performs consistently across 14 | different Operating Systems.""" 15 | # pylint: disable=W0122 16 | exec(conf.doctest_global_setup) 17 | 18 | max_cols_after_being_set = pd.options.display.max_columns 19 | max_rows_after_being_set = pd.options.display.max_rows 20 | assert max_cols_after_being_set is None 21 | assert max_rows_after_being_set is None 22 | -------------------------------------------------------------------------------- /tests/pandas/test_engine_utils.py: -------------------------------------------------------------------------------- 1 | """Unit tests for engine module utility functions.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from pandera.engines import utils 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "data_container, data_type, expected_failure_cases", 12 | [ 13 | [pd.Series(list("ab1cd3")), int, [False, False, True] * 2], 14 | [pd.Series(list("12345")), int, [True] * 5], 15 | [pd.Series([1, 2, "foo", "bar"]), float, [True, True, False, False]], 16 | ], 17 | ) 18 | def test_numpy_pandas_coercible( 19 | data_container, data_type, expected_failure_cases 20 | ): 21 | """Test that the correct boolean Series outputs are returned.""" 22 | assert ( 23 | expected_failure_cases 24 | == utils.numpy_pandas_coercible(data_container, data_type).tolist() 25 | ) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | "data_container", 30 | [ 31 | pd.Series([1, 2, 3, 4]), 32 | np.array([1, 2, 3, 4]), 33 | pd.DataFrame({0: [1, 2, 
3, 4]}), 34 | np.array([[1], [2], [3], [4]]), 35 | ], 36 | ) 37 | def test_numpy_pandas_coerce_failure_cases(data_container): 38 | """ 39 | Test that different data container types can be checked for coerce failure 40 | cases. 41 | """ 42 | failure_cases = utils.numpy_pandas_coerce_failure_cases( 43 | data_container, int 44 | ) 45 | assert failure_cases is None 46 | 47 | 48 | @pytest.mark.parametrize( 49 | "invalid_data_container, exception_type", 50 | [ 51 | [1, TypeError], 52 | [5.1, TypeError], 53 | ["foobar", TypeError], 54 | [[1, 2, 3], TypeError], 55 | [{0: 1}, TypeError], 56 | # pylint: disable=too-many-function-args 57 | [np.array([1]).reshape(1, 1, 1), ValueError], 58 | ], 59 | ) 60 | def test_numpy_pandas_coerce_failure_cases_exceptions( 61 | invalid_data_container, exception_type 62 | ): 63 | """ 64 | Test exceptions of trying to get failure cases for invalid input types. 65 | """ 66 | error_msg = { 67 | TypeError: "type of data_container .+ not understood", 68 | ValueError: "only numpy arrays of 1 or 2 dimensions are supported", 69 | }[exception_type] 70 | with pytest.raises(exception_type, match=error_msg): 71 | utils.numpy_pandas_coerce_failure_cases(invalid_data_container, int) 72 | -------------------------------------------------------------------------------- /tests/pandas/test_extension_modules.py: -------------------------------------------------------------------------------- 1 | """Tests for extension module imports.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pandera.api.hypotheses import Hypothesis 7 | 8 | 9 | try: 10 | from scipy import stats # pylint: disable=unused-import 11 | except ImportError: # pragma: no cover 12 | SCIPY_INSTALLED = False 13 | else: 14 | SCIPY_INSTALLED = True 15 | 16 | 17 | def test_hypotheses_module_import() -> None: 18 | """Test that Hypothesis built-in methods raise import error.""" 19 | data = pd.Series([1, 2, 3]) 20 | if not SCIPY_INSTALLED: 21 | for fn, check_args in [ 22 | ( 23 | lambda: Hypothesis.two_sample_ttest("sample1", "sample2"), 24 | pd.DataFrame({"sample1": data, "sample2": data}), 25 | ), 26 | (lambda: Hypothesis.one_sample_ttest(popmean=10), data), 27 | ]: 28 | with pytest.raises(ImportError): 29 | check = fn() 30 | check(check_args) 31 | -------------------------------------------------------------------------------- /tests/pandas/test_model_components.py: -------------------------------------------------------------------------------- 1 | """Tests individual model components.""" 2 | 3 | from typing import Any 4 | 5 | import pytest 6 | 7 | import pandera.pandas as pa 8 | from pandera.engines.pandas_engine import Engine 9 | 10 | 11 | def test_field_to_column() -> None: 12 | """Test that Field outputs the correct column options.""" 13 | for flag in ["nullable", "unique", "coerce", "regex"]: 14 | for value in [True, False]: 15 | col_kwargs = pa.Field(**{flag: value}).column_properties( # type: ignore[arg-type] 16 | pa.DateTime, required=value 17 | ) 18 | col = pa.Column(**col_kwargs) 19 | assert col.dtype == Engine.dtype(pa.DateTime) 20 | assert col.properties[flag] == value 21 | assert col.required == value 22 | 23 | 24 | def test_field_to_index() -> None: 25 | """Test that Field outputs the correct index options.""" 26 | for flag in ["nullable", "unique"]: 27 | for value in [True, False]: 28 | index_kwargs = pa.Field(**{flag: value}).index_properties( # type: ignore[arg-type] 29 | pa.DateTime 30 | ) 31 | index = pa.Index(**index_kwargs) 32 | assert index.dtype == Engine.dtype(pa.DateTime) 33 | assert 
getattr(index, flag) == value 34 | 35 | 36 | def test_field_no_checks() -> None: 37 | """Test Field without checks.""" 38 | assert not pa.Field().column_properties(str)["checks"] 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "arg,value,expected", 43 | [ 44 | ("eq", 9, pa.Check.equal_to(9)), 45 | ("ne", 9, pa.Check.not_equal_to(9)), 46 | ("gt", 9, pa.Check.greater_than(9)), 47 | ("ge", 9, pa.Check.greater_than_or_equal_to(9)), 48 | ("lt", 9, pa.Check.less_than(9)), 49 | ("le", 9, pa.Check.less_than_or_equal_to(9)), 50 | ( 51 | "in_range", 52 | {"min_value": 1, "max_value": 9}, 53 | pa.Check.in_range(1, 9), 54 | ), 55 | ("isin", [9, "a"], pa.Check.isin([9, "a"])), 56 | ("notin", [9, "a"], pa.Check.notin([9, "a"])), 57 | ("str_contains", "a", pa.Check.str_contains("a")), 58 | ("str_endswith", "a", pa.Check.str_endswith("a")), 59 | ("str_matches", "a", pa.Check.str_matches("a")), 60 | ( 61 | "str_length", 62 | {"min_value": 1, "max_value": 9}, 63 | pa.Check.str_length(1, 9), 64 | ), 65 | ("str_startswith", "a", pa.Check.str_startswith("a")), 66 | ], 67 | ) 68 | def test_field_checks(arg: str, value: Any, expected: pa.Check) -> None: 69 | """Test that all built-in checks are available in a Field.""" 70 | checks = pa.Field(**{arg: value}).column_properties(str)["checks"] 71 | assert len(checks) == 1 72 | assert checks[0] == expected 73 | -------------------------------------------------------------------------------- /tests/pandas/test_multithreaded.py: -------------------------------------------------------------------------------- 1 | """Test that pandera schemas are thread safe.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from joblib import Parallel, delayed 6 | 7 | import pandera.pandas as pa 8 | 9 | 10 | class Model(pa.DataFrameModel): 11 | time: pa.typing.Series[np.float32] = pa.Field(coerce=True) 12 | 13 | 14 | def validate_df(df): 15 | validated_df = Model.to_schema().validate(df) 16 | assert validated_df.dtypes["time"] == np.float32 17 | return validated_df 18 | 19 | 20 | def test_single_thread(): 21 | df = pd.DataFrame({"time": np.array([1.0, 2.0, 3.0], dtype=np.float64)}) 22 | validate_df(df) 23 | 24 | 25 | def test_multithreading(): 26 | df = pd.DataFrame({"time": np.array([1.0, 2.0, 3.0], dtype=np.float64)}) 27 | n_tries = 10 28 | total = 8 29 | n_jobs = 4 30 | 31 | for _ in range(n_tries): 32 | results = Parallel(n_jobs=n_jobs, prefer="threads")( 33 | delayed(validate_df)(df) for _ in range(total) 34 | ) 35 | for res in results: 36 | assert res.dtypes["time"] == np.float32 37 | -------------------------------------------------------------------------------- /tests/pandas/test_numpy_engine.py: -------------------------------------------------------------------------------- 1 | """Test numpy engine.""" 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from pandera.engines import numpy_engine 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "data_type", list(numpy_engine.Engine.get_registered_dtypes()) 11 | ) 12 | def test_numpy_data_type(data_type): 13 | """Test base numpy engine DataType.""" 14 | numpy_engine.Engine.dtype(data_type) 15 | numpy_engine.Engine.dtype(data_type.type) 16 | numpy_engine.Engine.dtype(str(data_type.type)) 17 | with pytest.warns(UserWarning): 18 | np_dtype = numpy_engine.DataType(data_type.type) 19 | with pytest.warns(UserWarning): 20 | np_dtype_from_str = numpy_engine.DataType(str(data_type.type)) 21 | assert np_dtype == np_dtype_from_str 22 | 23 | 24 | @pytest.mark.parametrize("data_type", ["foo", "bar", 1, 2, 3.14, np.void]) 25 | def 
test_numpy_engine_dtype_exceptions(data_type): 26 | """Test invalid inputs to numpy data-types.""" 27 | if data_type != np.void: 28 | with pytest.raises( 29 | TypeError, match="data type '.+' not understood by" 30 | ): 31 | numpy_engine.Engine.dtype(data_type) 32 | else: 33 | numpy_engine.Engine._registered_dtypes = set() 34 | numpy_engine.Engine.dtype(data_type) 35 | 36 | 37 | def test_numpy_string(): 38 | """Test numpy engine String data type coercion.""" 39 | # pylint: disable=no-value-for-parameter 40 | string_type = numpy_engine.String() 41 | assert ( 42 | string_type.coerce(np.array([1, 2, 3, 4, 5], dtype=int)) 43 | == np.array(list("12345")) 44 | ).all() 45 | assert string_type.check(numpy_engine.String()) 46 | -------------------------------------------------------------------------------- /tests/pandas/test_pandas_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests for pandas_accessor module.""" 2 | 3 | from typing import Union 4 | from unittest.mock import patch 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | import pandera.pandas as pa 10 | import pandera.api.pandas.container 11 | from pandera.errors import BackendNotFoundError 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "schema1, schema2, data, invalid_data", 16 | [ 17 | [ 18 | pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True), 19 | pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True), 20 | pd.DataFrame({"col": [1, 2, 3]}), 21 | pd.Series([1, 2, 3]), 22 | ], 23 | [ 24 | pa.SeriesSchema(int, coerce=True), 25 | pa.SeriesSchema(float, coerce=True), 26 | pd.Series([1, 2, 3]), 27 | pd.DataFrame({"col": [1, 2, 3]}), 28 | ], 29 | ], 30 | ) 31 | @pytest.mark.parametrize("inplace", [False, True]) 32 | def test_dataframe_series_add_schema( 33 | schema1: Union[pa.DataFrameSchema, pa.SeriesSchema], 34 | schema2: Union[pa.DataFrameSchema, pa.SeriesSchema], 35 | data: Union[pd.DataFrame, pd.Series], 36 | invalid_data: Union[pd.DataFrame, pd.Series], 37 | inplace: bool, 38 | ) -> None: 39 | """ 40 | Test that pandas object contains schema metadata after pandera validation. 
41 | """ 42 | validated_data_1 = schema1(data, inplace=inplace) # type: ignore 43 | if inplace: 44 | assert data.pandera.schema == schema1 45 | else: 46 | assert data.pandera.schema is None 47 | assert validated_data_1.pandera.schema == schema1 48 | 49 | validated_data_2 = schema2(validated_data_1, inplace=inplace) # type: ignore 50 | if inplace: 51 | assert validated_data_1.pandera.schema == schema2 52 | else: 53 | assert validated_data_1.pandera.schema == schema1 54 | assert validated_data_2.pandera.schema == schema2 55 | 56 | with pytest.raises((BackendNotFoundError, TypeError)): 57 | schema1(invalid_data) # type: ignore 58 | 59 | with pytest.raises((BackendNotFoundError, TypeError)): 60 | schema2(invalid_data) # type: ignore 61 | 62 | with patch.object( 63 | pandera.backends.pandas.container, 64 | "is_table", 65 | return_value=True, 66 | ): 67 | with patch.object( 68 | pandera.api.pandas.array, 69 | "is_field", 70 | return_value=True, 71 | ): 72 | with pytest.raises(BackendNotFoundError): 73 | schema1(invalid_data) # type: ignore 74 | 75 | with pytest.raises(BackendNotFoundError): 76 | schema2(invalid_data) # type: ignore 77 | -------------------------------------------------------------------------------- /tests/pandas/test_pandas_config.py: -------------------------------------------------------------------------------- 1 | """This module is to test the behaviour change based on defined config in pandera""" 2 | 3 | # pylint:disable=import-outside-toplevel,abstract-method,redefined-outer-name 4 | 5 | from dataclasses import asdict 6 | 7 | import pandas as pd 8 | import pytest 9 | 10 | import pandera.pandas as pa 11 | from pandera.pandas import DataFrameModel, DataFrameSchema, SeriesSchema 12 | from pandera.config import ValidationDepth, config_context, get_config_context 13 | 14 | 15 | @pytest.fixture(autouse=True, scope="function") 16 | def disable_validation(): 17 | """Fixture to disable validation and clean up after the test is finished""" 18 | with config_context(validation_enabled=False): 19 | yield 20 | 21 | 22 | class TestPandasDataFrameConfig: 23 | """Class to test all the different configs types""" 24 | 25 | sample_data = pd.DataFrame( 26 | (("Bread", 9), ("Cutter", 15)), columns=["product", "price_val"] 27 | ) 28 | 29 | # pylint: disable=unused-argument 30 | def test_disable_validation(self): 31 | """This function validates that a none object is loaded if validation is disabled""" 32 | 33 | pandera_schema = DataFrameSchema( 34 | { 35 | "product": pa.Column( 36 | str, pa.Check(lambda s: s.str.startswith("B")) 37 | ), 38 | "price_val": pa.Column(int), 39 | } 40 | ) 41 | 42 | class TestSchema(DataFrameModel): 43 | """Test Schema class""" 44 | 45 | product: str = pa.Field(str_startswith="B") 46 | price_val: int = pa.Field() 47 | 48 | expected = { 49 | "cache_dataframe": False, 50 | "keep_cached_dataframe": False, 51 | "validation_enabled": False, 52 | "validation_depth": ValidationDepth.SCHEMA_AND_DATA, 53 | } 54 | 55 | assert asdict(get_config_context()) == expected 56 | assert pandera_schema.validate(self.sample_data) is self.sample_data 57 | assert TestSchema.validate(self.sample_data) is self.sample_data 58 | 59 | 60 | class TestPandasSeriesConfig: 61 | """Class to test all the different configs types""" 62 | 63 | sample_data = pd.Series([1, 1, 2, 2, 3, 3]) 64 | 65 | # pylint: disable=unused-argument 66 | def test_disable_validation(self): 67 | """This function validates that a none object is loaded if validation is disabled""" 68 | expected = { 69 | "cache_dataframe": 
False, 70 | "keep_cached_dataframe": False, 71 | "validation_enabled": False, 72 | "validation_depth": ValidationDepth.SCHEMA_AND_DATA, 73 | } 74 | pandera_schema = SeriesSchema( 75 | int, pa.Check(lambda s: s.value_counts() == 2, element_wise=False) 76 | ) 77 | assert asdict(get_config_context()) == expected 78 | assert pandera_schema.validate(self.sample_data) is self.sample_data 79 | -------------------------------------------------------------------------------- /tests/pandas/test_pandas_parallel.py: -------------------------------------------------------------------------------- 1 | """Test parallelization with pandas using joblib.""" 2 | 3 | import pandas as pd 4 | from joblib import Parallel, delayed 5 | 6 | from pandera.pandas import Column, DataFrameSchema 7 | 8 | schema = DataFrameSchema({"a": Column("int64")}, coerce=True) 9 | 10 | 11 | def test_polars_parallel(): 12 | def fn(): 13 | return schema.validate(pd.DataFrame({"a": [1]})) 14 | 15 | results = Parallel(2)([delayed(fn)() for _ in range(10)]) 16 | assert len(results) == 10 17 | for result in results: 18 | assert result.dtypes["a"] == "int64" 19 | -------------------------------------------------------------------------------- /tests/pandas/test_pydantic_dtype.py: -------------------------------------------------------------------------------- 1 | """Unit tests for pydantic datatype.""" 2 | 3 | from typing import Type 4 | 5 | import pandas as pd 6 | import pytest 7 | from pydantic import BaseModel 8 | 9 | import pandera.pandas as pa 10 | from pandera.api.pandas.array import ArraySchema 11 | from pandera.engines.pandas_engine import PydanticModel 12 | 13 | 14 | class Record(BaseModel): 15 | """Pydantic record model.""" 16 | 17 | name: str 18 | xcoord: int 19 | ycoord: int 20 | 21 | 22 | class PydanticSchema(pa.DataFrameModel): 23 | """Pandera schema using the pydantic model.""" 24 | 25 | class Config: 26 | """Config with dataframe-level data type.""" 27 | 28 | dtype = PydanticModel(Record) 29 | 30 | 31 | class PanderaSchema(pa.DataFrameModel): 32 | """Pandera schema that's equivalent to PydanticSchema.""" 33 | 34 | name: pa.typing.Series[str] 35 | xcoord: pa.typing.Series[int] 36 | ycoord: pa.typing.Series[int] 37 | 38 | 39 | def test_pydantic_model(): 40 | """Test that pydantic model correctly validates data.""" 41 | 42 | @pa.check_types 43 | def func(df: pa.typing.DataFrame[PydanticSchema]): 44 | return df 45 | 46 | valid_df = pd.DataFrame( 47 | { 48 | "name": ["foo", "bar", "baz"], 49 | "xcoord": [1.0, 2, 3], 50 | "ycoord": [4, 5.0, 6], 51 | } 52 | ) 53 | 54 | invalid_df = pd.DataFrame( 55 | { 56 | "name": ["foo", "bar", "baz"], 57 | "xcoord": [1, 2, "c"], 58 | "ycoord": [4, 5, "d"], 59 | } 60 | ) 61 | 62 | validated = func(valid_df) 63 | PanderaSchema.validate(validated) 64 | 65 | expected_failure_cases = pd.DataFrame( 66 | {"index": [2], "failure_case": ["{'xcoord': 'c', 'ycoord': 'd'}"]} 67 | ) 68 | 69 | try: 70 | func(invalid_df) 71 | except pa.errors.SchemaError as exc: 72 | pd.testing.assert_frame_equal( 73 | exc.failure_cases, expected_failure_cases 74 | ) 75 | 76 | 77 | @pytest.mark.parametrize("series_type", [pa.SeriesSchema, pa.Column, pa.Index]) 78 | def test_pydantic_model_init_errors(series_type: Type[ArraySchema]): 79 | """ 80 | Should raise SchemaInitError with PydanticModel as `SeriesSchemaBase.dtype` 81 | """ 82 | with pytest.raises(pa.errors.SchemaInitError): 83 | series_type(dtype=PydanticModel(Record)) 84 | 85 | 86 | @pytest.mark.parametrize("coerce", [True, False]) 87 | def 
test_pydantic_model_coerce(coerce: bool): 88 | """Test that DataFrameSchema.coerce is always True with pydantic model""" 89 | 90 | dataframe_schema = pa.DataFrameSchema( 91 | dtype=PydanticModel(Record), coerce=coerce 92 | ) 93 | assert dataframe_schema.coerce is True 94 | -------------------------------------------------------------------------------- /tests/pandas/test_validation_depth.py: -------------------------------------------------------------------------------- 1 | """Unit tests for granular control based on validation depth.""" 2 | 3 | import pytest 4 | 5 | from pandera.backends.base import CoreCheckResult 6 | from pandera.config import ValidationDepth, ValidationScope, config_context 7 | from pandera.validation_depth import validate_scope 8 | 9 | 10 | def custom_backend(): 11 | class CustomBackend: 12 | 13 | # pylint: disable=unused-argument 14 | @validate_scope(ValidationScope.SCHEMA) 15 | def check_schema(self, check_obj): 16 | # core check result is passed as True when validation scope doesn't 17 | # include schema checks 18 | return CoreCheckResult(passed=False) 19 | 20 | # pylint: disable=unused-argument 21 | @validate_scope(ValidationScope.DATA) 22 | def check_data(self, check_obj): 23 | # core check result is passed as True when validation scope doesn't 24 | # include data checks 25 | return CoreCheckResult(passed=False) 26 | 27 | return CustomBackend() 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "validation_depth,expected", 32 | [ 33 | [ValidationDepth.SCHEMA_ONLY, [False, True]], 34 | [ValidationDepth.DATA_ONLY, [True, False]], 35 | [ValidationDepth.SCHEMA_AND_DATA, [False, False]], 36 | [None, [False, False]], 37 | ], 38 | ) 39 | def test_validate_scope(validation_depth, expected): 40 | 41 | with config_context(validation_depth=validation_depth): 42 | backend = custom_backend() 43 | schema_result = backend.check_schema("foo") 44 | data_result = backend.check_data("foo") 45 | results = [schema_result.passed, data_result.passed] 46 | assert results == expected 47 | -------------------------------------------------------------------------------- /tests/polars/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/polars/__init__.py -------------------------------------------------------------------------------- /tests/polars/conftest.py: -------------------------------------------------------------------------------- 1 | """Polars unit test-specific configuration.""" 2 | 3 | import pytest 4 | 5 | from pandera.config import CONFIG, ValidationDepth, reset_config_context 6 | 7 | 8 | @pytest.fixture(scope="function", autouse=True) 9 | def validation_depth_schema_and_data(): 10 | """ 11 | These tests ensure that the validation depth is set to SCHEMA_AND_DATA 12 | for unit tests. 
13 | """ 14 | _validation_depth = CONFIG.validation_depth 15 | CONFIG.validation_depth = ValidationDepth.SCHEMA_AND_DATA 16 | try: 17 | yield 18 | finally: 19 | CONFIG.validation_depth = _validation_depth 20 | reset_config_context() 21 | -------------------------------------------------------------------------------- /tests/polars/test_polars_dataframe_generic.py: -------------------------------------------------------------------------------- 1 | """Unit tests for polars LazyFrame generic.""" 2 | 3 | import polars as pl 4 | import pytest 5 | 6 | import pandera.polars as pa 7 | from pandera.typing.polars import LazyFrame, Series 8 | 9 | 10 | def test_series_annotation(): 11 | class Model(pa.DataFrameModel): 12 | col1: Series[pl.Int64] 13 | 14 | data = pl.LazyFrame( 15 | { 16 | "col1": [1, 2, 3], 17 | } 18 | ) 19 | 20 | assert data.collect().equals(Model.validate(data).collect()) 21 | 22 | invalid_data = data.cast({"col1": pl.Float64}) 23 | with pytest.raises(pa.errors.SchemaError): 24 | Model.validate(invalid_data).collect() 25 | 26 | 27 | def test_lazyframe_generic_simple(): 28 | class Model(pa.DataFrameModel): 29 | col1: pl.Int64 30 | col2: pl.Utf8 31 | col3: pl.Float64 32 | 33 | @pa.check_types 34 | def fn(lf: LazyFrame[Model]) -> LazyFrame[Model]: 35 | return lf 36 | 37 | data = pl.LazyFrame( 38 | { 39 | "col1": [1, 2, 3], 40 | "col2": [*"abc"], 41 | "col3": [1.0, 2.0, 3.0], 42 | } 43 | ) 44 | 45 | assert data.collect().equals(fn(data).collect()) 46 | 47 | invalid_data = data.cast({"col3": pl.Int64}) 48 | with pytest.raises(pa.errors.SchemaError): 49 | fn(invalid_data).collect() 50 | 51 | 52 | def test_lazyframe_generic_transform(): 53 | class Input(pa.DataFrameModel): 54 | col1: pl.Int64 55 | col2: pl.Utf8 56 | 57 | class Output(Input): 58 | col3: pl.Float64 59 | 60 | @pa.check_types 61 | def fn(lf: LazyFrame[Input]) -> LazyFrame[Output]: 62 | return lf.with_columns(col3=pl.lit(3.0)) # type: ignore 63 | 64 | @pa.check_types 65 | def invalid_fn(lf: LazyFrame[Input]) -> LazyFrame[Output]: 66 | return lf # type: ignore 67 | 68 | data = pl.LazyFrame( 69 | { 70 | "col1": [1, 2, 3], 71 | "col2": [*"abc"], 72 | } 73 | ) 74 | 75 | assert isinstance(fn(data).collect(), pl.DataFrame) 76 | 77 | with pytest.raises(pa.errors.SchemaError): 78 | invalid_fn(data).collect() 79 | -------------------------------------------------------------------------------- /tests/polars/test_polars_decorators.py: -------------------------------------------------------------------------------- 1 | """Unit tests for using schemas with polars and function decorators.""" 2 | 3 | import polars as pl 4 | import pytest 5 | 6 | import pandera.polars as pa 7 | import pandera.typing.polars as pa_typing 8 | 9 | 10 | @pytest.fixture 11 | def data() -> pl.DataFrame: 12 | return pl.DataFrame({"a": [1, 2, 3]}) 13 | 14 | 15 | @pytest.fixture 16 | def invalid_data(data) -> pl.DataFrame: 17 | return data.rename({"a": "b"}) 18 | 19 | 20 | def test_polars_dataframe_check_io(data, invalid_data): 21 | # pylint: disable=unused-argument 22 | 23 | schema = pa.DataFrameSchema({"a": pa.Column(int)}) 24 | 25 | @pa.check_input(schema) 26 | def fn_check_input(x): ... 
27 | 28 | @pa.check_output(schema) 29 | def fn_check_output(x): 30 | return x 31 | 32 | @pa.check_io(x=schema, out=schema) 33 | def fn_check_io(x): 34 | return x 35 | 36 | @pa.check_io(x=schema, out=schema) 37 | def fn_check_io_invalid(x): 38 | return x.rename({"a": "b"}) 39 | 40 | # valid data should pass 41 | fn_check_input(data) 42 | fn_check_output(data) 43 | fn_check_io(data) 44 | 45 | # invalid data or invalid function should not pass 46 | with pytest.raises(pa.errors.SchemaError): 47 | fn_check_input(invalid_data) 48 | 49 | with pytest.raises(pa.errors.SchemaError): 50 | fn_check_output(invalid_data) 51 | 52 | with pytest.raises(pa.errors.SchemaError): 53 | fn_check_io_invalid(data) 54 | 55 | 56 | def test_polars_dataframe_check_types(data, invalid_data): 57 | # pylint: disable=unused-argument 58 | 59 | class Model(pa.DataFrameModel): 60 | a: int 61 | 62 | @pa.check_types 63 | def fn_check_input(x: pa_typing.DataFrame[Model]): ... 64 | 65 | @pa.check_types 66 | def fn_check_output(x) -> pa_typing.DataFrame[Model]: 67 | return x 68 | 69 | @pa.check_types 70 | def fn_check_io( 71 | x: pa_typing.DataFrame[Model], 72 | ) -> pa_typing.DataFrame[Model]: 73 | return x 74 | 75 | @pa.check_types 76 | def fn_check_io_invalid( 77 | x: pa_typing.DataFrame[Model], 78 | ) -> pa_typing.DataFrame[Model]: 79 | return x.rename({"a": "b"}) # type: ignore 80 | 81 | # valid data should pass 82 | fn_check_input(data) 83 | fn_check_output(data) 84 | fn_check_io(data) 85 | 86 | # invalid data or invalid function should not pass 87 | with pytest.raises(pa.errors.SchemaError): 88 | fn_check_input(invalid_data) 89 | 90 | with pytest.raises(pa.errors.SchemaError): 91 | fn_check_output(invalid_data) 92 | 93 | with pytest.raises(pa.errors.SchemaError): 94 | fn_check_io_invalid(data) 95 | -------------------------------------------------------------------------------- /tests/polars/test_polars_parallel.py: -------------------------------------------------------------------------------- 1 | """Test parallelization with polars using joblib.""" 2 | 3 | import polars as pl 4 | from joblib import Parallel, delayed 5 | 6 | from pandera.polars import Column, DataFrameSchema 7 | 8 | schema = DataFrameSchema({"a": Column(pl.Int32)}, coerce=True) 9 | 10 | 11 | def test_polars_parallel(): 12 | def fn(): 13 | return schema.validate(pl.DataFrame({"a": [1]})) 14 | 15 | results = Parallel(2)([delayed(fn)() for _ in range(10)]) 16 | assert len(results) == 10 17 | for result in results: 18 | assert result.schema["a"] == pl.Int32 19 | -------------------------------------------------------------------------------- /tests/polars/test_polars_strategies.py: -------------------------------------------------------------------------------- 1 | """Unit tests for polars strategy methods.""" 2 | 3 | import pytest 4 | 5 | import pandera.polars as pa 6 | 7 | 8 | def test_dataframe_schema_strategy(): 9 | schema = pa.DataFrameSchema() 10 | 11 | with pytest.raises(NotImplementedError): 12 | schema.strategy() 13 | 14 | with pytest.raises(NotImplementedError): 15 | schema.example() 16 | 17 | 18 | def test_column_schema_strategy(): 19 | column_schema = pa.Column(str) 20 | 21 | with pytest.raises(NotImplementedError): 22 | column_schema.strategy() 23 | 24 | with pytest.raises(NotImplementedError): 25 | column_schema.example() 26 | 27 | with pytest.raises(NotImplementedError): 28 | column_schema.strategy_component() 29 | -------------------------------------------------------------------------------- /tests/pyspark/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Init file for pyspark tests""" 2 | -------------------------------------------------------------------------------- /tests/pyspark/test_pyspark_accessor.py: -------------------------------------------------------------------------------- 1 | """Unit tests for pyspark_accessor module.""" 2 | 3 | from typing import Union 4 | 5 | import pytest 6 | from pyspark.sql import DataFrame, SparkSession 7 | from pyspark.sql.functions import col 8 | from pyspark.sql.types import FloatType, LongType 9 | 10 | import pandera.pyspark as pa 11 | from pandera.config import PanderaConfig, ValidationDepth 12 | from pandera.pyspark import pyspark_sql_accessor 13 | 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "schema1, schema2, data, invalid_data", 19 | [ 20 | [ 21 | pa.DataFrameSchema({"col": pa.Column("long")}, coerce=True), 22 | pa.DataFrameSchema({"col": pa.Column("float")}, coerce=False), 23 | spark.createDataFrame([{"col": 1}, {"col": 2}, {"col": 3}]), 24 | spark.createDataFrame([{"col": 1}, {"col": 2}, {"col": 3}]), 25 | ], 26 | ], 27 | ) 28 | def test_dataframe_add_schema( 29 | schema1: pa.DataFrameSchema, 30 | schema2: pa.DataFrameSchema, 31 | data: Union[DataFrame, col], 32 | invalid_data: Union[DataFrame, col], 33 | config_params: PanderaConfig, 34 | ) -> None: 35 | """ 36 | Test that pyspark object contains schema metadata after pandera validation. 37 | """ 38 | schema1(data) # type: ignore[arg-type] 39 | 40 | assert data.pandera.schema == schema1 41 | assert isinstance(schema1.validate(data), DataFrame) 42 | assert isinstance(schema1(data), DataFrame) 43 | if config_params.validation_depth != ValidationDepth.DATA_ONLY: 44 | assert dict(schema2(invalid_data).pandera.errors["SCHEMA"]) == { 45 | "WRONG_DATATYPE": [ 46 | { 47 | "schema": None, 48 | "column": "col", 49 | "check": f"dtype('{str(FloatType())}')", 50 | "error": f"expected column 'col' to have type {str(FloatType())}, got {str(LongType())}", 51 | } 52 | ] 53 | } # type: ignore[arg-type] 54 | 55 | 56 | class CustomAccessor: 57 | """Mock accessor class""" 58 | 59 | def __init__(self, obj): 60 | self._obj = obj 61 | 62 | 63 | def test_modin_accessor_warning(): 64 | """Test that modin accessor raises warning when name already exists.""" 65 | pyspark_sql_accessor.register_dataframe_accessor("foo")(CustomAccessor) 66 | with pytest.warns(UserWarning): 67 | pyspark_sql_accessor.register_dataframe_accessor("foo")(CustomAccessor) 68 | -------------------------------------------------------------------------------- /tests/pyspark/test_pyspark_engine.py: -------------------------------------------------------------------------------- 1 | """Tests Engine subclassing and registering DataTypes.Test pyspark engine.""" 2 | 3 | # pylint:disable=redefined-outer-name,unused-argument 4 | 5 | import pytest 6 | 7 | from pandera.engines import pyspark_engine 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "data_type", 12 | list( 13 | pyspark_engine.Engine.get_registered_dtypes() 14 | ), # pylint:disable=no-value-for-parameter 15 | ) 16 | def test_pyspark_data_type(data_type): 17 | """Test pyspark engine DataType base class.""" 18 | if data_type.type is None: 19 | # don't test data types that require parameters e.g. 
Category 20 | return 21 | parameterized_datatypes = ["decimal", "array", "map"] 22 | 23 | pyspark_engine.Engine.dtype( 24 | data_type 25 | ) # pylint:disable=no-value-for-parameter 26 | pyspark_engine.Engine.dtype( 27 | data_type.type 28 | ) # pylint:disable=no-value-for-parameter 29 | if data_type.type.typeName() not in parameterized_datatypes: 30 | pyspark_engine.Engine.dtype( 31 | str(data_type.type) 32 | ) # pylint:disable=no-value-for-parameter 33 | 34 | with pytest.warns(UserWarning): 35 | pd_dtype = pyspark_engine.DataType(data_type.type) 36 | if data_type.type.typeName() not in parameterized_datatypes: 37 | with pytest.warns(UserWarning): 38 | pd_dtype_from_str = pyspark_engine.DataType(str(data_type.type)) 39 | assert pd_dtype == pd_dtype_from_str 40 | -------------------------------------------------------------------------------- /tests/strategies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unionai-oss/pandera/4daf6854b1e4e4b87f5d6fa55f8d2a6a98256a73/tests/strategies/__init__.py -------------------------------------------------------------------------------- /tests/test_inspection_utils.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-function-docstring,missing-module-docstring 2 | # pylint: disable=missing-class-docstring,bad-mcs-classmethod-argument 3 | from pandera.inspection_utils import ( 4 | is_classmethod_from_meta, 5 | is_decorated_classmethod, 6 | ) 7 | 8 | 9 | class SomeMeta(type): 10 | def __new__(mcs, *args, **kwargs): 11 | return super().__new__(mcs, *args, **kwargs) 12 | 13 | def __init__(cls, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | 16 | def regular_method_meta(cls): 17 | return cls 18 | 19 | @classmethod 20 | def class_method_meta(mcs): 21 | return mcs 22 | 23 | @staticmethod 24 | def static_method_meta(): 25 | return 1 26 | 27 | 28 | class SomeClass(metaclass=SomeMeta): 29 | def regular_method(self): 30 | return self 31 | 32 | @classmethod 33 | def class_method(cls): 34 | return cls 35 | 36 | @staticmethod 37 | def static_method(): 38 | return 2 39 | 40 | 41 | class SomeChild(SomeClass): 42 | def regular_method_child(self): 43 | return self 44 | 45 | @classmethod 46 | def class_method_child(cls): 47 | return cls 48 | 49 | @staticmethod 50 | def static_method_child(): 51 | return 3 52 | 53 | 54 | def test_is_decorated_classmethod() -> None: 55 | some_instance = SomeClass() 56 | some_child = SomeChild() 57 | 58 | cls_methods_with_deco = { 59 | SomeMeta.class_method_meta, 60 | SomeClass.class_method_meta, 61 | SomeClass.class_method, 62 | SomeChild.class_method_meta, 63 | SomeChild.class_method, 64 | SomeChild.class_method_child, 65 | } 66 | 67 | cls_methods_from_meta = { 68 | SomeClass.regular_method_meta, 69 | SomeChild.regular_method_meta, 70 | } 71 | 72 | all_methods = { 73 | # from meta 74 | SomeMeta.class_method_meta, 75 | SomeMeta.static_method_meta, 76 | # from parent 77 | SomeClass.class_method_meta, 78 | SomeClass.regular_method_meta, 79 | SomeClass.static_method_meta, 80 | SomeClass.class_method, 81 | some_instance.regular_method, 82 | SomeClass.static_method, 83 | # from child 84 | SomeChild.class_method_meta, 85 | SomeChild.regular_method_meta, 86 | SomeChild.static_method_meta, 87 | SomeChild.class_method, 88 | some_child.regular_method, 89 | SomeChild.static_method, 90 | SomeChild.class_method_child, 91 | some_child.regular_method_child, 92 | SomeChild.static_method_child, 93 | } 94 
| 95 | for method in cls_methods_with_deco: 96 | assert is_decorated_classmethod(method), f"{method} is decorated" 97 | for method in all_methods - cls_methods_with_deco: 98 | assert not is_decorated_classmethod( 99 | method 100 | ), f"{method} is not decorated" 101 | for method in cls_methods_from_meta: 102 | assert is_classmethod_from_meta(method), f"{method} comes from meta" 103 | for method in all_methods - cls_methods_from_meta: 104 | assert not is_classmethod_from_meta( 105 | method 106 | ), f"{method} does not come from meta" 107 | --------------------------------------------------------------------------------
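The test file above partitions methods into `cls_methods_with_deco` and `cls_methods_from_meta` but never states the distinction the two helpers draw. A minimal usage sketch of that behaviour, assuming only the `pandera.inspection_utils` functions imported in the test (the `Meta`/`Model` names below are illustrative, not from the repository):

from pandera.inspection_utils import (
    is_classmethod_from_meta,
    is_decorated_classmethod,
)

class Meta(type):
    def resolve(cls):
        # plain method defined on the metaclass: callable from the class,
        # so it acts like a classmethod but has no @classmethod decorator
        return cls

class Model(metaclass=Meta):
    @classmethod
    def build(cls):
        # explicitly @classmethod-decorated method
        return cls

assert is_decorated_classmethod(Model.build)        # decorated -> True
assert not is_decorated_classmethod(Model.resolve)  # no decorator -> False
assert is_classmethod_from_meta(Model.resolve)      # bound via the metaclass -> True
assert not is_classmethod_from_meta(Model.build)    # ordinary classmethod -> False

The asserts mirror the membership checks in `test_is_decorated_classmethod` above: explicitly decorated classmethods satisfy the first predicate, while undecorated metaclass methods accessed through the class satisfy the second.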