├── tests
├── __init__.py
├── test_indexed_dataset.py
├── test_dataset.py
└── test_type_validation.py
├── strictly_typed_pandas
├── py.typed
├── _vendor
│ ├── __init__.py
│ ├── typeguard
│ │ ├── py.typed
│ │ ├── pytest_plugin.py
│ │ ├── importhook.py
│ │ └── __init__.py
│ └── typeguard-2.13.3.dist-info
│ │ ├── REQUESTED
│ │ ├── INSTALLER
│ │ ├── top_level.txt
│ │ ├── entry_points.txt
│ │ ├── WHEEL
│ │ ├── LICENSE
│ │ ├── RECORD
│ │ └── METADATA
├── __init__.py
├── create_empty_dataframe.py
├── immutable.py
├── pandas_types.py
├── pytest_plugin.py
├── validate_schema.py
├── typeguard.py
└── dataset.py
├── requirements.txt
├── docs
├── requirements.txt
├── source
│ ├── api.rst
│ ├── stubs
│ │ ├── strictly_typed_pandas.DataSet.rst
│ │ └── strictly_typed_pandas.IndexedDataSet.rst
│ ├── index.rst
│ ├── contributing.rst
│ ├── conf.py
│ ├── typeguard.rst
│ ├── advanced.ipynb
│ ├── deepdive_into_dtypes.ipynb
│ └── getting_started.ipynb
├── Makefile
└── make.bat
├── vendorize.toml
├── tox.ini
├── .gitignore
├── requirements-dev.txt
├── .readthedocs.yaml
├── pyproject.toml
├── .github
├── dependabot.yml
└── workflows
│ ├── autoapprove.yml
│ ├── publish.yml
│ └── build.yml
├── .pre-commit-config.yaml
├── LICENSE
├── setup.py
└── README.rst
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/REQUESTED:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/INSTALLER:
--------------------------------------------------------------------------------
1 | pip
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy<=2.2.4
2 | pandas<=2.2.3
3 | pandas-stubs<=2.2.3.250308
4 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==8.2.3
2 | sphinx_rtd_theme
3 | nbsphinx
4 | jupyter
5 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/top_level.txt:
--------------------------------------------------------------------------------
1 | typeguard
2 |
--------------------------------------------------------------------------------
/vendorize.toml:
--------------------------------------------------------------------------------
1 | target = "strictly_typed_pandas/_vendor"
2 | packages = [
3 | "typeguard==2.13.3",
4 | ]
5 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [pytest11]
2 | typeguard = typeguard.pytest_plugin
3 |
4 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [flake8]
2 |
3 | # Hard line limit
4 | max-line-length = 120
5 |
6 | extend-exclude = strictly_typed_pandas/_vendor/
7 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/WHEEL:
--------------------------------------------------------------------------------
1 | Wheel-Version: 1.0
2 | Generator: bdist_wheel (0.37.0)
3 | Root-Is-Purelib: true
4 | Tag: py3-none-any
5 |
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .mypy_cache
2 | .pytest_cache
3 | **/__pycache__
4 | .vscode
5 | *.pyc
6 | *.ipynb_checkpoints*
7 | .DS_Store
8 | .coverage
9 | .cache
10 | *.egg*
11 | docs/build
12 |
--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
1 | =================
2 | API documentation
3 | =================
4 |
5 | .. toctree::
6 |
7 | stubs/strictly_typed_pandas.DataSet
8 | stubs/strictly_typed_pandas.IndexedDataSet
9 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/__init__.py:
--------------------------------------------------------------------------------
1 | from strictly_typed_pandas.dataset import DataSet, IndexedDataSet # isort: skip
2 | import strictly_typed_pandas.typeguard # noqa: F401
3 |
4 | __all__ = ["DataSet", "IndexedDataSet"]
5 |
--------------------------------------------------------------------------------
/docs/source/stubs/strictly_typed_pandas.DataSet.rst:
--------------------------------------------------------------------------------
1 | strictly\_typed\_pandas.DataSet
2 | ===============================
3 |
4 | .. currentmodule:: strictly_typed_pandas
5 |
6 | .. autoclass:: DataSet
7 |
8 | .. automethod:: __init__
9 | .. automethod:: to_dataframe
10 | .. automethod:: to_frame
11 |
--------------------------------------------------------------------------------
/docs/source/stubs/strictly_typed_pandas.IndexedDataSet.rst:
--------------------------------------------------------------------------------
1 | strictly\_typed\_pandas.IndexedDataSet
2 | ======================================
3 |
4 | .. currentmodule:: strictly_typed_pandas
5 |
6 | .. autoclass:: IndexedDataSet
7 |
8 | .. automethod:: __init__
9 | .. automethod:: to_dataframe
10 | .. automethod:: to_frame
11 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | mypy<=1.17.1
2 | flake8<=7.3.0
3 | black[jupyter]<=25.1.0
4 | docformatter<=1.7.7
5 | isort<=6.0.1
6 | coverage<=7.10.6
7 | pytest<=8.4.1
8 | nbconvert==7.16.6
9 | jupyter==1.1.1
10 | sphinx<=8.2.3
11 | sphinx_rtd_theme==3.0.2
12 | nbsphinx==0.9.7
13 | pre-commit<=4.3.0
14 | types-setuptools<=80.9.0.20250822
15 | pyarrow<=21.0.0
16 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | build:
9 | os: ubuntu-22.04
10 | tools:
11 | python: "3.11"
12 |
13 | sphinx:
14 | configuration: docs/source/conf.py
15 |
16 | python:
17 | install:
18 | - method: pip
19 | path: .
20 | - requirements: requirements-dev.txt
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Strictly Typed Pandas documentation master file, created by
2 | sphinx-quickstart on Wed Jul 14 17:15:09 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | .. include:: ../../README.rst
7 |
8 | .. toctree::
9 | :hidden:
10 |
11 | getting_started
12 | advanced
13 | deepdive_into_dtypes
14 | typeguard
15 | api
16 | contributing
17 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 100
3 | force-exclude = ["strictly_typed_pandas/_vendor/*"]
4 |
5 | [tool.isort]
6 | profile = "black"
7 | line_length = 100
8 | extend_skip_glob = ["strictly_typed_pandas/_vendor/*"]
9 |
10 | [tool.mypy]
11 | exclude = ['strictly_typed_pandas/_vendor/.*']
12 |
13 | [[tool.mypy.overrides]]
14 | module="strictly_typed_pandas._vendor.*"
15 | follow_imports = 'skip'
16 |
17 | [[tool.mypy.overrides]]
18 | module="typeguard"
19 | ignore_missing_imports = true
20 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "daily"
12 |
--------------------------------------------------------------------------------
/.github/workflows/autoapprove.yml:
--------------------------------------------------------------------------------
1 | name: Dependabot auto-approve
2 | on: pull_request
3 |
4 | permissions:
5 | pull-requests: write
6 |
7 | jobs:
8 | dependabot:
9 | runs-on: ubuntu-latest
10 | if: ${{ github.actor == 'dependabot[bot]' }}
11 | steps:
12 | - name: Dependabot metadata
13 | id: metadata
14 | uses: dependabot/fetch-metadata@v1
15 | with:
16 | github-token: "${{ secrets.AUTOAPPROVE_TOKEN }}"
17 | - name: Approve a PR
18 | run: gh pr review --approve "$PR_URL"
19 | env:
20 | PR_URL: ${{github.event.pull_request.html_url}}
21 | GITHUB_TOKEN: ${{secrets.AUTOAPPROVE_TOKEN}}
22 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - name: Set up Python
13 | uses: actions/setup-python@v2
14 | with:
15 | python-version: '3.x'
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | pip install setuptools wheel twine
20 | - name: Build and publish
21 | env:
22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 | run: |
25 | python setup.py sdist bdist_wheel
26 | twine upload dist/*
27 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: '^strictly_typed_pandas/_vendor'
2 | repos:
3 | - repo: local
4 | hooks:
5 | - id: flake8
6 | name: flake8
7 | entry: flake8
8 | language: system
9 | types: [python]
10 | - id: mypy
11 | name: mypy
12 | entry: mypy
13 | language: system
14 | types: [python]
15 | - id: black
16 | name: black
17 | description: "Black: The uncompromising Python code formatter"
18 | entry: black
19 | language: system
20 | require_serial: true
21 | files: \.(py|ipynb)$
22 | - id: isort
23 | name: isort
24 | entry: isort
25 | language: system
26 | types: [python]
27 | - id: pytest
28 | name: pytest
29 | entry: coverage run -m pytest --typeguard-packages=tests
30 | language: system
31 | types: [python]
32 | pass_filenames: false
33 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/contributing.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Contributing
3 | ============
4 |
5 | We welcome contributions! To set up your development environment, we recommend using pyenv. You can find more information on how to install ``pyenv`` and ``pyenv-virtualenv`` here:
6 |
7 | * https://github.com/pyenv/pyenv
8 | * https://github.com/pyenv/pyenv-virtualenv
9 |
10 | To set up the environment, run:
11 |
12 | .. code-block:: bash
13 |
14 | pyenv install 3.11
15 | pyenv virtualenv 3.11 strictly_typed_pandas
16 | pyenv activate strictly_typed_pandas
17 | pip install -r requirements.txt
18 | pip install -r requirements-dev.txt
19 |
20 | For a list of currently supported Python versions, we refer to ``.github/workflows/build.yml``.
21 |
22 | ---------------
23 | Pre-commit hook
24 | ---------------
25 | We use ``pre-commit`` to run a number of checks on the code before it is committed. To install the pre-commit hook, run:
26 |
27 | .. code-block:: bash
28 |
29 | pre-commit install
30 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard/pytest_plugin.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from ..typeguard.importhook import install_import_hook
4 |
5 |
def pytest_addoption(parser):
    """Register the ``--typeguard-packages`` command-line option with pytest.

    NOTE(review): vendored from typeguard 2.13.3 -- keep diffs against
    upstream to an absolute minimum (comments only).
    """
    group = parser.getgroup('typeguard')
    group.addoption('--typeguard-packages', action='store',
                    help='comma separated name list of packages and modules to instrument for '
                         'type checking')
11 |
12 |
def pytest_configure(config):
    """Install the typeguard import hook for the packages named on the CLI.

    Does nothing when ``--typeguard-packages`` was not given.

    NOTE(review): vendored from typeguard 2.13.3 -- keep diffs against
    upstream to an absolute minimum (comments only).
    """
    value = config.getoption("typeguard_packages")
    if not value:
        return

    packages = [pkg.strip() for pkg in value.split(",")]

    # The import hook can only instrument modules at import time, so any
    # target that is already in sys.modules is a hard error.
    already_imported_packages = sorted(
        package for package in packages if package in sys.modules
    )
    if already_imported_packages:
        message = (
            "typeguard cannot check these packages because they "
            "are already imported: {}"
        )
        raise RuntimeError(message.format(", ".join(already_imported_packages)))

    install_import_hook(packages=packages)
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Nanne Aben
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/LICENSE:
--------------------------------------------------------------------------------
1 | This is the MIT license: http://www.opensource.org/licenses/mit-license.php
2 |
3 | Copyright (c) Alex Grönholm
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this
6 | software and associated documentation files (the "Software"), to deal in the Software
7 | without restriction, including without limitation the rights to use, copy, modify, merge,
8 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
9 | to whom the Software is furnished to do so, subject to the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be included in all copies or
12 | substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
15 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
16 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
17 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
19 | DEALINGS IN THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/RECORD:
--------------------------------------------------------------------------------
1 | typeguard-2.13.3.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2 | typeguard-2.13.3.dist-info/LICENSE,sha256=YWP3mH37ONa8MgzitwsvArhivEESZRbVUu8c1DJH51g,1130
3 | typeguard-2.13.3.dist-info/METADATA,sha256=rrszCBWMnpJt2j9D8QqPgS1kQUFdTu5exwvCVkB0cIY,3591
4 | typeguard-2.13.3.dist-info/RECORD,,
5 | typeguard-2.13.3.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6 | typeguard-2.13.3.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
7 | typeguard-2.13.3.dist-info/entry_points.txt,sha256=uBVT0tmiav9LH4v6cq0GIl7TYz07TqFHniXP6zCfHbY,48
8 | typeguard-2.13.3.dist-info/top_level.txt,sha256=4z28AhuDodwRS_c1J_l8H51t5QuwfTseskYzlxp6grs,10
9 | typeguard/__init__.py,sha256=7LyyccpyAXgyd3WO2j1GXCWDdyasGjmA9v9DeydHR70,49186
10 | typeguard/__pycache__/__init__.cpython-311.pyc,,
11 | typeguard/__pycache__/importhook.cpython-311.pyc,,
12 | typeguard/__pycache__/pytest_plugin.cpython-311.pyc,,
13 | typeguard/importhook.py,sha256=nv3-M2SZ4cHxJBakslR_7w73YpT6Lit67txi7H7-xGM,5601
14 | typeguard/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15 | typeguard/pytest_plugin.py,sha256=T1wfao9RMZ-fQ31bA_gmkoOtHEmXk3o1s0Nty5ZrFnw,917
16 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 |
def get_requirements(path="requirements.txt"):
    """Return the runtime dependencies pinned in *path*.

    Blank lines and ``#`` comment lines are skipped, so annotating the
    requirements file can no longer leak invalid entries into
    ``install_requires`` (the original passed every raw line through).

    Args:
        path: requirements file to read; defaults to the project's
            requirements.txt, preserving the original zero-argument call.
    """
    with open(path) as f:
        return [
            line.strip()
            for line in f
            if line.strip() and not line.lstrip().startswith("#")
        ]
7 |
8 |
def get_long_description():
    """Load README.rst (UTF-8) for use as the PyPI long description."""
    with open("README.rst", encoding="utf-8") as readme:
        contents = readme.read()
    return contents
12 |
13 |
# Package metadata and build configuration. The version is intentionally not
# hard-coded: it is derived from git tags via setuptools-git-versioning.
setup(
    name="strictly_typed_pandas",
    url="https://github.com/nanne-aben/strictly_typed_pandas",
    license="MIT",
    author="Nanne Aben",
    author_email="nanne.aben@gmail.com",
    description="Static type checking of pandas DataFrames",
    keywords="typing type checking pandas mypy linting",
    long_description=get_long_description(),
    long_description_content_type="text/x-rst",
    # Include subpackages too, so the vendored typeguard under _vendor ships.
    packages=find_packages(include=["strictly_typed_pandas", "strictly_typed_pandas.*"]),
    install_requires=get_requirements(),
    python_requires=">=3.8.0",
    classifiers=["Typing :: Typed"],
    setuptools_git_versioning={"enabled": True},
    setup_requires=["setuptools-git-versioning"],
    # py.typed marks the distribution as typed (PEP 561).
    package_data={"strictly_typed_pandas": ["py.typed"]},
    entry_points={
        # Registers the package's own pytest plugin.
        # NOTE(review): presumably this provides the --stp-typeguard-packages
        # option used in CI -- confirm against pytest_plugin.py.
        "pytest11": [
            "strictly_typed_pandas = strictly_typed_pandas.pytest_plugin",
        ],
    },
)
37 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/create_empty_dataframe.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, Dict
2 |
3 | import numpy as np # type: ignore
4 | import pandas as pd
5 | from pandas.api.extensions import ExtensionDtype
6 |
7 | from strictly_typed_pandas.pandas_types import StringDtype
8 |
9 |
def _normalize_dtype(dtype: Any) -> Any:
    """Translate a schema annotation into a dtype that pandas understands."""
    if dtype == Any:
        dtype = object

    # An ExtensionDtype *class* (rather than an instance) is replaced by its
    # registered name, e.g. pd.StringDtype -> "string".
    if isinstance(dtype, Callable) and isinstance(dtype(), ExtensionDtype):  # type: ignore
        dtype = dtype.name

    if isinstance(dtype, ExtensionDtype):
        dtype = dtype.name

    if dtype == np.datetime64:
        dtype = "datetime64[ns]"

    if dtype == np.timedelta64:
        dtype = "timedelta64[ns]"

    if dtype == str:
        dtype = StringDtype.name

    return dtype


def create_empty_dataframe(schema: Dict[str, Any]) -> pd.DataFrame:
    """Build a zero-row DataFrame whose columns and dtypes follow *schema*."""
    columns = {
        name: pd.Series([], dtype=_normalize_dtype(dtype)) for name, dtype in schema.items()
    }
    return pd.DataFrame(columns)


def create_empty_indexed_dataframe(
    index_schema: Dict[str, Any], data_schema: Dict[str, Any]
) -> pd.DataFrame:
    """Build a zero-row DataFrame with index levels from *index_schema* and
    data columns from *data_schema*."""
    index_names = list(index_schema.keys())
    combined = pd.concat(
        [create_empty_dataframe(index_schema), create_empty_dataframe(data_schema)], axis=1
    )
    return combined.set_index(index_names)
42 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/immutable.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from typing import Any, Callable, Optional
3 |
4 | from pandas.core.indexing import _iLocIndexer, _LocIndexer
5 |
# Shared message for every blocked in-place mutation on a DataSet: points the
# user at to_dataframe() or at operations that return a new DataFrame.
immutable_error_msg = (
    "To ensure that the DataSet adheres to its schema, you cannot perform inplace modifications. You can either use "
    + "dataset.to_dataframe() to cast the DataSet to a DataFrame, or use operations that return a DataFrame, e.g. "
    + "df = df.assign(...)."
)
11 |
12 |
class _ImmutableiLocIndexer(_iLocIndexer):
    """Positional (``.iloc``) indexer that refuses item assignment, keeping
    the DataSet immutable."""

    def __setitem__(self, item: Any, new_value: Any) -> None:
        raise NotImplementedError(immutable_error_msg)
16 |
17 |
class _ImmutableLocIndexer(_LocIndexer):
    """Label-based (``.loc``) indexer that refuses item assignment, keeping
    the DataSet immutable."""

    def __setitem__(self, item: Any, new_value: Any) -> None:
        raise NotImplementedError(immutable_error_msg)
21 |
22 |
23 | def _get_index_of_inplace_in_args(call: Callable) -> Optional[int]:
24 | signature = inspect.signature(call)
25 | parameters = signature.parameters.keys()
26 |
27 | if "inplace" in parameters:
28 | return [i for i, v in enumerate(parameters) if v == "inplace"][0]
29 | else:
30 | return None
31 |
32 |
33 | def inplace_argument_interceptor(call: Callable) -> Callable:
34 | inplace_ind = _get_index_of_inplace_in_args(call)
35 |
36 | def func(*args, **kwargs):
37 | if inplace_ind is not None and inplace_ind < len(args) and args[inplace_ind]:
38 | raise NotImplementedError(immutable_error_msg)
39 |
40 | if "inplace" in kwargs and kwargs["inplace"]:
41 | raise NotImplementedError(immutable_error_msg)
42 |
43 | return call(*args, **kwargs)
44 |
45 | return func
46 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/pandas_types.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
# For backward compatibility with pandas 0.23 - 0.25: base class for stand-in
# dtype classes used when the installed pandas lacks the real dtype.
class BackwardCompatibility(pd.api.extensions.ExtensionDtype):
    # All fallbacks masquerade as the plain "object" dtype.
    name = "object"

    def __init__(self, *args, **kwargs) -> None:
        pass  # pragma: no cover
10 |
11 |
# Each name below resolves to the real pandas dtype when the installed pandas
# provides it, and to a BackwardCompatibility stub otherwise, so that schema
# annotations keep working across old pandas versions.
if hasattr(pd, "StringDtype"):
    StringDtype = pd.StringDtype
else:  # pragma: no cover

    class StringDtype(BackwardCompatibility):  # type: ignore
        pass


if hasattr(pd, "DatetimeTZDtype"):
    DatetimeTZDtype = pd.DatetimeTZDtype
else:  # pragma: no cover

    class DatetimeTZDtype(BackwardCompatibility):  # type: ignore
        pass


if hasattr(pd, "CategoricalDtype"):
    CategoricalDtype = pd.CategoricalDtype
else:  # pragma: no cover

    class CategoricalDtype(BackwardCompatibility):  # type: ignore
        pass


if hasattr(pd, "PeriodDtype"):
    PeriodDtype = pd.PeriodDtype
else:  # pragma: no cover

    class PeriodDtype(BackwardCompatibility):  # type: ignore
        pass


if hasattr(pd, "SparseDtype"):
    SparseDtype = pd.SparseDtype
else:  # pragma: no cover

    class SparseDtype(BackwardCompatibility):  # type: ignore
        pass


if hasattr(pd, "IntervalDtype"):
    IntervalDtype = pd.IntervalDtype
else:  # pragma: no cover

    class IntervalDtype(BackwardCompatibility):  # type: ignore
        pass


if hasattr(pd, "Int64Dtype"):
    Int64Dtype = pd.Int64Dtype
else:  # pragma: no cover

    class Int64Dtype(BackwardCompatibility):  # type: ignore
        pass


if hasattr(pd, "BooleanDtype"):
    BooleanDtype = pd.BooleanDtype
else:  # pragma: no cover

    class BooleanDtype(BackwardCompatibility):  # type: ignore
        pass
74 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Python package
2 |
3 | on: [pull_request, workflow_dispatch]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 | strategy:
10 | matrix:
11 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python ${{ matrix.python-version }}
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: ${{ matrix.python-version }}
19 | - name: Install dependencies
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install -r requirements.txt
23 | pip install -r requirements-dev.txt
24 | pip install -e .
25 | - name: Lint with isort
26 | run: isort --check .
27 | - name: Lint with black
28 | run: black --check .
29 | - name: Lint with docformatter
30 | run: docformatter --black -c **/*.py
31 | - name: Lint with flake8
32 | run: flake8
33 | - name: Lint with mypy
34 | run: mypy .
35 | - name: Test with pytest
36 | run: |
37 | coverage run -m pytest --stp-typeguard-packages=tests
38 | coverage report -m
39 | - name: Run notebooks
40 | run: |
41 | for FILE in docs/source/*.ipynb; do
42 | BASE=$(basename $FILE)
43 | cp $FILE .
44 | jupyter nbconvert --to notebook $BASE --execute
45 | done
46 | - name: Run pytest with --typeguard-packages for backwards compatibility
47 | run: |
48 | pytest --typeguard-packages=tests
49 | - name: Run pytest with compatible typeguard installed
50 | run: |
51 | pip install typeguard==2.13.2
52 | pytest --typeguard-packages=tests
53 | pytest --stp-typeguard-packages=tests
54 | - name: Run pytest with incompatible typeguard installed
55 | run: |
56 | pip install typeguard==4.1.5
57 | pytest --stp-typeguard-packages=tests
58 |
--------------------------------------------------------------------------------
/tests/test_indexed_dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np # type: ignore
2 | import pandas as pd
3 | import pytest
4 |
5 | from strictly_typed_pandas import IndexedDataSet
6 | from strictly_typed_pandas.pandas_types import StringDtype
7 |
8 |
# Schema fixtures for these tests: IndexSchema/DataSchema form the canonical
# (index, data) pair; the Alternative* variants deliberately differ so that
# typeguard mismatches can be demonstrated.
class IndexSchema:
    a: int
    b: str


class DataSchema:
    c: int
    d: str


class AlternativeIndexSchema:
    a: int


class AlternativeDataSchema:
    f: int
25 |
26 |
def test_empty_indexed_dataset() -> None:
    """An argument-less IndexedDataSet has zero rows, index levels a/b and
    data columns c/d, with dtypes derived from the schemas."""
    dataset = IndexedDataSet[IndexSchema, DataSchema]()

    assert dataset.shape[0] == 0
    assert np.all(dataset.index.names == ["a", "b"])
    assert np.all(dataset.columns == ["c", "d"])

    level_a = dataset.index.get_level_values(0)
    level_b = dataset.index.get_level_values(1)
    assert level_a.dtype == int
    # Depending on the pandas version, str columns are object or StringDtype.
    assert level_b.dtype == object or isinstance(level_b.dtype, StringDtype)

    assert dataset.dtypes.iloc[0] == int
    assert dataset.dtypes.iloc[1] == object or isinstance(dataset.dtypes.iloc[1], StringDtype)
41 |
42 |
def test_indexed_dataset() -> None:
    """Constructing from a correctly-indexed DataFrame must not raise."""
    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1, 2, 3], "d": ["a", "b", "c"]}
    ).set_index(["a", "b"])
    IndexedDataSet[IndexSchema, DataSchema](df)
49 |
50 |
def test_missing_index():
    """A DataFrame without named index columns cannot satisfy the schema."""
    df = pd.DataFrame({"a": [1, 2, 3]})
    with pytest.raises(TypeError, match="No named columns in index"):
        IndexedDataSet[IndexSchema, DataSchema](df)
54 |
55 |
def test_overlapping_columns():
    # Index schema and data schema may not share column names.
    with pytest.raises(TypeError):
        IndexedDataSet[IndexSchema, IndexSchema]()
59 |
60 |
def foo(df: IndexedDataSet[IndexSchema, DataSchema]) -> IndexedDataSet[IndexSchema, DataSchema]:
    # Identity function: exists only so typeguard can check its annotations.
    return df
63 |
64 |
def test_typeguard_indexed_dataset() -> None:
    """typeguard accepts matching schemas and rejects mismatches."""
    foo(IndexedDataSet[IndexSchema, DataSchema]())

    mismatched = IndexedDataSet[AlternativeIndexSchema, AlternativeDataSchema]()
    with pytest.raises(TypeError):
        foo(mismatched)  # type: ignore

    with pytest.raises(TypeError):
        foo(pd.DataFrame())  # type: ignore
73 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 | # This is a configuration file we shouldn't be checking it.
18 | # mypy: ignore-errors
19 |
20 | # -- Project information -----------------------------------------------------
21 |
project = "Strictly Typed Pandas"
copyright = "2021, Nanne Aben"
author = "Nanne Aben"


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
# autodoc pulls API docs from docstrings; sphinx_rtd_theme backs html_theme
# below; nbsphinx renders the .ipynb tutorials that live in docs/source.
extensions = ["sphinx.ext.autodoc", "sphinx_rtd_theme", "nbsphinx"]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
54 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/pytest_plugin.py:
--------------------------------------------------------------------------------
import sys
from typing import List, Optional

from ._vendor.typeguard.importhook import install_import_hook
5 |
try:
    # Probe for an externally installed typeguard distribution; only the
    # success/failure of the import matters, so the module is dropped again.
    import typeguard

    del typeguard
    TYPEGUARD_INSTALLED = True
except ImportError:
    TYPEGUARD_INSTALLED = False
13 |
14 |
def pytest_addoption(parser):
    """Register this plugin's pytest command-line flags.

    ``--typeguard-packages`` is only claimed here when the external typeguard
    distribution is absent; otherwise typeguard's own plugin registers it.
    """
    stp_group = parser.getgroup("stp_typeguard")
    stp_group.addoption(
        "--stp-typeguard-packages",
        action="store",
        help=(
            "comma separated name list of packages and modules to "
            "instrument for type checking by strictly typed pandas"
        ),
    )

    if TYPEGUARD_INSTALLED:
        return

    vanilla_group = parser.getgroup("typeguard")
    vanilla_group.addoption(
        "--typeguard-packages",
        action="store",
        help=(
            "comma separated name list of packages and modules to "
            "instrument for type checking"
        ),
    )
35 |
36 |
37 | def _parse_packages(val: str) -> List[str]:
38 | if val is None or not val.strip():
39 | return []
40 | return [pkg.strip() for pkg in val.split(",")]
41 |
42 |
def pytest_configure(config):
    """Install the vendored import hook for the requested packages.

    Raises when a package appears under both flags; when the external
    typeguard is absent, also services ``--typeguard-packages``; refuses to
    instrument packages imported before the hook could take effect.
    """
    stp_packages = _parse_packages(config.getoption("stp_typeguard_packages"))
    vanilla_packages = _parse_packages(config.getoption("typeguard_packages"))

    overlap = set(stp_packages) & set(vanilla_packages)
    if overlap:
        raise RuntimeError(
            "If you are going to use both --stp-typeguard-packages "
            "and --typeguard-packages at the same time, "
            "please don't list the same package in both options: "
            f"{', '.join(overlap)}"
        )

    if not TYPEGUARD_INSTALLED:
        stp_packages.extend(vanilla_packages)

    if not stp_packages:
        return

    already_imported = sorted(name for name in stp_packages if name in sys.modules)
    if already_imported:
        raise RuntimeError(
            "strictly_typed_pandas cannot check these packages because "
            "they are already imported: {}".format(", ".join(already_imported))
        )

    install_import_hook(packages=stp_packages)
71 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ================================================================
2 | Strictly Typed Pandas: static type checking of pandas DataFrames
3 | ================================================================
4 |
5 | I love Pandas! But in production code I’m always a bit wary when I see:
6 |
7 | .. code-block:: python
8 |
9 | import pandas as pd
10 |
11 | def foo(df: pd.DataFrame) -> pd.DataFrame:
12 | # do stuff
13 | return df
14 |
15 | Because… How do I know which columns are supposed to be in `df`?
16 |
17 | Using `strictly_typed_pandas`, we can be more explicit about what these data should look like.
18 |
19 | .. code-block:: python
20 |
21 | from strictly_typed_pandas import DataSet
22 |
23 | class Schema:
24 | id: int
25 | name: str
26 |
27 | def foo(df: DataSet[Schema]) -> DataSet[Schema]:
28 | # do stuff
29 | return df
30 |
31 | Where `DataSet`:
32 | * is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`.
33 | * validates whether the data adheres to the provided schema upon its initialization.
34 | * is immutable, so its schema cannot be changed using inplace modifications.
35 |
36 | The `DataSet[Schema]` annotations are compatible with:
37 | * `mypy` for type checking during linting-time (i.e. while you write your code).
38 | * `typeguard <https://typeguard.readthedocs.io/>`_ for type checking during run-time (i.e. while you run your code).
55 |
56 | FAQ
57 | ===
58 |
59 | | **Do you know of something similar for pyspark?**
60 | | Yes! Check out our package `typedspark <https://github.com/kaiko-ai/typedspark>`_.
61 | |
62 | | **Why use Python if you want static typing?**
63 | | There are just so many good packages for data science in Python. Rather than sacrificing all of that by moving to a different language, I'd like to make the Pythonverse a little bit better.
64 | |
65 | | **I found a bug! What should I do?**
66 | | Great! Contact me and I'll look into it.
67 | |
68 | | **I have a great idea to improve strictly_typed_pandas! How can we make this work?**
69 | | Awesome, drop me a line!
70 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/validate_schema.py:
--------------------------------------------------------------------------------
1 | from typing import Any, ClassVar, Dict, Set, get_origin
2 |
3 | import numpy as np # type: ignore
4 | from pandas.api.extensions import ExtensionDtype
5 | from pandas.core.dtypes.common import is_dtype_equal
6 |
7 | from strictly_typed_pandas.pandas_types import StringDtype
8 |
9 |
def check_for_duplicate_columns(names_index: Set[str], names_data: Set[str]) -> None:
    """Raise a TypeError when any column name appears in both schemas."""
    overlap = names_index & names_data
    if overlap:
        raise TypeError(
            "The following column is present in both the index schema "
            "and the data schema: {}".format(overlap)
        )
15 |
16 |
def remove_classvars(schema_expected: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of the schema with ClassVar entries dropped.

    ClassVar annotations are class-level attributes, not columns, so they are
    excluded before the schema is compared against the data.
    """
    filtered: Dict[str, Any] = {}
    for key, value in schema_expected.items():
        if get_origin(value) is ClassVar:
            continue
        filtered[key] = value
    return filtered
22 |
23 |
def validate_schema(schema_expected: Dict[str, Any], schema_observed: Dict[str, Any]):
    """Check that the observed schema has exactly the expected columns and dtypes.

    ClassVar entries in the expected schema are ignored; any name or dtype
    mismatch raises a TypeError from the helpers below.
    """
    schema_expected = remove_classvars(schema_expected)
    _check_names(set(schema_expected.keys()), set(schema_observed.keys()))
    _check_dtypes(schema_expected, schema_observed)
28 |
29 |
30 | def _check_names(names_expected: Set[str], names_observed: Set[str]) -> None:
31 | diff = names_observed - names_expected
32 | if diff:
33 | raise TypeError(
34 | "Data contains the following columns not present in schema: {diff}".format(diff=diff)
35 | )
36 |
37 | diff = names_expected - names_observed
38 | if diff:
39 | raise TypeError(
40 | "Schema contains the following columns not present in data: {diff}".format(diff=diff)
41 | )
42 |
43 |
def _check_dtypes(schema_expected: Dict[str, Any], schema_observed: Dict[str, Any]) -> None:
    """Raise a TypeError for any column whose observed dtype cannot satisfy
    the expected annotation.

    The checks form an ordered cascade of "acceptable match" rules; a column
    that survives all of them is reported as a mismatch. NOTE(review): the
    order matters — e.g. object observations are filtered out before the
    isinstance check near the end.
    """
    for name, dtype_expected in schema_expected.items():
        dtype_observed = schema_observed[name]

        # Wildcard annotations accept any observed dtype.
        if dtype_expected in [object, np.object_, Any]:
            continue

        if dtype_expected == str and dtype_observed == object:
            continue  # pandas stores strings as objects by default

        if dtype_expected == str and isinstance(dtype_observed, StringDtype):
            continue  # since np.int64 == int, I'd say we should also support pd.StringDtype == str

        # numpy dtypes: exact equality or numpy's dtype hierarchy (e.g.
        # np.int64 observed vs int expected).
        if isinstance(dtype_observed, np.dtype) and dtype_observed != np.object_:
            if dtype_observed == dtype_expected or np.issubdtype(dtype_observed, dtype_expected):
                continue

        # pandas extension dtypes (DatetimeTZDtype, CategoricalDtype, ...)
        # compared via pandas' own equality, which handles parametrization.
        if isinstance(dtype_expected, ExtensionDtype) and is_dtype_equal(
            dtype_expected, dtype_observed
        ):
            continue

        # Expected given as an extension-dtype *class* rather than instance;
        # assumes dtype_expected is a class here — earlier branches have
        # filtered the plain-type cases (TODO confirm for exotic annotations).
        if dtype_observed != object and isinstance(dtype_observed, dtype_expected):
            continue

        msg = "Column {name} is of type {dtype_observed}, but the schema suggests {dtype_expected}"

        # Prefix numpy dtypes so the error message is unambiguous.
        if isinstance(dtype_observed, np.dtype):
            dtype_observed = "numpy." + str(dtype_observed)

        raise TypeError(
            msg.format(name=name, dtype_observed=dtype_observed, dtype_expected=dtype_expected)
        )
77 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/typeguard.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import PackageNotFoundError, version
2 |
3 | from strictly_typed_pandas import DataSet, IndexedDataSet
4 | from strictly_typed_pandas._vendor import typeguard
5 |
try:
    # Only typeguard 2.x exposes the origin_type_checkers registry patched
    # below; other major versions are treated as absent.
    COMPATIBLE_EXTERNAL_TYPEGUARD_EXISTS = version("typeguard").startswith("2.")
except PackageNotFoundError:
    COMPATIBLE_EXTERNAL_TYPEGUARD_EXISTS = False

if COMPATIBLE_EXTERNAL_TYPEGUARD_EXISTS:
    import typeguard as external_typeguard
else:
    external_typeguard = None
15 |
16 |
def check_dataset(argname: str, value, expected_type, memo: typeguard._TypeCheckMemo) -> None:
    """Typeguard hook: verify that value is a DataSet with the expected schema.

    Raises a TypeError when value is not a DataSet at all, or when its
    subscripted schema differs from the annotation's.
    """
    expected_schema = expected_type.__args__[0]

    if not isinstance(value, DataSet):
        raise TypeError(
            "Type of {argname} must be a DataSet[{schema_expected}]; "
            "got {class_observed} instead".format(
                argname=argname,
                schema_expected=typeguard.qualified_name(expected_schema),
                class_observed=typeguard.qualified_name(value),
            )
        )

    observed_schema = value.__orig_class__.__args__[0]
    if observed_schema != expected_schema:
        raise TypeError(
            "Type of {argname} must be a DataSet[{schema_expected}]; "
            "got DataSet[{schema_observed}] instead".format(
                argname=argname,
                schema_expected=typeguard.qualified_name(expected_schema),
                schema_observed=typeguard.qualified_name(observed_schema),
            )
        )
39 |
40 |
def check_indexed_dataset(argname: str, value, expected_type, memo: typeguard._TypeCheckMemo):
    """Typeguard hook: verify value is an IndexedDataSet with both expected schemas.

    Raises a TypeError when value is not an IndexedDataSet, or when either
    its index schema or its data schema differs from the annotation's.
    """
    expected_index_schema = expected_type.__args__[0]
    expected_data_schema = expected_type.__args__[1]

    if not isinstance(value, IndexedDataSet):
        raise TypeError(
            (
                "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];"
                "got {class_observed} instead"
            ).format(
                argname=argname,
                schema_index_expected=typeguard.qualified_name(expected_index_schema),
                schema_data_expected=typeguard.qualified_name(expected_data_schema),
                class_observed=typeguard.qualified_name(value),
            )
        )

    observed_index_schema = value.__orig_class__.__args__[0]
    observed_data_schema = value.__orig_class__.__args__[1]
    schemas_match = (
        observed_index_schema == expected_index_schema
        and observed_data_schema == expected_data_schema
    )
    if not schemas_match:
        raise TypeError(
            (
                "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];"
                "got IndexedDataSet[{schema_index_observed},{schema_data_observed}] instead"
            ).format(
                argname=argname,
                schema_index_expected=typeguard.qualified_name(expected_index_schema),
                schema_data_expected=typeguard.qualified_name(expected_data_schema),
                schema_index_observed=typeguard.qualified_name(observed_index_schema),
                schema_data_observed=typeguard.qualified_name(observed_data_schema),
            )
        )
77 |
78 |
# Register the schema-aware checkers with the vendored typeguard, so DataSet /
# IndexedDataSet annotations are validated by schema rather than by class.
typeguard.origin_type_checkers[DataSet] = check_dataset
typeguard.origin_type_checkers[IndexedDataSet] = check_indexed_dataset
typechecked = typeguard.typechecked

# Mirror the registration into the external typeguard 2.x, when present, so
# its vanilla @typechecked decorator also understands these types.
if external_typeguard is not None:
    external_typeguard.origin_type_checkers[DataSet] = check_dataset
    external_typeguard.origin_type_checkers[IndexedDataSet] = check_indexed_dataset
86 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/METADATA:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: typeguard
3 | Version: 2.13.3
4 | Summary: Run-time type checker for Python
5 | Home-page: UNKNOWN
6 | Author: Alex Grönholm
7 | Author-email: alex.gronholm@nextday.fi
8 | License: MIT
9 | Project-URL: Documentation, https://typeguard.readthedocs.io/en/latest/
10 | Project-URL: Change log, https://typeguard.readthedocs.io/en/latest/versionhistory.html
11 | Project-URL: Source code, https://github.com/agronholm/typeguard
12 | Project-URL: Issue tracker, https://github.com/agronholm/typeguard/issues
13 | Platform: UNKNOWN
14 | Classifier: Development Status :: 5 - Production/Stable
15 | Classifier: Intended Audience :: Developers
16 | Classifier: License :: OSI Approved :: MIT License
17 | Classifier: Programming Language :: Python
18 | Classifier: Programming Language :: Python :: 3
19 | Classifier: Programming Language :: Python :: 3.5
20 | Classifier: Programming Language :: Python :: 3.6
21 | Classifier: Programming Language :: Python :: 3.7
22 | Classifier: Programming Language :: Python :: 3.8
23 | Classifier: Programming Language :: Python :: 3.9
24 | Classifier: Programming Language :: Python :: 3.10
25 | Requires-Python: >=3.5.3
26 | License-File: LICENSE
27 | Provides-Extra: doc
28 | Requires-Dist: sphinx-rtd-theme ; extra == 'doc'
29 | Requires-Dist: sphinx-autodoc-typehints (>=1.2.0) ; extra == 'doc'
30 | Provides-Extra: test
31 | Requires-Dist: pytest ; extra == 'test'
32 | Requires-Dist: typing-extensions ; extra == 'test'
33 | Requires-Dist: mypy ; (platform_python_implementation != "PyPy") and extra == 'test'
34 |
35 | .. image:: https://travis-ci.com/agronholm/typeguard.svg?branch=master
36 | :target: https://travis-ci.com/agronholm/typeguard
37 | :alt: Build Status
38 | .. image:: https://coveralls.io/repos/agronholm/typeguard/badge.svg?branch=master&service=github
39 | :target: https://coveralls.io/github/agronholm/typeguard?branch=master
40 | :alt: Code Coverage
41 | .. image:: https://readthedocs.org/projects/typeguard/badge/?version=latest
42 | :target: https://typeguard.readthedocs.io/en/latest/?badge=latest
43 |
44 | This library provides run-time type checking for functions defined with
45 | `PEP 484 `_ argument (and return) type annotations.
46 |
47 | Four principal ways to do type checking are provided, each with its pros and cons:
48 |
49 | #. the ``check_argument_types()`` and ``check_return_type()`` functions:
50 |
51 | * debugger friendly (except when running with the pydev debugger with the C extension installed)
52 | * does not work reliably with dynamically defined type hints (e.g. in nested functions)
53 | #. the ``@typechecked`` decorator:
54 |
55 | * automatically type checks yields and sends of returned generators (regular and async)
56 | * adds an extra frame to the call stack for every call to a decorated function
57 | #. the stack profiler hook (``with TypeChecker('packagename'):``) (deprecated):
58 |
59 | * emits warnings instead of raising ``TypeError``
60 | * requires very few modifications to the code
61 | * multiple TypeCheckers can be stacked/nested
62 | * does not work reliably with dynamically defined type hints (e.g. in nested functions)
63 | * may cause problems with badly behaving debuggers or profilers
64 | * cannot distinguish between an exception being raised and a ``None`` being returned
65 | #. the import hook (``typeguard.importhook.install_import_hook()``):
66 |
67 | * automatically annotates classes and functions with ``@typechecked`` on import
68 | * no code changes required in target modules
69 | * requires imports of modules you need to check to be deferred until after the import hook has
70 | been installed
71 | * may clash with other import hooks
72 |
73 | See the documentation_ for further instructions.
74 |
75 | .. _documentation: https://typeguard.readthedocs.io/en/latest/
76 |
77 |
78 |
--------------------------------------------------------------------------------
/docs/source/typeguard.rst:
--------------------------------------------------------------------------------
1 | Typeguard
2 | =========
3 |
4 | We use typeguard in strictly typed pandas as an additional runtime check, as described in earlier sections. As of typeguard 3.0.0, a number of breaking changes were introduced, which we couldn't reconcile with strictly typed pandas. Other packages that depend on typeguard 2.13.3 are in a similar situation.
5 |
6 | However, the ``typeguard<=2.13.3`` requirement became problematic over time, as it meant people could not use strictly typed pandas together with packages that depend on ``typeguard>=3.0.0``. For this reason, we have decided to vendor typeguard in ``strictly_typed_pandas==0.2.0``, meaning that we include typeguard within the strictly typed pandas code base, rather than having it as a dependency.
7 |
8 | In this document, we outline how you can use typeguard with ``strictly_typed_pandas>=0.2.0``.
9 |
10 | With typeguard 2.13.3 (backwards compatibility)
11 | -----------------------------------------------
12 |
13 | To support backwards compatibility, we allow you to use typeguard with ``strictly_typed_pandas>=0.2.0`` by simply installing ``typeguard==2.13.3``, without any other changes required. This can be done by running:
14 |
15 | .. code-block:: bash
16 |
17 | pip install typeguard==2.13.3
18 |
19 | You can use all functionality from typeguard as before:
20 |
21 | Decorator
22 | ^^^^^^^^^
23 |
24 | .. code-block:: python
25 |
26 | from typeguard import typechecked
27 |
28 | @typechecked
29 | def foo(df: DataSet[Person]) -> DataSet[Person]:
30 | ...
31 |
32 | Import hook
33 | ^^^^^^^^^^^
34 |
35 | .. code-block:: python
36 |
37 | from typeguard import install_import_hook
38 |
39 | install_import_hook('my_app')
40 | from my_app import some_module # import only AFTER installing the hook, or it won't take effect
41 |
42 | Pytest plugin
43 | ^^^^^^^^^^^^^
44 |
45 | .. code-block:: bash
46 |
47 | pytest --typeguard-packages=my_app
48 |
49 | With the vendored typeguard version (recommended)
50 | -------------------------------------------------
51 |
52 | We recommend that you use the vendored typeguard version, as it is the most future-proof solution.
53 |
54 | Decorator
55 | ^^^^^^^^^
56 |
57 | You can use the vendored version as follows:
58 |
59 | .. code-block:: python
60 |
61 | from strictly_typed_pandas.typeguard import typechecked
62 |
63 | @typechecked
64 | def foo(df: DataSet[Person]) -> DataSet[Person]:
65 | ...
66 |
67 | If you also want to use a second typeguard version in your project (e.g. ``typeguard>=3.0.0``), you can pip install that version and then you can use the following:
68 |
69 | .. code-block:: python
70 |
71 | from typeguard import typechecked as typechecked_vanilla
72 |
73 | @typechecked_vanilla
74 | def foo(a: int) -> int:
75 | ...
76 |
77 | Note that ``@typechecked_vanilla`` will not work with strictly typed pandas types; you can only use it for projects that do not use strictly typed pandas.
78 |
79 | Import hook
80 | ^^^^^^^^^^^
81 |
82 | The import hook is currently not supported in the vendored version. It should be possible to add support for this, but we have not done so yet. If you would like to use the import hook, please open an issue.
83 |
84 | Of course, you can still use the import hook with the vanilla version, as follows:
85 |
86 | .. code-block:: python
87 |
88 | from typeguard import install_import_hook
89 |
90 | install_import_hook('my_app')
91 | from my_app import some_module # import only AFTER installing the hook, or it won't take effect
92 |
93 | Pytest plugin
94 | ^^^^^^^^^^^^^
95 |
96 | To use the vendored version of the pytest plugin, you can use the following:
97 |
98 | .. code-block:: bash
99 |
100 | pytest --stp-typeguard-packages=my_app
101 |
102 | If you also want to use a second typeguard version in your project (e.g. ``typeguard>=3.0.0``), you can pip install that version and then you can use the following:
103 |
104 | .. code-block:: bash
105 |
106 | pytest --typeguard-packages=my_other_app
107 |
108 | You can also use them at the same time:
109 |
110 | .. code-block:: bash
111 |
112 | pytest --stp-typeguard-packages=my_app --typeguard-packages=my_other_app
113 |
114 | Please don't define the same package in both flags, this will raise an error.
115 |
--------------------------------------------------------------------------------
/tests/test_dataset.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import tempfile
3 | from typing import ClassVar
4 |
5 | import numpy as np # type: ignore
6 | import pandas as pd
7 | import pytest
8 |
9 | from strictly_typed_pandas import DataSet
10 | from strictly_typed_pandas.pandas_types import StringDtype
11 |
12 |
class Schema:
    """Test schema: an int column "a" and a str column "b"."""

    a: int
    b: str
16 |
17 |
class AlternativeSchema:
    """Schema with only column "a"; used to provoke schema mismatches."""

    a: int
20 |
21 |
class SchemaWithClassVar:
    """Schema where "b" is a ClassVar, i.e. a class attribute, not a column."""

    a: int
    b: ClassVar[str] = "abc"
25 |
26 |
# Data that satisfies Schema: three rows with an int column and a str column.
dictionary = {"a": [1, 2, 3], "b": ["a", "b", "c"]}
28 |
29 |
def test_empty_dataset() -> None:
    """DataSet[Schema]() yields an empty frame with the schema's columns and dtypes."""
    dataset = DataSet[Schema]()

    assert dataset.shape[0] == 0
    assert np.all(dataset.columns == ["a", "b"])

    assert dataset.dtypes.iloc[0] == int
    second_dtype = dataset.dtypes.iloc[1]
    assert second_dtype == object or isinstance(second_dtype, StringDtype)
38 |
39 |
def test_dataset() -> None:
    # Valid data for the schema should construct without raising.
    DataSet[Schema](dictionary)
42 |
43 |
def test_dataset_missing_colnames() -> None:
    """Omitting a schema column must fail validation."""
    data_without_b = {"a": []}
    with pytest.raises(TypeError):
        DataSet[Schema](data_without_b)
47 |
48 |
def test_dataset_too_many_colnames() -> None:
    """An extra column not in the schema must fail validation."""
    data_with_extra_c = {"a": [], "b": [], "c": []}
    with pytest.raises(TypeError):
        DataSet[Schema](data_with_extra_c)
52 |
53 |
def test_dataset_check_types() -> None:
    """Strings in an int-typed column must fail validation."""
    wrongly_typed = {"a": ["1", "2", "3"], "b": ""}
    with pytest.raises(TypeError):
        DataSet[Schema](wrongly_typed)
57 |
58 |
def test_dataset_immutable() -> None:
    """Every in-place mutation path is blocked; non-inplace assign still works."""
    df = DataSet[Schema](dictionary)
    new_values = ["1", "2", "3"]

    with pytest.raises(NotImplementedError):
        df["a"] = new_values

    with pytest.raises(NotImplementedError):
        df.a = new_values

    with pytest.raises(NotImplementedError):
        df.loc[:, "a"] = new_values

    with pytest.raises(NotImplementedError):
        df.iloc[:, 0] = new_values

    with pytest.raises(NotImplementedError):
        df.assign(a=new_values, inplace=True)

    with pytest.raises(NotImplementedError):
        # 4th argument is inplace
        df.set_index(["a"], True, False, True)  # type: ignore

    # Regular (copying) assign is still allowed and yields a DataFrame.
    assert isinstance(df.assign(a=new_values), pd.DataFrame)
83 |
84 |
def test_dataset_to_dataframe() -> None:
    """Both conversion methods hand back a plain pandas DataFrame."""
    dataset = DataSet[Schema](dictionary)
    for converted in (dataset.to_dataframe(), dataset.to_frame()):
        assert isinstance(converted, pd.DataFrame)
89 |
90 |
def foo(df: DataSet[Schema]) -> DataSet[Schema]:
    # Identity helper: exists only so typeguard can check the annotations below.
    return df
93 |
94 |
def test_typeguard_dataset() -> None:
    """foo() accepts a matching DataSet; other schemas and plain frames raise."""
    matching = DataSet[Schema]()
    foo(matching)

    with pytest.raises(TypeError):
        foo(DataSet[AlternativeSchema]())  # type: ignore

    with pytest.raises(TypeError):
        foo(pd.DataFrame())  # type: ignore
103 |
104 |
def test_duplicates() -> None:
    """Duplicate column names cannot satisfy the schema."""
    rows = [[1, 1]]
    with pytest.raises(TypeError):
        DataSet[AlternativeSchema](rows, columns=["a", "a"])
108 |
109 |
def test_pickle():
    """A DataSet must round-trip through pickle unchanged.

    File handles are managed with context managers; the original opened the
    files inline and relied on the garbage collector to close them.
    """
    df = DataSet[Schema](dictionary)

    with tempfile.TemporaryDirectory() as tmpdir:
        path = f"{tmpdir}/test.pkl"
        with open(path, "wb") as handle:
            pickle.dump(df, handle)
        with open(path, "rb") as handle:
            loaded = pickle.load(handle)

    assert (df == loaded).all().all()
118 |
119 |
def test_classvar_colum_not_allowed():
    """Providing data for a ClassVar entry must be rejected."""
    data_including_b = dictionary
    with pytest.raises(TypeError):
        DataSet[SchemaWithClassVar](data_including_b)
123 |
124 |
def test_classvar_colum_not_required():
    # The ClassVar entry "b" is not a column, so data for "a" alone suffices.
    DataSet[SchemaWithClassVar]({"a": [1, 2, 3]})
127 |
128 |
class A:
    """Minimal schema used to test annotation resetting."""

    a: int
131 |
132 |
class B:
    """Second minimal schema, distinct from A despite identical columns."""

    a: int
135 |
136 |
def test_resetting_of_schema_annotations():
    """DataSet subscription stores the schema in a class-level variable; this
    checks it is cleared again for unsubscripted constructions."""
    df = DataSet[A]()

    a: pd.DataFrame

    # if no schema is specified, the annotation should be None
    a = DataSet(df)
    assert a._schema_annotations is None

    # when we specify a schema, the class variable will be set to A, but afterwards it should be
    # reset to None again when we initialize a new object without specifying a schema
    # (the bare subscription below is intentional: it only triggers the class-level side effect)
    DataSet[A]
    a = DataSet(df)
    assert a._schema_annotations is None

    # and then to B
    a = DataSet[B](df)

    # and then to None again
    a = DataSet(df)
    assert a._schema_annotations is None
158 |
--------------------------------------------------------------------------------
/tests/test_type_validation.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, Union
2 |
3 | import numpy as np # type: ignore
4 | import pandas as pd
5 |
6 | from strictly_typed_pandas import DataSet, IndexedDataSet
7 | from strictly_typed_pandas.pandas_types import (
8 | BackwardCompatibility,
9 | BooleanDtype,
10 | CategoricalDtype,
11 | DatetimeTZDtype,
12 | Int64Dtype,
13 | IntervalDtype,
14 | PeriodDtype,
15 | SparseDtype,
16 | StringDtype,
17 | )
18 |
19 |
def is_backward_compatibility_type(dtype) -> bool:
    """Return True when dtype is a BackwardCompatibility placeholder.

    Recognizes both instances and zero-argument-constructible classes of the
    placeholder (presumably stand-ins for dtypes missing from the installed
    pandas version — see pandas_types; TODO confirm). ``Any`` is excluded up
    front so it is never called.
    """
    if isinstance(dtype, BackwardCompatibility):
        return True

    # callable() replaces isinstance(dtype, typing.Callable): same runtime
    # semantics (both check for __call__) without the typing special form.
    if dtype != Any and callable(dtype) and isinstance(dtype(), BackwardCompatibility):
        return True

    return False
28 | return False
29 |
30 |
def are_they_equal(observed, expected) -> Union[bool, float]:
    """Build a DataSet with the observed dtype and validate it against the
    expected dtype.

    Returns True/False for match/mismatch, and np.nan when either side is a
    backward-compatibility placeholder so callers can drop it from results.
    """
    if is_backward_compatibility_type(observed) or is_backward_compatibility_type(expected):
        return np.nan

    # The dtype objects are injected as annotation values on throwaway
    # schema classes, mimicking a user-defined schema.
    class SchemaExpected:
        a: expected

    class SchemaObserved:
        a: observed

    df = DataSet[SchemaObserved]()

    try:
        DataSet[SchemaExpected](df)
    except TypeError:
        return False

    return True
49 |
50 |
def check_list_of_types(observed, expected_to_match, expected_to_fail):
    """Assert that the observed dtype matches/fails each listed expected dtype.

    object, np.object_ and Any accept every dtype, so they are appended to
    the match list. Backward-compatibility placeholders come back from
    are_they_equal as NaN and are ignored via dropna().
    """
    # Build a new list rather than += so the caller's list is not mutated
    # (the original appended in place to the argument).
    expected_to_match = expected_to_match + [object, np.object_, Any]

    matches = pd.Series([are_they_equal(observed, expected) for expected in expected_to_match])
    assert matches.dropna().all()

    fails = pd.Series([are_they_equal(observed, expected) for expected in expected_to_fail])
    assert not fails.dropna().any()
58 |
59 |
def test_numeric_base_python_types():
    """Python's int/float/bool match their numpy equivalents but not each other."""
    check_list_of_types(int, [np.int64, np.int_, int], [float, np.float64])
    check_list_of_types(float, [np.float64, float], [int, np.int_])
    check_list_of_types(bool, [np.bool_, bool], [int, np.int_])
64 |
65 |
def test_numpy_types():
    """numpy scalar types match their Python equivalents but never cross-match."""
    check_list_of_types(np.int64, [np.int64, np.int_, int], [float, np.float64])
    check_list_of_types(np.float64, [np.float64, float], [int, np.int_])
    check_list_of_types(np.bool_, [np.bool_, bool], [int, np.int_])
    check_list_of_types(
        np.datetime64, [np.datetime64], [np.timedelta64, DatetimeTZDtype(tz="UTC"), np.int_]
    )
    check_list_of_types(np.timedelta64, [np.timedelta64], [np.datetime64, np.int64])
74 |
75 |
def test_pandas_types():
    """pandas extension dtypes only match themselves, including their
    parametrization (timezone, frequency, sparse subtype)."""
    check_list_of_types(
        DatetimeTZDtype(tz="UTC"),
        [DatetimeTZDtype(tz="UTC")],
        [np.datetime64, DatetimeTZDtype(tz="GMT"), np.int_],
    )
    check_list_of_types(CategoricalDtype, [CategoricalDtype], [Int64Dtype, np.int_, int])
    check_list_of_types(
        PeriodDtype(freq="D"),
        [PeriodDtype(freq="D")],
        [np.datetime64, PeriodDtype(freq="W"), np.int_],
    )
    check_list_of_types(
        SparseDtype(dtype=np.int64),
        [SparseDtype(dtype=np.int64)],
        [np.int64, SparseDtype(dtype=np.float64), int],
    )
    check_list_of_types(IntervalDtype, [IntervalDtype], [Int64Dtype, np.int_, int])
    check_list_of_types(Int64Dtype, [Int64Dtype], [IntervalDtype, np.int64, int])
    check_list_of_types(BooleanDtype, [BooleanDtype], [IntervalDtype, np.bool_, bool])
96 |
97 |
def test_strings():
    """str and StringDtype are treated as interchangeable annotations."""
    check_list_of_types(str, [str, StringDtype], [int, np.int_])
    check_list_of_types(StringDtype, [str, StringDtype], [int, np.int_])

    # as long as this is true
    df = pd.DataFrame({"a": ["a", "b", "c"]})
    assert df.dtypes.iloc[0] == object
    # we'll need to do this
    check_list_of_types(object, [str], [StringDtype])
107 |
108 |
def test_any():
    """Columns annotated Any/object/np.object_ only satisfy the wildcard
    expectations appended by check_list_of_types, never concrete dtypes."""
    check_list_of_types(Any, [], [int, np.int_])
    check_list_of_types(object, [], [int, np.int_])
    check_list_of_types(np.object_, [], [int, np.int_])
113 |
114 |
class DataSchema:
    """Single str-column data schema used by the index dtype test below."""

    b: str
117 |
118 |
def test_supported_index_data_type():
    """Every supported dtype should be usable as an index column annotation."""
    dtypes = [
        DatetimeTZDtype(tz="UTC"),
        CategoricalDtype,
        PeriodDtype(freq="D"),
        IntervalDtype,
        str,
        int,
        float,
        np.int_,
        np.float64,
        np.datetime64,
        np.timedelta64,
        Any,
        object,
        np.object_,
        SparseDtype(dtype=np.int64),
        np.bool_,
        Int64Dtype,
        BooleanDtype,
        StringDtype,
    ]
    for dtype in dtypes:
        # Placeholders for dtypes absent from this pandas version are skipped.
        if is_backward_compatibility_type(dtype):
            continue

        # Schema class is rebuilt each iteration so its annotation value
        # is the current dtype.
        class IndexSchema:
            a: dtype  # type: ignore

        IndexedDataSet[IndexSchema, DataSchema]()
149 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard/importhook.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import sys
3 | from importlib.abc import MetaPathFinder
4 | from importlib.machinery import SourceFileLoader
5 | from importlib.util import cache_from_source, decode_source
6 | from inspect import isclass
7 | from typing import Iterable, Type
8 | from unittest.mock import patch
9 |
10 |
# The name of this function is magical: CPython's importlib hides frames with
# this exact name from import-time tracebacks, keeping them readable.
def _call_with_frames_removed(f, *args, **kwargs):
    return f(*args, **kwargs)
14 |
15 |
def optimized_cache_from_source(path, debug_override=None):
    """Return a .pyc path tagged 'typeguard', so instrumented bytecode is
    cached separately from regular bytecode for the same source file."""
    cache_path = cache_from_source(path, debug_override=debug_override, optimization='typeguard')
    return cache_path
18 |
19 |
class TypeguardTransformer(ast.NodeVisitor):
    """AST pass that decorates classes and annotated functions with
    ``typeguard.typechecked`` and injects the vendored-typeguard import.

    Fix: the module-docstring check used ``ast.Str``, which was removed in
    Python 3.12; it now checks for a string ``ast.Constant`` (the node type
    emitted since 3.8), which is behaviorally equivalent.
    """

    def __init__(self) -> None:
        # Stack of enclosing Module/ClassDef/FunctionDef nodes.
        self._parents = []

    def visit_Module(self, node: ast.Module):
        # Insert "from strictly_typed_pandas._vendor import typeguard" after any
        # "from __future__ ..." imports and the module docstring.
        for i, child in enumerate(node.body):
            if isinstance(child, ast.ImportFrom) and child.module == '__future__':
                continue
            elif (
                isinstance(child, ast.Expr)
                and isinstance(child.value, ast.Constant)
                and isinstance(child.value.value, str)
            ):
                continue  # module docstring
            else:
                node.body.insert(
                    i,
                    ast.ImportFrom(
                        module="strictly_typed_pandas._vendor",
                        names=[ast.alias(name="typeguard", asname=None)],
                        level=0,
                    ),
                )
                break

        self._parents.append(node)
        self.generic_visit(node)
        self._parents.pop()
        return node

    def visit_ClassDef(self, node: ast.ClassDef):
        # Decorating the class lets typeguard wrap all of its methods at once.
        node.decorator_list.append(
            ast.Attribute(ast.Name(id='typeguard', ctx=ast.Load()), 'typechecked', ast.Load())
        )
        self._parents.append(node)
        self.generic_visit(node)
        self._parents.pop()
        return node

    def visit_FunctionDef(self, node: ast.FunctionDef):
        # Let the class level decorator handle the methods of a class
        if isinstance(self._parents[-1], ast.ClassDef):
            return node

        # Only instrument functions that carry annotations worth checking.
        has_annotated_args = any(arg for arg in node.args.args if arg.annotation)
        has_annotated_return = bool(node.returns)
        if has_annotated_args or has_annotated_return:
            node.decorator_list.insert(
                0,
                ast.Attribute(ast.Name(id='typeguard', ctx=ast.Load()), 'typechecked', ast.Load())
            )

        self._parents.append(node)
        self.generic_visit(node)
        self._parents.pop()
        return node
73 |
74 |
class TypeguardLoader(SourceFileLoader):
    """SourceFileLoader that instruments modules via :class:`TypeguardTransformer`.

    The source is parsed to an AST, rewritten, and only then compiled, so the
    loaded module behaves as if ``@typechecked`` had been written by hand.
    """

    def source_to_code(self, data, path, *, _optimize=-1):
        source = decode_source(data)
        # Parse only (PyCF_ONLY_AST) so the transformer can rewrite the tree
        # before any bytecode exists.
        tree = _call_with_frames_removed(
            compile,
            source,
            path,
            'exec',
            ast.PyCF_ONLY_AST,
            dont_inherit=True,
            optimize=_optimize,
        )
        tree = TypeguardTransformer().visit(tree)
        # The injected nodes carry no source positions yet; fill them in before
        # the final compilation pass.
        ast.fix_missing_locations(tree)
        return _call_with_frames_removed(
            compile,
            tree,
            path,
            'exec',
            dont_inherit=True,
            optimize=_optimize,
        )

    def exec_module(self, module):
        # Use a custom optimization marker – the import lock should make this
        # monkey patch safe
        patcher = patch(
            'importlib._bootstrap_external.cache_from_source',
            optimized_cache_from_source,
        )
        with patcher:
            return super().exec_module(module)
89 |
90 |
class TypeguardFinder(MetaPathFinder):
    """
    Wraps another path finder and instruments the module with ``@typechecked`` if
    :meth:`should_instrument` returns ``True``.

    Should not be used directly, but rather via :func:`~.install_import_hook`.

    .. versionadded:: 2.6

    """

    def __init__(self, packages, original_pathfinder):
        self.packages = packages
        self._original_pathfinder = original_pathfinder

    def find_spec(self, fullname, path=None, target=None):
        if not self.should_instrument(fullname):
            return None

        # Delegate module location to the wrapped finder, then swap in the
        # instrumenting loader (only source-file modules can be instrumented).
        spec = self._original_pathfinder.find_spec(fullname, path, target)
        if spec is not None and isinstance(spec.loader, SourceFileLoader):
            spec.loader = TypeguardLoader(spec.loader.name, spec.loader.path)
        return spec

    def should_instrument(self, module_name: str) -> bool:
        """
        Determine whether the module with the given name should be instrumented.

        :param module_name: full name of the module that is about to be imported (e.g. ``xyz.abc``)

        """
        # A module matches if it is one of the packages or lives underneath one.
        return any(
            module_name == package or module_name.startswith(package + '.')
            for package in self.packages
        )
127 |
128 |
class ImportHookManager:
    """Handle for an installed import hook.

    Returned by :func:`install_import_hook`; removes the hook either explicitly
    via :meth:`uninstall` or automatically when used as a context manager.
    """

    def __init__(self, hook: MetaPathFinder):
        self.hook = hook

    def __enter__(self):
        # Nothing to set up here: install_import_hook() already registered the hook.
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.uninstall()

    def uninstall(self):
        """Remove the hook from ``sys.meta_path``; calling this twice is harmless."""
        try:
            sys.meta_path.remove(self.hook)
        except ValueError:
            # The hook was already removed (e.g. uninstall() called twice).
            pass
144 |
145 |
def install_import_hook(packages: Iterable[str], *,
                        cls: Type[TypeguardFinder] = TypeguardFinder) -> ImportHookManager:
    """
    Install an import hook that decorates classes and functions with ``@typechecked``.

    This only affects modules loaded **after** this hook has been installed.

    :param packages: a package name, or iterable of package names, whose modules
        should be instrumented
    :param cls: the finder class to instantiate; must accept
        ``(packages, original_pathfinder)``
    :return: a context manager that uninstalls the hook on exit (or when you call ``.uninstall()``)
    :raises RuntimeError: if no ``PathFinder`` is present on ``sys.meta_path``

    .. versionadded:: 2.6

    """
    if isinstance(packages, str):
        packages = [packages]

    # Locate the stock PathFinder so the new finder can delegate module
    # location to it.  (The original code used enumerate() here but never
    # used the index.)
    for finder in sys.meta_path:
        if isclass(finder) and finder.__name__ == 'PathFinder' and hasattr(finder, 'find_spec'):
            break
    else:
        raise RuntimeError('Cannot find a PathFinder in sys.meta_path')

    hook = cls(packages, finder)
    # Insert at the front so instrumentation takes precedence over the default finders.
    sys.meta_path.insert(0, hook)
    return ImportHookManager(hook)
170 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/dataset.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from abc import ABC
3 | from typing import Any, Generic, TypeVar, get_type_hints
4 |
5 | import pandas as pd
6 |
7 | from strictly_typed_pandas.create_empty_dataframe import (
8 | create_empty_dataframe,
9 | create_empty_indexed_dataframe,
10 | )
11 | from strictly_typed_pandas.immutable import (
12 | _ImmutableiLocIndexer,
13 | _ImmutableLocIndexer,
14 | immutable_error_msg,
15 | inplace_argument_interceptor,
16 | )
17 | from strictly_typed_pandas.validate_schema import check_for_duplicate_columns, validate_schema
18 |
# Map from pd.DataFrame function name to the (unbound) function itself; used by
# DataSetBase.__getattribute__ to route DataFrame calls through the
# inplace-argument interceptor.
dataframe_functions = {
    name: function
    for name, function in inspect.getmembers(pd.DataFrame, predicate=inspect.isfunction)
}
# All attribute names defined on pd.DataFrame (functions, properties, dunders);
# used by DataSetBase.__setattr__ to tell pandas internals apart from columns.
dataframe_member_names = dict(inspect.getmembers(pd.DataFrame)).keys()
21 |
22 |
class DataSetBase(pd.DataFrame, ABC):
    """Common base class of ``DataSet`` and ``IndexedDataSet``.

    A ``pd.DataFrame`` subclass that rejects duplicate column names at
    construction time and blocks in-place modifications afterwards, so that a
    validated schema cannot silently drift.
    """

    def __init__(self, *args, **kwargs) -> None:
        """This class is a subclass of `pd.DataFrame`, hence it is initialized with the
        same parameters as a `DataFrame`.

        See the Pandas `DataFrame` documentation for more information.

        Raises:
            TypeError: if the resulting frame contains duplicate column names.
        """
        super().__init__(*args, **kwargs)

        if self.columns.duplicated().any():
            msg = "DataSet has duplicate columns: {cols}".format(
                cols=self.columns[self.columns.duplicated()]
            )
            raise TypeError(msg)

    def __setattr__(self, name: str, value: Any) -> None:
        # Always delegate to object.__setattr__ first: pandas itself stores many
        # internal attributes through this path and must not be blocked.
        object.__setattr__(self, name, value)

        # Assigning to an attribute that is also a column (and not a regular
        # DataFrame member) would mutate the data in place, which is forbidden.
        # NOTE(review): the assignment above has already happened by the time
        # this raises, so the attribute value does change — confirm intended.
        if name in self.columns and name not in dataframe_member_names:
            raise NotImplementedError(immutable_error_msg)

    def __setitem__(self, key: Any, value: Any):
        # ``df[key] = value`` is an in-place mutation; always forbidden.
        raise NotImplementedError(immutable_error_msg)

    def __getattribute__(self, name: str) -> Any:
        # Route every pd.DataFrame function through inplace_argument_interceptor
        # (from .immutable, presumably guarding against inplace=True calls —
        # see that module for the exact policy); all other attributes resolve
        # normally.
        if name in dataframe_functions:
            attribute = dataframe_functions[name].__get__(self, type(self))
            return inplace_argument_interceptor(attribute)
        else:
            return object.__getattribute__(self, name)

    @property
    def iloc(self) -> _ImmutableiLocIndexer:  # type: ignore
        # Immutable drop-in for DataFrame.iloc, defined in .immutable.
        return _ImmutableiLocIndexer("iloc", self)  # type: ignore

    @property
    def loc(self) -> _ImmutableLocIndexer:  # type: ignore
        # Immutable drop-in for DataFrame.loc, defined in .immutable.
        return _ImmutableLocIndexer("loc", self)  # type: ignore

    def to_dataframe(self) -> pd.DataFrame:
        """Converts the object to a pandas `DataFrame`."""
        return pd.DataFrame(self)

    def to_frame(self) -> pd.DataFrame:
        """Synonym of to_dataframe(): converts the object to a pandas `DataFrame`."""
        return self.to_dataframe()
69 |
70 |
# Schema type parameters: DataSet[T] uses T as its column schema;
# IndexedDataSet[T, V] uses T as the index schema and V as the data schema.
T = TypeVar("T")
V = TypeVar("V")
73 |
74 |
class DataSet(Generic[T], DataSetBase):
    """`DataSet` allows for static type checking of pandas DataFrames, for example:

    .. code-block:: python

        class Schema:
            a: int

        DataSet[Schema]({"a": [1, 2, 3]})

    Where `DataSet`:
    * is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`.
    * validates whether the data adheres to the provided schema upon its initialization.
    * is immutable, so its schema cannot be changed using inplace modifications.

    The `DataSet[Schema]` annotations are compatible with:
    * `mypy` for type checking during linting-time (i.e. while you write your code).
    * `typeguard` (<3.0) for type checking during run-time (i.e. while you run your unit tests).
    """

    # Handoff slot: __class_getitem__ stores the subscripted schema class here
    # and __init__ consumes (and clears) it.
    _schema_annotations = None

    def __class_getitem__(cls, item):
        """Allows us to define a schema for the ``DataSet``."""
        # super().__class_getitem__ returns a typing generic alias; setting a
        # non-dunder attribute on it is forwarded to the origin class by
        # typing's __setattr__, so this effectively assigns
        # DataSet._schema_annotations — which __init__ reads below.
        cls = super().__class_getitem__(item)
        cls._schema_annotations = item
        return cls

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Constructed without a schema subscript (plain DataSet(...)):
        # nothing to validate.
        if DataSet._schema_annotations is None:
            return

        # Resolve the schema's annotations, then clear the handoff slot so a
        # later unsubscripted construction is not validated against a stale
        # schema.  NOTE(review): this class-level handoff is not thread-safe —
        # confirm concurrent DataSet[Schema](...) construction is out of scope.
        schema_expected = get_type_hints(DataSet._schema_annotations)
        DataSet._schema_annotations = None

        if self.shape == (0, 0):
            # Empty construction: materialize an empty frame with the expected
            # columns and dtypes instead of validating.
            df = create_empty_dataframe(schema_expected)
            super().__init__(df)
        else:
            schema_observed = dict(zip(self.columns, self.dtypes))
            validate_schema(schema_expected, schema_observed)
118 |
119 |
class IndexedDataSet(Generic[T, V], DataSetBase):
    """`IndexedDataSet` allows for static type checking of indexed pandas DataFrames,
    for example:

    .. code-block:: text

        class IndexSchema:
            a: int

        class DataSchema:
            b: str

        df = (
            pd.DataFrame(
                {
                    "a": [1, 2, 3],
                    "b": ["1", "2", "3"]
                }
            )
            .set_index(["a"])
            .pipe(IndexedDataSet[IndexSchema, DataSchema])
        )

    Where `IndexedDataSet`:
    * is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`.
    * validates whether the data adheres to the provided schema upon its initialization.
    * is immutable, so its schema cannot be changed using inplace modifications.

    The `IndexedDataSet[IndexSchema, DataSchema]` annotations are compatible with:
    * `mypy` for type checking during linting-time (i.e. while you write your code).
    * `typeguard` (<3.0) for type checking during run-time (i.e. while you run your unit tests).
    """

    # Handoff slots: __class_getitem__ stores the subscripted schema classes
    # here and __init__ consumes (and clears) them.
    _schema_index = None
    _schema_data = None

    def __class_getitem__(cls, item):
        """Allows us to define a schema for the ``DataSet``."""
        # item is the (index_schema, data_schema) pair from
        # IndexedDataSet[IndexSchema, DataSchema]; the assignments below are
        # forwarded by typing's generic alias to the origin class, i.e. they
        # set IndexedDataSet._schema_index / _schema_data read in __init__.
        cls = super().__class_getitem__(item)
        cls._schema_index = item[0]
        cls._schema_data = item[1]
        return cls

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Constructed without a schema subscript: nothing to validate.
        if IndexedDataSet._schema_index is None or IndexedDataSet._schema_data is None:
            return

        # Resolve both schemas, then clear the handoff slots so later
        # unsubscripted constructions are not validated against stale schemas.
        # NOTE(review): class-level handoff; not thread-safe — confirm OK.
        schema_index_expected = get_type_hints(IndexedDataSet._schema_index)
        schema_data_expected = get_type_hints(IndexedDataSet._schema_data)
        IndexedDataSet._schema_index = None
        IndexedDataSet._schema_data = None

        # A column may not appear in both the index schema and the data schema.
        check_for_duplicate_columns(
            set(schema_index_expected.keys()), set(schema_data_expected.keys())
        )

        if self.shape == (0, 0) and self.index.shape == (0,):
            # Empty construction: build an empty, correctly-typed indexed frame.
            df = create_empty_indexed_dataframe(schema_index_expected, schema_data_expected)
            super().__init__(df)
        else:
            schema_data_observed = dict(zip(self.columns, self.dtypes))
            # Observed index schema: one entry per index level, keyed by level name.
            schema_index_observed = {
                name: self.index.get_level_values(i).dtype
                for i, name in enumerate(self.index.names)
            }

            # An entirely unnamed index means set_index() was never called.
            if all(name is None for name in self.index.names):
                raise TypeError("No named columns in index. Did you remember to set the index?")

            validate_schema(schema_index_expected, schema_index_observed)
            validate_schema(schema_data_expected, schema_data_observed)
193 |
--------------------------------------------------------------------------------
/docs/source/advanced.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Advanced\n",
8 | "\n",
9 | "## Subclassing schemas\n",
10 | "\n",
    "Subclassing schemas is a useful pattern for pipelines where each successive function adds a few columns."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from strictly_typed_pandas import DataSet\n",
21 | "\n",
22 | "\n",
23 | "class SchemaA:\n",
24 | " name: str\n",
25 | "\n",
26 | "\n",
27 | "class SchemaB(SchemaA):\n",
28 | " id: int\n",
29 | "\n",
30 | "\n",
31 | "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n",
32 | "\n",
33 | "\n",
34 | "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n",
35 | " return df.assign(\n",
36 | " id=lambda df: range(df.shape[0]),\n",
37 | " ).pipe(DataSet[SchemaB])"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "Similarly, you can use it when merging (or joining or concatenating) two datasets together."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "class SchemaA:\n",
54 | " id: int\n",
55 | " name: str\n",
56 | "\n",
57 | "\n",
58 | "class SchemaB:\n",
59 | " id: int\n",
60 | " job: str\n",
61 | "\n",
62 | "\n",
63 | "class SchemaAB(SchemaA, SchemaB):\n",
64 | " pass\n",
65 | "\n",
66 | "\n",
67 | "df1 = DataSet[SchemaA](\n",
68 | " {\n",
69 | " \"id\": [1, 2, 3],\n",
70 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
71 | " }\n",
72 | ")\n",
73 | "df2 = DataSet[SchemaB](\n",
74 | " {\n",
75 | " \"id\": [1, 2, 3],\n",
76 | " \"job\": \"Data Scientist\",\n",
77 | " }\n",
78 | ")\n",
79 | "df1.merge(df2, on=\"id\").pipe(DataSet[SchemaAB])"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "## Creating an empty DataSet\n",
87 | "Sometimes it's useful to create a DataSet without any rows. This can be easily done as follows:"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "class Schema:\n",
97 | " id: int\n",
98 | " name: str\n",
99 | "\n",
100 | "\n",
101 | "DataSet[Schema]()"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "## Support for numpy and pandas data types\n",
109 | "We also support using numpy types and pandas types, as well as `typing.Any`. If you miss support for any other data type, drop us a line and we'll see if we can add it!"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "import numpy as np\n",
119 | "import pandas as pd\n",
120 | "from typing import Any\n",
121 | "\n",
122 | "\n",
123 | "class Schema:\n",
124 | " name: pd.StringDtype\n",
125 | " money: np.float64\n",
126 | " eggs: np.int64\n",
127 | " potatoes: Any\n",
128 | "\n",
129 | "\n",
130 | "df = DataSet[Schema](\n",
131 | " {\n",
132 | " \"name\": pd.Series([\"John\", \"Jane\", \"Jack\"], dtype=\"string\"),\n",
133 | " \"money\": pd.Series([100.50, 1000.23, 123.45], dtype=np.float64),\n",
134 | " \"eggs\": pd.Series([1, 2, 3], dtype=np.int64),\n",
135 | " \"potatoes\": [\"1\", 0, np.nan],\n",
136 | " }\n",
137 | ")\n",
138 | "\n",
139 | "df.dtypes"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
    "## ClassVar variables\n",
    "\n",
    "Variables annotated with `typing.ClassVar` are not included in the schema, so these can be used, for example, to store metadata about the DataSet."
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "from typing import ClassVar\n",
158 | "\n",
159 | "class Schema:\n",
160 | " id: int\n",
161 | " name: str\n",
162 | " file_name: ClassVar[str] = \"schema_data.csv\"\n",
163 | "\n",
164 | "df1 = DataSet[Schema](\n",
165 | " {\n",
166 | " \"id\": [1, 2, 3],\n",
167 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
168 | " }\n",
169 | ")\n",
170 | "\n",
171 | "print(Schema.file_name)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "## IndexedDataSet\n",
179 | "\n",
180 | "If you'd like to also strictly type the index, you can use the IndexedDataSet class."
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "from strictly_typed_pandas import IndexedDataSet\n",
190 | "\n",
191 | "\n",
192 | "class IndexSchema:\n",
193 | " id: int\n",
194 | " job: str\n",
195 | "\n",
196 | "\n",
197 | "class DataSchema:\n",
198 | " name: str\n",
199 | "\n",
200 | "\n",
201 | "df = (\n",
202 | " pd.DataFrame(\n",
203 | " {\n",
204 | " \"id\": [1, 2, 3],\n",
205 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
206 | " \"job\": \"Data Scientist\",\n",
207 | " }\n",
208 | " )\n",
209 | " .set_index([\"id\", \"job\"])\n",
210 | " .pipe(IndexedDataSet[IndexSchema, DataSchema])\n",
211 | ")"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "## Reusing a variable (e.g. `df`) with different schemas\n",
219 | "Sometimes when building a pipeline, it's useful to reuse a variable (e.g. `df`) with different schemas. If we do that in the following way however, we'll get a mypy error."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "class SchemaA:\n",
229 | " name: str\n",
230 | "\n",
231 | "\n",
232 | "class SchemaB(SchemaA):\n",
233 | " id: int\n",
234 | "\n",
235 | "\n",
236 | "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n",
237 | " return df.assign(id=1).pipe(DataSet[SchemaB])\n",
238 | "\n",
239 | "\n",
240 | "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n",
241 | "df = foo(df)\n",
242 | "# mypy(error): Incompatible types in assignment (expression has type \"DataSet[SchemaB]\", variable has type \"DataSet[SchemaA]\")"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
    "To avoid this error, we need to declare that `df` will be of the type `DataSet` (implying that the schema may be different at different points)."
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "df: DataSet\n",
259 | "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n",
260 | "df = foo(df)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": []
269 | }
270 | ],
271 | "metadata": {
272 | "interpreter": {
273 | "hash": "21955bae40816b58329a864495bd83642121ab031d49eff86d34b7b0569c6cea"
274 | },
275 | "kernelspec": {
276 | "display_name": "Python 3.8.5 64-bit ('base': conda)",
277 | "name": "python3"
278 | },
279 | "language_info": {
280 | "name": "python",
281 | "version": ""
282 | },
283 | "orig_nbformat": 2
284 | },
285 | "nbformat": 4,
286 | "nbformat_minor": 2
287 | }
288 |
--------------------------------------------------------------------------------
/docs/source/deepdive_into_dtypes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Deepdive into data types"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "\n",
19 | "from typing import Any\n",
20 | "from strictly_typed_pandas import DataSet"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Numeric types\n",
28 | "\n",
    "Pandas stores all numeric data using numpy data types. For example, if we make the following `DataFrame` (where we explicitly define the data types using base python types):"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "df = pd.DataFrame(\n",
39 | " {\n",
40 | " \"a\": pd.Series([1, 2, 3], dtype=int),\n",
41 | " \"b\": pd.Series([1.0, 2.0, 3.0], dtype=float),\n",
42 | " \"c\": pd.Series([True, False, True], dtype=bool),\n",
43 | " }\n",
44 | ")\n",
45 | "\n",
46 | "df.dtypes"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "Then we see that all columns have a numpy data type."
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "assert df.dtypes[\"a\"] == np.int64\n",
63 | "assert df.dtypes[\"b\"] == np.float64\n",
64 | "assert df.dtypes[\"c\"] == np.bool_"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "Interestingly, numpy data types are by default equal to their base python counterparts."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "assert df.dtypes[\"a\"] == int\n",
81 | "assert df.dtypes[\"b\"] == float\n",
82 | "assert df.dtypes[\"c\"] == bool"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "Following this mindset, we allow the schemas to be defined using either numpy or base python data types."
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "class Schema:\n",
99 | " a: int\n",
100 | " b: float\n",
101 | " c: bool\n",
102 | "\n",
103 | "\n",
104 | "df = DataSet[Schema]()\n",
105 | "df.dtypes"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "class Schema:\n",
115 | " a: np.int64\n",
116 | " b: np.float64\n",
117 | " c: np.bool_\n",
118 | "\n",
119 | "\n",
120 | "df = DataSet[Schema]()\n",
121 | "df.dtypes"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "You can also define your schema with superclasses (e.g. `np.integer`) instead of specific classes (e.g. `np.int64`)."
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "class Schema:\n",
138 | " a: np.integer\n",
139 | "\n",
140 | "\n",
141 | "df = DataSet[Schema](\n",
142 | " {\n",
143 | " \"a\": pd.Series([1, 2, 3], dtype=np.int64),\n",
144 | " }\n",
145 | ")\n",
146 | "df.dtypes"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Datetime and timedelta\n",
154 | "These too are defined using numpy.\n"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "class Schema:\n",
164 | " a: np.datetime64\n",
165 | " b: np.timedelta64\n",
166 | "\n",
167 | "\n",
168 | "df = DataSet[Schema]()\n",
169 | "df.dtypes"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "## Pandas data types\n",
177 | "Pandas has a number of its own data types, to allow for things like:\n",
178 | "\n",
179 | "* Timezones\n",
180 | "\n",
181 | "* Categorical values\n",
182 | "\n",
183 | "* Sparse data"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "class Schema:\n",
193 | " a: pd.DatetimeTZDtype(tz=\"UTC\") # type: ignore # noqa: F821\n",
194 | " b: pd.CategoricalDtype\n",
195 | " c: pd.PeriodDtype(freq=\"D\") # type: ignore # noqa: F821\n",
196 | " d: pd.SparseDtype(dtype=np.int64) # type: ignore\n",
197 | " e: pd.IntervalDtype\n",
198 | " f: pd.Int64Dtype\n",
199 | " h: pd.BooleanDtype\n",
200 | "\n",
201 | "\n",
202 | "df = DataSet[Schema]()\n",
203 | "df.dtypes"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "Some of these types accept arguments (e.g. `pd.DatetimeTZDtype(tz=\"UTC\")`). While this works perfectly well during run-time, it does result in linting errors. You can suppress these without any problems by using `# type: ignore # noqa: F821`.\n",
211 | "\n",
212 | "Note that the pandas data types are not considered equivalent to their numpy or base python equivalents."
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "class SchemaA:\n",
222 | " a: pd.Int64Dtype\n",
223 | "\n",
224 | "\n",
225 | "class SchemaB:\n",
226 | " a: np.int64\n",
227 | "\n",
228 | "\n",
229 | "try:\n",
230 | " DataSet[SchemaA]().pipe(DataSet[SchemaB])\n",
231 | "except TypeError as e:\n",
232 | " print(e)"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "## Strings\n",
240 | "String types are complicated business in pandas. From pandas 1.0.0 and higher, we suggest using the `string` (i.e. `pd.StringDtype`) data type. When defining a schema, this data type is compatible with both the base python `str` annotation and the pandas `pd.StringDtype` annotation."
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "class Schema:\n",
250 | " a: str\n",
251 | " b: pd.StringDtype\n",
252 | "\n",
253 | "\n",
254 | "df = DataSet[Schema](\n",
255 | " {\n",
256 | " \"a\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\"),\n",
257 | " \"b\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\"),\n",
258 | " }\n",
259 | ")\n",
260 | "df.dtypes"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
    "Unfortunately, `pd.StringDtype` has only been around briefly: it isn't available in older versions of pandas, and as of yet it is still not used by default when creating a DataFrame with strings. Instead, strings are by default stored as the nondescript `object` type."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "df = pd.DataFrame({\"a\": [\"a\", \"b\", \"c\"]})\n",
277 | "df.dtypes"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
    "To be consistent, we have decided to set `str == object` when checking the schema, at least until `pd.StringDtype` becomes the default data type for strings in pandas."
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "class Schema:\n",
294 | " a: str\n",
295 | "\n",
296 | "\n",
297 | "df = DataSet[Schema]({\"a\": [\"a\", \"b\", \"c\"]})\n",
298 | "df.dtypes"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "Note that this is horribly unspecific. For example, the following `DataSet` contains a column `a` with data type `object`, which contains several things that are definitely not strings. However, since we had to agree that `object == str`, this currently passes without failure."
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "class Schema:\n",
315 | " a: str\n",
316 | "\n",
317 | "\n",
318 | "df = DataSet[Schema]({\"a\": [None, 42, lambda x: x]})\n",
319 | "df.dtypes"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "We hope that `pd.StringDtype` will soon be the default string type, so that we can avoid the problem outlined above. Until then, if you want to be sure that your string columns are actually strings, it's best to use `pd.StringDtype` for your type annotations."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "class Schema:\n",
336 | " a: pd.StringDtype\n",
337 | "\n",
338 | "\n",
339 | "df = DataSet[Schema]({\"a\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\")})"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "try:\n",
349 | " DataSet[Schema]({\"a\": [None, 42, lambda x: x]})\n",
350 | "except TypeError as e:\n",
351 | " print(e)"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "metadata": {},
357 | "source": [
358 | "## The `Any` type\n",
359 | "\n",
360 | "In some cases it is useful to be able to define that a column can have `Any` type. This can either be a column of a specific type (e.g. `int64`) or a mix of data types (i.e. an `object`)"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": null,
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "class Schema:\n",
370 | " a: Any\n",
371 | " b: Any\n",
372 | "\n",
373 | "\n",
374 | "df = DataSet[Schema](\n",
375 | " {\n",
376 | " \"a\": [1, 2, 3],\n",
377 | " \"b\": [\"1\", 2, None],\n",
378 | " }\n",
379 | ")\n",
380 | "df.dtypes"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "## Anything missing?\n",
388 | "There's a zoo of data types used in pandas. Is anything missing? Contact me and I'll look into it!"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": []
397 | }
398 | ],
399 | "metadata": {
400 | "kernelspec": {
401 | "display_name": "stp",
402 | "language": "python",
403 | "name": "python3"
404 | },
405 | "language_info": {
406 | "codemirror_mode": {
407 | "name": "ipython",
408 | "version": 3
409 | },
410 | "file_extension": ".py",
411 | "mimetype": "text/x-python",
412 | "name": "python",
413 | "nbconvert_exporter": "python",
414 | "pygments_lexer": "ipython3",
415 | "version": "3.8.13"
416 | },
417 | "orig_nbformat": 2,
418 | "vscode": {
419 | "interpreter": {
420 | "hash": "0785e816af5df78c77a9de5b5385808c06b955fe7dba50fa53415245f1f2e5ee"
421 | }
422 | }
423 | },
424 | "nbformat": 4,
425 | "nbformat_minor": 2
426 | }
427 |
--------------------------------------------------------------------------------
/docs/source/getting_started.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Getting started\n",
8 | "\n",
9 | "## The problem\n",
10 | "\n",
11 | "I love Pandas! But in production code I’m always a bit wary when I see:"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import pandas as pd\n",
21 | "\n",
22 | "\n",
23 | "def foo(df: pd.DataFrame) -> pd.DataFrame:\n",
24 | " # do stuff\n",
25 | " return df"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "Because… How do I know which columns are supposed to be in `df`?\n",
33 | "\n",
34 | "Sure, in a notebook this is often not a big problem, because we'll likely have\n",
35 | "\n",
36 | "* a few hundred lines of code\n",
37 | "\n",
38 | "* that you're working on alone\n",
39 | "\n",
40 | "* over a limited amount of time\n",
41 | "\n",
42 | "But what if this is production code, where we have:\n",
43 | "\n",
44 | "* \\>1000 lines of code\n",
45 | "\n",
46 | "* that we are maintaining for years to come\n",
47 | "\n",
48 | "* potentially by colleagues who haven't even been hired yet\n",
49 | "\n",
50 | "You'll probably want to be a bit more explicit about what these DataFrames should look like!\n",
51 | "\n",
52 | "## The solution: static type checking of pandas DataFrames\n",
53 | "\n",
54 | "Suppose we know that our DataFrame has two columns: `id` (an int) and `name` (a str). Using `strictly_typed_pandas`, we may write that down as follows."
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "from strictly_typed_pandas import DataSet\n",
64 | "\n",
65 | "\n",
66 | "class Schema:\n",
67 | " id: int\n",
68 | " name: str\n",
69 | "\n",
70 | "\n",
71 | "def foo(df: DataSet[Schema]) -> DataSet[Schema]:\n",
72 | " # do stuff\n",
73 | " return df"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "These type definitions can now be checked using `mypy`, a linter for static type checking. The big benefit of `mypy` is that the type checking doesn't happen during run-time, but rather during linting time (so while you're coding), saving you precious time. If you haven't already, you should really check out how to set up `mypy` for your IDE.\n",
81 | "\n",
82 | "Let's consider an example of how this works. First, we'll create some data. Since `DataSet` is a subclass of `pd.DataFrame`, it has (nearly) all the functionality of a `DataFrame`, including:"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "df = DataSet[Schema](\n",
92 | " {\n",
93 | " \"id\": [1, 2, 3],\n",
94 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
95 | " }\n",
96 | ")\n",
97 | "df"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "We can now call `foo()` with our data. All types check out, so nothing special happens."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "res = foo(df)"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "However, if we instead try to run `foo()` on a `DataFrame`, mypy will throw the following error.\n",
121 | "\n",
122 | "(Shown as a comment here, but it will show up in your IDE if you set up mypy.)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "df = pd.DataFrame(df)\n",
132 | "res = foo(df)\n",
133 | "# mypy(error): Argument 1 to \"foo\" has incompatible type \"DataFrame\"; expected \"DataSet[Schema]\""
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Likewise, if we call `foo()` on a `DataSet` with an alternative schema, mypy will throw the following error."
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "class AlternativeSchema:\n",
150 | " id: int\n",
151 | " first_name: str\n",
152 | "\n",
153 | "\n",
154 | "df = DataSet[AlternativeSchema](\n",
155 | " {\n",
156 | " \"id\": [1, 2, 3],\n",
157 | " \"first_name\": [\"John\", \"Jane\", \"Jack\"],\n",
158 | " }\n",
159 | ")\n",
160 | "try:\n",
161 | " res = foo(df)\n",
162 | " # mypy(error): Argument 1 to \"foo\" has incompatible type \"DataSet[AlternativeSchema]\"; expected \"DataSet[Schema]\"\n",
163 | "except:\n",
164 | " pass"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "## How can we be sure that a DataSet adheres to its schema?\n",
172 | "\n",
173 | "The above is great if everyone is meticulous in keeping the schema annotations correct and up-to-date. But shouldn't we be worried that these schema annotations get out of sync? For example:"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "class Schema:\n",
183 | " id: int\n",
184 | " name: str\n",
185 | "\n",
186 | "\n",
187 | "def foo() -> DataSet[Schema]:\n",
188 | " return DataSet[Schema](\n",
189 | " {\n",
190 | " \"id\": [1, 2, 3],\n",
191 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
192 | " \"job\": \"Data Scientist\",\n",
193 | " }\n",
194 | " )"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "Fortunately, we have some extra precautions in place that prevent the above scenario:\n",
202 | "\n",
203 | "* The schema of the data is validated during the `DataSet` creation.\n",
204 | "\n",
205 | "* `DataSet` is immutable, so its schema cannot change due to inplace modifications.\n",
206 | "\n",
207 | "As we will see, this means that if your codebase (e.g. `foo()`) is unit tested, functions like the above will result in errors and hence they shouldn't make it to the master branch. As such, you will be able to trust the schema annotations in your code base.\n",
208 | "\n",
"Let's have a look at these precautions in more detail. First, if the columns in the data do not correspond to the ones defined in the schema, we get a TypeError, for example:\n"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "try:\n",
219 | " df = DataSet[Schema]({\"id\": [1, 2, 3]})\n",
220 | "except TypeError as e:\n",
221 | " print(e)"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "Similarly, if the types defined in the schema don't match the types in the data, we again get a `TypeError`."
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "try:\n",
238 | " df = DataSet[Schema](\n",
239 | " {\n",
240 | " \"id\": [1, 2, 3],\n",
241 | " \"name\": [1, 2, 3],\n",
242 | " }\n",
243 | " )\n",
244 | "except TypeError as e:\n",
245 | " print(e)"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
"Hence, when we successfully create our `DataSet[Schema]`, we can be certain that it adheres to the schema. \n",
253 | "\n",
254 | "Of course, for this to work, we do need to make sure that the `DataSet`'s columns and datatypes cannot be changed after its creation. This brings us to our second point: \n",
255 | "\n",
256 | "* `DataSet` is immutable, so its schema cannot change due to inplace modifications.\n",
257 | "\n",
258 | "To this end, we have disabled operations such as:"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "df = DataSet[Schema](\n",
268 | " {\n",
269 | " \"id\": [1, 2, 3],\n",
270 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
271 | " }\n",
272 | ")\n",
273 | "ids = [\"1\", \"2\", \"3\"]\n",
274 | "try:\n",
275 | " df[\"id\"] = ids\n",
276 | " df.id = ids\n",
277 | " df.loc[:, \"id\"] = ids\n",
278 | " df.iloc[:, 0] = ids\n",
279 | " df.assign(id=ids, inplace=True)\n",
280 | "except NotImplementedError as e:\n",
281 | " print(e)"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "When you do need to make changes to the schema, you can either cast the `DataSet` back to a `DataFrame`."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "df = df.to_dataframe()"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "Or you can perform the `assign()` in the following way, which also casts it to a `DataFrame`"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "df = df.assign(id=ids)\n",
314 | "assert type(df) == pd.DataFrame"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "In practice, this often means that functions have the following sequence:\n",
322 | "\n",
323 | "1. The input is a `DataSet[SchemaA]`\n",
324 | "\n",
325 | "2. The data is converted to a `DataFrame` so changes can be made\n",
326 | "\n",
327 | "3. The output is cast to `DataSet[SchemaB]`"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "class SchemaA:\n",
337 | " name: str\n",
338 | "\n",
339 | "\n",
340 | "class SchemaB:\n",
341 | " id: int\n",
342 | " name: str\n",
343 | "\n",
344 | "\n",
345 | "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n",
346 | "\n",
347 | "\n",
348 | "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n",
349 | " n = df.shape[0]\n",
350 | " ids = range(n)\n",
351 | " new_df = df.assign(id=ids)\n",
352 | " return DataSet[SchemaB](new_df)"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {},
358 | "source": [
359 | "Or alternatively in the more compact version"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
"def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n",
369 | " return df.assign(\n",
370 | " id=lambda df: range(df.shape[0]),\n",
371 | " ).pipe(DataSet[SchemaB])"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "metadata": {},
377 | "source": [
378 | "## What about functions that return `Any`?\n",
379 | "So far we've seen that we can strictly type check our pandas data using a combination of linting checks and runtime checks. So is there anything that we haven't covered yet? Well, it turns out there is. Consider the following example.\n"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "class Schema:\n",
389 | " id: int\n",
390 | " name: str\n",
391 | "\n",
392 | "\n",
393 | "def foo() -> DataSet[Schema]:\n",
394 | " return (\n",
395 | " DataSet[Schema](\n",
396 | " {\n",
397 | " \"id\": [1, 2, 3],\n",
398 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
399 | " }\n",
400 | " )\n",
401 | " .assign(job=\"Data Scientist\")\n",
402 | " .iloc[:3]\n",
403 | " )\n",
404 | "\n",
405 | "\n",
406 | "res = foo()"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "Now this is interesting: `foo()` clearly returns something that doesn't adhere to the schema, but the above gives neither a linting error nor a runtime error!\n",
414 | "\n",
415 | "It turns out that the above problem often happens with functions like `iloc`, `loc` and `pipe`, whose return type is `Any` (and when you think about it, these can indeed return any possible datatype). When mypy sees that the return type is `Any`, it reasons that that could still be a `DataSet[Schema]` object, so it doesn't raise an error. It's only during runtime that we find out here that the return type actually is a `DataFrame`, but `mypy` doesn't do any runtime checks.\n",
416 | "\n",
417 | "Fortunately, Python offers other ways to do type checking during runtime. Here, we will use the `typeguard` package. "
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "from strictly_typed_pandas.typeguard import typechecked\n",
427 | "\n",
428 | "\n",
429 | "@typechecked\n",
430 | "def foo() -> DataSet[Schema]:\n",
431 | " return (\n",
432 | " DataSet[Schema](\n",
433 | " {\n",
434 | " \"id\": [1, 2, 3],\n",
435 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n",
436 | " }\n",
437 | " )\n",
438 | " .assign(job=\"Data Scientist\")\n",
439 | " .iloc[:3]\n",
440 | " )\n",
441 | "\n",
442 | "\n",
443 | "try:\n",
444 | " res = foo()\n",
445 | "except TypeError as e:\n",
446 | " print(e)"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {},
452 | "source": [
453 | "Alright, we now caught the error dead in its tracks! \n",
454 | "\n",
455 | "We can improve this with one more step: instead of adding the `@typechecked` decorator to every function by hand (which could be error prone), `typeguard` can do this automatically when running the unit tests. To do this, simply run your unit tests using `pytest --stp-typeguard-packages=foo.bar` (where `foo.bar` is your package name)\n",
456 | "\n",
457 | "## Conclusions\n",
458 | "\n",
459 | "We can statically type check pandas in the following way:"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "from strictly_typed_pandas import DataSet\n",
469 | "\n",
470 | "\n",
471 | "class Schema:\n",
472 | " id: int\n",
473 | " name: str\n",
474 | "\n",
475 | "\n",
476 | "def foo(df: DataSet[Schema]) -> DataSet[Schema]:\n",
477 | " # do stuff\n",
478 | " return df"
479 | ]
480 | },
481 | {
482 | "cell_type": "markdown",
483 | "metadata": {},
484 | "source": [
485 | "Where `DataSet`:\n",
486 | "\n",
487 | "* is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`.\n",
488 | "\n",
489 | "* validates whether the data adheres to the provided schema upon its initialization.\n",
490 | "\n",
491 | "* is immutable, so its schema cannot be changed using inplace modifications.\n",
492 | "\n",
493 | "The `DataSet[Schema]` annotations are compatible with:\n",
494 | "\n",
495 | "* `mypy` for type checking during linting-time (i.e. while you write your code).\n",
496 | "\n",
497 | "* `typeguard` for type checking during run-time (i.e. while you run your unit tests).\n",
498 | "\n",
499 | "To get the most out of `strictly_typed_pandas`, be sure to:\n",
500 | "\n",
501 | "* set up `mypy` in your IDE.\n",
502 | "\n",
503 | "* run your unit tests with `pytest --stp-typeguard-packages=foo.bar` (where `foo.bar` is your package name)."
504 | ]
505 | },
506 | {
507 | "cell_type": "markdown",
508 | "metadata": {},
509 | "source": []
510 | }
511 | ],
512 | "metadata": {
513 | "interpreter": {
514 | "hash": "21955bae40816b58329a864495bd83642121ab031d49eff86d34b7b0569c6cea"
515 | },
516 | "kernelspec": {
517 | "display_name": "Python 3.8.5 64-bit ('base': conda)",
518 | "name": "python3"
519 | },
520 | "language_info": {
521 | "name": "python",
522 | "version": ""
523 | },
524 | "orig_nbformat": 2
525 | },
526 | "nbformat": 4,
527 | "nbformat_minor": 2
528 | }
529 |
--------------------------------------------------------------------------------
/strictly_typed_pandas/_vendor/typeguard/__init__.py:
--------------------------------------------------------------------------------
# Public API of this vendored copy of typeguard.
__all__ = ('ForwardRefPolicy', 'TypeHintWarning', 'typechecked', 'check_return_type',
           'check_argument_types', 'check_type', 'TypeWarning', 'TypeChecker',
           'typeguard_ignore')
4 |
5 | import collections.abc
6 | import gc
7 | import inspect
8 | import sys
9 | import threading
10 | from collections import OrderedDict
11 | from enum import Enum
12 | from functools import partial, wraps
13 | from inspect import Parameter, isclass, isfunction, isgeneratorfunction
14 | from io import BufferedIOBase, IOBase, RawIOBase, TextIOBase
15 | from traceback import extract_stack, print_stack
16 | from types import CodeType, FunctionType
17 | from typing import (
18 | IO, TYPE_CHECKING, AbstractSet, Any, AsyncIterable, AsyncIterator, BinaryIO, Callable, Dict,
19 | Generator, Iterable, Iterator, List, NewType, Optional, Sequence, Set, TextIO, Tuple, Type,
20 | TypeVar, Union, get_type_hints, overload)
21 | from unittest.mock import Mock
22 | from warnings import warn
23 | from weakref import WeakKeyDictionary, WeakValueDictionary
24 |
# Python 3.8+ (Literal in typing); prefer the typing_extensions backport if present
try:
    from typing_extensions import Literal
except ImportError:
    try:
        from typing import Literal
    except ImportError:
        Literal = None  # sentinel: Literal checks are skipped when unavailable

# Python 3.5.4+ / 3.6.2+ (NoReturn in typing)
try:
    from typing_extensions import NoReturn
except ImportError:
    try:
        from typing import NoReturn
    except ImportError:
        NoReturn = None  # sentinel: NoReturn checks are skipped when unavailable

# Python 3.6+ (async generators); provide always-False fallbacks before that
try:
    from inspect import isasyncgen, isasyncgenfunction
    from typing import AsyncGenerator
except ImportError:
    AsyncGenerator = None

    def isasyncgen(obj):
        # Fallback: this interpreter has no async generators
        return False

    def isasyncgenfunction(func):
        # Fallback: this interpreter has no async generator functions
        return False

# Python 3.8+: public ForwardRef with _evaluate; older versions expose the
# private _ForwardRef with _eval_type instead
try:
    from typing import ForwardRef
    evaluate_forwardref = ForwardRef._evaluate
except ImportError:
    from typing import _ForwardRef as ForwardRef
    evaluate_forwardref = ForwardRef._eval_type

# typing.is_typeddict exists from Python 3.10; on older versions emulate it by
# collecting every known TypedDict metaclass and testing with isinstance()
if sys.version_info >= (3, 10):
    from typing import is_typeddict
else:
    _typed_dict_meta_types = ()
    if sys.version_info >= (3, 8):
        from typing import _TypedDictMeta
        _typed_dict_meta_types += (_TypedDictMeta,)

    try:
        from typing_extensions import _TypedDictMeta
        _typed_dict_meta_types += (_TypedDictMeta,)
    except ImportError:
        pass

    def is_typeddict(tp) -> bool:
        return isinstance(tp, _typed_dict_meta_types)


if TYPE_CHECKING:
    _F = TypeVar("_F")

    def typeguard_ignore(f: _F) -> _F:
        """This decorator is a noop during static type-checking."""
        return f
else:
    from typing import no_type_check as typeguard_ignore


# Cache of resolved type hints, keyed weakly by function (see _CallMemo)
_type_hints_map = WeakKeyDictionary()  # type: Dict[FunctionType, Dict[str, Any]]
# Weak mapping from a code object back to its function (see find_function())
_functions_map = WeakValueDictionary()  # type: Dict[CodeType, FunctionType]
# Sentinel used to distinguish "key absent" from "value is None"
_missing = object()

T_CallableOrType = TypeVar('T_CallableOrType', bound=Callable[..., Any])
97 |
# Lifted from mypy.sharedparse
# Names of the binary, reflected and in-place operator dunder methods
# (usage of this constant is outside this chunk of the file).
BINARY_MAGIC_METHODS = {
    "__add__",
    "__and__",
    "__cmp__",
    "__divmod__",
    "__div__",
    "__eq__",
    "__floordiv__",
    "__ge__",
    "__gt__",
    "__iadd__",
    "__iand__",
    "__idiv__",
    "__ifloordiv__",
    "__ilshift__",
    "__imatmul__",
    "__imod__",
    "__imul__",
    "__ior__",
    "__ipow__",
    "__irshift__",
    "__isub__",
    "__itruediv__",
    "__ixor__",
    "__le__",
    "__lshift__",
    "__lt__",
    "__matmul__",
    "__mod__",
    "__mul__",
    "__ne__",
    "__or__",
    "__pow__",
    "__radd__",
    "__rand__",
    "__rdiv__",
    "__rfloordiv__",
    "__rlshift__",
    "__rmatmul__",
    "__rmod__",
    "__rmul__",
    "__ror__",
    "__rpow__",
    "__rrshift__",
    "__rshift__",
    "__rsub__",
    "__rtruediv__",
    "__rxor__",
    "__sub__",
    "__truediv__",
    "__xor__",
}
151 |
152 |
class ForwardRefPolicy(Enum):
    """Defines how unresolved forward references are handled.

    Consumed by :class:`_CallMemo` when ``get_type_hints()`` raises a
    :exc:`NameError` for an annotation given as a string.
    """

    ERROR = 1  #: propagate the :exc:`NameError` from :func:`~typing.get_type_hints`
    WARN = 2  #: remove the annotation and emit a TypeHintWarning
    #: replace the annotation with the argument's class if the qualified name matches, else remove
    #: the annotation
    GUESS = 3
161 |
162 |
class TypeHintWarning(UserWarning):
    """
    A warning that is emitted when a type hint in string form could not be resolved to an actual
    type.

    Subclasses :class:`UserWarning`, so it can be filtered with the standard
    :mod:`warnings` machinery.
    """
168 |
169 |
170 | class _TypeCheckMemo:
171 | __slots__ = 'globals', 'locals'
172 |
173 | def __init__(self, globals: Dict[str, Any], locals: Dict[str, Any]):
174 | self.globals = globals
175 | self.locals = locals
176 |
177 |
178 | def _strip_annotation(annotation):
179 | if isinstance(annotation, str):
180 | return annotation.strip("'")
181 | else:
182 | return annotation
183 |
184 |
class _CallMemo(_TypeCheckMemo):
    """Per-call context: the target function, its bound arguments and its resolved type hints.

    Resolved hints are cached in the module-level ``_type_hints_map``, so the
    (potentially expensive) ``get_type_hints()`` call happens once per function.
    """

    __slots__ = 'func', 'func_name', 'arguments', 'is_generator', 'type_hints'

    def __init__(self, func: Callable, frame_locals: Optional[Dict[str, Any]] = None,
                 args: Optional[tuple] = None, kwargs: Optional[Dict[str, Any]] = None,
                 forward_refs_policy=ForwardRefPolicy.ERROR):
        super().__init__(func.__globals__, frame_locals)
        self.func = func
        self.func_name = function_name(func)
        self.is_generator = isgeneratorfunction(func)
        signature = inspect.signature(func)

        if args is not None and kwargs is not None:
            # Explicit call arguments: bind them against the signature
            self.arguments = signature.bind(*args, **kwargs).arguments
        else:
            # No explicit arguments: fall back to the caller's frame locals
            assert frame_locals is not None, 'frame must be specified if args or kwargs is None'
            self.arguments = frame_locals

        self.type_hints = _type_hints_map.get(func)
        if self.type_hints is None:
            # Loop: a GUESSed or removed annotation re-runs hint resolution (via `continue`)
            while True:
                if sys.version_info < (3, 5, 3):
                    frame_locals = dict(frame_locals)

                try:
                    hints = get_type_hints(func, localns=frame_locals)
                except NameError as exc:
                    if forward_refs_policy is ForwardRefPolicy.ERROR:
                        raise

                    # Find the parameter whose annotation is the unresolvable name
                    typename = str(exc).split("'", 2)[1]
                    for param in signature.parameters.values():
                        if _strip_annotation(param.annotation) == typename:
                            break
                    else:
                        raise

                    func_name = function_name(func)
                    if forward_refs_policy is ForwardRefPolicy.GUESS:
                        # Guess: use the class of the actual argument if its qualified
                        # name matches the unresolved annotation string
                        if param.name in self.arguments:
                            argtype = self.arguments[param.name].__class__
                            stripped = _strip_annotation(param.annotation)
                            if stripped == argtype.__qualname__:
                                func.__annotations__[param.name] = argtype
                                msg = ('Replaced forward declaration {!r} in {} with {!r}'
                                       .format(stripped, func_name, argtype))
                                warn(TypeHintWarning(msg))
                                continue

                    # WARN policy (or a failed GUESS): drop the annotation and warn
                    msg = 'Could not resolve type hint {!r} on {}: {}'.format(
                        param.annotation, function_name(func), exc)
                    warn(TypeHintWarning(msg))
                    del func.__annotations__[param.name]
                else:
                    break

            self.type_hints = OrderedDict()
            for name, parameter in signature.parameters.items():
                if name in hints:
                    annotated_type = hints[name]

                    # Implicit Optional for "= None" defaults: PEP 484 discourages
                    # this, but mypy accepts it, so mirror mypy's behavior here
                    if parameter.default is None:
                        annotated_type = Optional[annotated_type]

                    # A *args / **kwargs annotation describes one element/value
                    if parameter.kind == Parameter.VAR_POSITIONAL:
                        self.type_hints[name] = Tuple[annotated_type, ...]
                    elif parameter.kind == Parameter.VAR_KEYWORD:
                        self.type_hints[name] = Dict[str, annotated_type]
                    else:
                        self.type_hints[name] = annotated_type

            if 'return' in hints:
                self.type_hints['return'] = hints['return']

            _type_hints_map[func] = self.type_hints
261 |
262 |
def resolve_forwardref(maybe_ref, memo: _TypeCheckMemo):
    """Evaluate a :class:`typing.ForwardRef` against the memo's namespaces.

    Anything that is not a ForwardRef is returned unchanged.
    """
    if not isinstance(maybe_ref, ForwardRef):
        return maybe_ref

    if sys.version_info >= (3, 9, 0):
        # 3.9+ added a mandatory recursive-guard argument
        return evaluate_forwardref(maybe_ref, memo.globals, memo.locals, frozenset())
    return evaluate_forwardref(maybe_ref, memo.globals, memo.locals)
272 |
273 |
def get_type_name(type_):
    """Render a human-readable name for a type or annotation, including generic args."""
    name = (getattr(type_, '__name__', None) or getattr(type_, '_name', None) or
            getattr(type_, '__forward_arg__', None))
    if name is None:
        # Fall back to the generic alias' origin, then to the implementation class name
        name = getattr(getattr(type_, '__origin__', None), '_name', None)
        if name is None and not inspect.isclass(type_):
            name = type_.__class__.__name__.strip('_')

    args = getattr(type_, '__args__', ()) or getattr(type_, '__values__', ())
    if args != getattr(type_, '__parameters__', ()):
        # Literal arguments are plain values, not types, so render them with str()
        if name == 'Literal':
            rendered = (str(arg) for arg in args)
        else:
            rendered = (get_type_name(arg) for arg in args)

        name = '{}[{}]'.format(name, ', '.join(rendered))

    module = getattr(type_, '__module__', None)
    if module not in (None, 'typing', 'typing_extensions', 'builtins'):
        name = module + '.' + name

    return name
297 |
298 |
def find_function(frame) -> Optional[Callable]:
    """
    Return the function object whose code object matches the given frame's.

    The garbage collector is used to look the function up, which is unreliable in
    principle since several functions may share one code object; in practice such
    functions carrying *different* type annotations are vanishingly rare.

    :param frame: a frame object
    :return: a function object if one was found, ``None`` if the match was ambiguous
    :raises LookupError: if no function object references the code object

    """
    cached = _functions_map.get(frame.f_code)
    if cached is not None:
        return cached

    candidate = None
    for referrer in gc.get_referrers(frame.f_code):
        if inspect.isfunction(referrer):
            if candidate is not None:
                # More than one function shares this code object: ambiguous
                return None
            candidate = referrer

    if candidate is None:
        raise LookupError('target function not found')

    # Cache the unique match for future lookups
    _functions_map[frame.f_code] = candidate
    return candidate
329 |
330 |
def qualified_name(obj) -> str:
    """
    Return the qualified name (e.g. package.module.Type) for the given object.

    Builtins and types from the :mod:`typing` package get special treatment by having the module
    name stripped from the generated name.

    """
    cls = obj if inspect.isclass(obj) else type(obj)
    if cls.__module__ in ('typing', 'builtins'):
        return cls.__qualname__
    return '{}.{}'.format(cls.__module__, cls.__qualname__)
343 |
344 |
def function_name(func: Callable) -> str:
    """
    Return the qualified name of the given function.

    Builtins and types from the :mod:`typing` package get special treatment by having the module
    name stripped from the generated name.

    """
    # partial objects and custom __call__ objects may lack __qualname__;
    # functions defined via exec() with a custom namespace may have __module__ = None
    module = getattr(func, '__module__', '') or ''
    prefix = '' if module in ('builtins', '') else module + '.'
    return prefix + getattr(func, '__qualname__', repr(func))
358 |
359 |
def check_callable(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Check that ``value`` is callable and that its declared arity fits ``Callable[...]``.

    Only parameter counts/kinds are validated here; the callable's own argument and
    return types are not introspected.
    """
    if not callable(value):
        raise TypeError('{} must be a callable'.format(argname))

    if getattr(expected_type, "__args__", None):
        try:
            signature = inspect.signature(value)
        except (TypeError, ValueError):
            # Some callables (e.g. certain builtins) expose no signature; skip arity checks
            return

        if hasattr(expected_type, '__result__'):
            # Python 3.5
            argument_types = expected_type.__args__
            check_args = argument_types is not Ellipsis
        else:
            # Python 3.6
            argument_types = expected_type.__args__[:-1]
            check_args = argument_types != (Ellipsis,)

        if check_args:
            # The callable must not have keyword-only arguments without defaults
            unfulfilled_kwonlyargs = [
                param.name for param in signature.parameters.values() if
                param.kind == Parameter.KEYWORD_ONLY and param.default == Parameter.empty]
            if unfulfilled_kwonlyargs:
                raise TypeError(
                    'callable passed as {} has mandatory keyword-only arguments in its '
                    'declaration: {}'.format(argname, ', '.join(unfulfilled_kwonlyargs)))

            # Count parameters that can only be satisfied positionally and have no default
            num_mandatory_args = len([
                param.name for param in signature.parameters.values()
                if param.kind in (Parameter.POSITIONAL_ONLY, Parameter.POSITIONAL_OR_KEYWORD) and
                param.default is Parameter.empty])
            has_varargs = any(param for param in signature.parameters.values()
                              if param.kind == Parameter.VAR_POSITIONAL)

            if num_mandatory_args > len(argument_types):
                raise TypeError(
                    'callable passed as {} has too many arguments in its declaration; expected {} '
                    'but {} argument(s) declared'.format(argname, len(argument_types),
                                                         num_mandatory_args))
            # A *args parameter can absorb any number of declared argument types
            elif not has_varargs and num_mandatory_args < len(argument_types):
                raise TypeError(
                    'callable passed as {} has too few arguments in its declaration; expected {} '
                    'but {} argument(s) declared'.format(argname, len(argument_types),
                                                         num_mandatory_args))
406 |
407 |
def check_dict(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Ensure ``value`` is a dict and, for a parametrized ``Dict[K, V]``, check every item."""
    if not isinstance(value, dict):
        raise TypeError('type of {} must be a dict; got {} instead'.
                        format(argname, qualified_name(value)))

    if expected_type is dict:
        return

    args = getattr(expected_type, "__args__", None)
    if args is None or args == expected_type.__parameters__:
        # Unparametrized Dict: nothing more to check
        return

    key_type, value_type = args
    if key_type is Any and value_type is Any:
        return

    for k, v in value.items():
        check_type('keys of {}'.format(argname), k, key_type, memo)
        check_type('{}[{!r}]'.format(argname, k), v, value_type, memo)
421 |
422 |
def check_typed_dict(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Validate ``value`` against a TypedDict: reject extra/missing keys, then check item types."""
    declared = frozenset(expected_type.__annotations__)
    try:
        required = expected_type.__required_keys__
    except AttributeError:  # py3.8 and lower
        required = declared if expected_type.__total__ else frozenset()

    present = frozenset(value)

    unexpected = present - declared
    if unexpected:
        formatted = ', '.join('"{}"'.format(key) for key in sorted(unexpected))
        raise TypeError('extra key(s) ({}) in {}'.format(formatted, argname))

    absent = required - present
    if absent:
        formatted = ', '.join('"{}"'.format(key) for key in sorted(absent))
        raise TypeError('required key(s) ({}) missing from {}'.format(formatted, argname))

    for key, key_hint in get_type_hints(expected_type).items():
        item = value.get(key, _missing)
        if item is not _missing:
            check_type('dict item "{}" for {}'.format(key, argname), item, key_hint, memo)
445 |
446 |
def check_list(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Ensure ``value`` is a list and, for a parametrized ``List[T]``, check every element."""
    if not isinstance(value, list):
        raise TypeError('type of {} must be a list; got {} instead'.
                        format(argname, qualified_name(value)))

    if expected_type is list:
        return

    args = getattr(expected_type, "__args__", None)
    if args is None or args == expected_type.__parameters__:
        # Unparametrized List: nothing more to check
        return

    elem_type = args[0]
    if elem_type is Any:
        return

    for index, item in enumerate(value):
        check_type('{}[{}]'.format(argname, index), item, elem_type, memo)
459 |
460 |
def check_sequence(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Ensure ``value`` is a sequence and, for ``Sequence[T]``, check every element."""
    if not isinstance(value, collections.abc.Sequence):
        raise TypeError('type of {} must be a sequence; got {} instead'.
                        format(argname, qualified_name(value)))

    args = getattr(expected_type, "__args__", None)
    if args is None or args == expected_type.__parameters__:
        # Unparametrized Sequence: nothing more to check
        return

    elem_type = args[0]
    if elem_type is Any:
        return

    for index, item in enumerate(value):
        check_type('{}[{}]'.format(argname, index), item, elem_type, memo)
472 |
473 |
def check_set(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Ensure ``value`` is a set and, for a parametrized ``Set[T]``, check every element."""
    if not isinstance(value, AbstractSet):
        raise TypeError('type of {} must be a set; got {} instead'.
                        format(argname, qualified_name(value)))

    if expected_type is set:
        return

    args = getattr(expected_type, "__args__", None)
    if args is None or args == expected_type.__parameters__:
        # Unparametrized Set: nothing more to check
        return

    elem_type = args[0]
    if elem_type is Any:
        return

    for element in value:
        check_type('elements of {}'.format(argname), element, elem_type, memo)
486 |
487 |
def check_tuple(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Check ``value`` against a tuple annotation.

    Handles three shapes: NamedTuple subclasses (checked field by field),
    variadic ``Tuple[T, ...]``, and fixed-length ``Tuple[T1, T2, ...]``.
    """
    # Specialized check for NamedTuples
    is_named_tuple = False
    if sys.version_info < (3, 8, 0):
        is_named_tuple = hasattr(expected_type, '_field_types')  # deprecated since python 3.8
    else:
        is_named_tuple = hasattr(expected_type, '__annotations__')

    if is_named_tuple:
        if not isinstance(value, expected_type):
            raise TypeError('type of {} must be a named tuple of type {}; got {} instead'.
                            format(argname, qualified_name(expected_type), qualified_name(value)))

        if sys.version_info < (3, 8, 0):
            field_types = expected_type._field_types
        else:
            field_types = expected_type.__annotations__

        # Check each declared field of the named tuple individually
        for name, field_type in field_types.items():
            check_type('{}.{}'.format(argname, name), getattr(value, name), field_type, memo)

        return
    elif not isinstance(value, tuple):
        raise TypeError('type of {} must be a tuple; got {} instead'.
                        format(argname, qualified_name(value)))

    if getattr(expected_type, '__tuple_params__', None):
        # Python 3.5
        use_ellipsis = expected_type.__tuple_use_ellipsis__
        tuple_params = expected_type.__tuple_params__
    elif getattr(expected_type, '__args__', None):
        # Python 3.6+
        use_ellipsis = expected_type.__args__[-1] is Ellipsis
        tuple_params = expected_type.__args__[:-1 if use_ellipsis else None]
    else:
        # Unparametrized Tuple or plain tuple
        return

    if use_ellipsis:
        # Tuple[T, ...]: every element has the same type, any length allowed
        element_type = tuple_params[0]
        for i, element in enumerate(value):
            check_type('{}[{}]'.format(argname, i), element, element_type, memo)
    elif tuple_params == ((),):
        # Tuple[()]: only the empty tuple is accepted
        if value != ():
            raise TypeError('{} is not an empty tuple but one was expected'.format(argname))
    else:
        # Fixed-length tuple: lengths must match, then check positionally
        if len(value) != len(tuple_params):
            raise TypeError('{} has wrong number of elements (expected {}, got {} instead)'
                            .format(argname, len(tuple_params), len(value)))

        for i, (element, element_type) in enumerate(zip(value, tuple_params)):
            check_type('{}[{}]'.format(argname, i), element, element_type, memo)
540 |
541 |
def check_union(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Verify that ``value`` matches at least one member of a ``Union`` type.

    :raises TypeError: if the value matches none of the union's members
    """
    # Python 3.5 stored the member types on __union_params__; 3.6+ uses __args__.
    try:
        union_params = expected_type.__union_params__
    except AttributeError:
        union_params = expected_type.__args__

    # The first member that accepts the value wins.
    for member in union_params:
        try:
            check_type(argname, value, member, memo)
        except TypeError:
            continue
        else:
            return

    typelist = ', '.join(get_type_name(t) for t in union_params)
    raise TypeError('type of {} must be one of ({}); got {} instead'.
                    format(argname, typelist, qualified_name(value)))
560 |
561 |
def check_class(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    """Verify that ``value`` is a class compatible with a ``Type[...]`` annotation.

    Handles bare ``Type``, ``Type[SomeClass]``, ``Type[Any]``, ``Type[T]``
    (TypeVar) and ``Type[Union[...]]`` forms.

    :raises TypeError: if ``value`` is not a class or not a subclass of the
        expected class (or any union member)
    """
    if not isclass(value):
        raise TypeError('type of {} must be a type; got {} instead'.format(
            argname, qualified_name(value)))

    # Needed on Python 3.7+
    if expected_type is Type:
        return

    # Unwrap Type[X] / type[X] to get the expected class X.
    if getattr(expected_type, '__origin__', None) in (Type, type):
        expected_class = expected_type.__args__[0]
    else:
        expected_class = expected_type

    if expected_class is Any:
        return
    elif isinstance(expected_class, TypeVar):
        # Type[T]: delegate to the TypeVar checker in subclass mode.
        check_typevar(argname, value, expected_class, memo, True)
    elif getattr(expected_class, '__origin__', None) is Union:
        # Type[Union[...]]: accept the first union member that matches;
        # the for/else raises only when every member was rejected.
        for arg in expected_class.__args__:
            try:
                check_class(argname, value, arg, memo)
                break
            except TypeError:
                pass
        else:
            formatted_args = ', '.join(get_type_name(arg) for arg in expected_class.__args__)
            raise TypeError('{} must match one of the following: ({}); got {} instead'.format(
                argname, formatted_args, qualified_name(value)
            ))
    elif not issubclass(value, expected_class):
        raise TypeError('{} must be a subclass of {}; got {} instead'.format(
            argname, qualified_name(expected_class), qualified_name(value)))
595 |
596 |
def check_typevar(argname: str, value, typevar: TypeVar, memo: _TypeCheckMemo,
                  subclass_check: bool = False) -> None:
    """Verify ``value`` against a :class:`TypeVar`'s bound or constraints.

    :param subclass_check: when ``True``, ``value`` is itself a class and is
        checked with :func:`issubclass` (used for ``Type[T]``); otherwise the
        value's type is checked
    :raises TypeError: if the value violates the TypeVar's bound or matches
        none of its constraints
    """
    value_type = value if subclass_check else type(value)
    subject = argname if subclass_check else 'type of ' + argname

    # A TypeVar has either a bound or constraints, never both (per typing docs).
    if typevar.__bound__ is not None:
        bound_type = resolve_forwardref(typevar.__bound__, memo)
        if not issubclass(value_type, bound_type):
            raise TypeError(
                '{} must be {} or one of its subclasses; got {} instead'
                .format(subject, qualified_name(bound_type), qualified_name(value_type)))
    elif typevar.__constraints__:
        constraints = [resolve_forwardref(c, memo) for c in typevar.__constraints__]
        # First matching constraint wins; for/else raises if none matched.
        for constraint in constraints:
            try:
                check_type(argname, value, constraint, memo)
            except TypeError:
                pass
            else:
                break
        else:
            formatted_constraints = ', '.join(get_type_name(constraint)
                                              for constraint in constraints)
            raise TypeError('{} must match one of the constraints ({}); got {} instead'
                            .format(subject, formatted_constraints, qualified_name(value_type)))
622 |
623 |
def check_literal(argname: str, value, expected_type, memo: _TypeCheckMemo):
    """Verify that ``value`` equals one of the values of a ``Literal`` type.

    Nested ``Literal`` types are flattened; legal literal values are ints,
    strs, bytes, bools, ``None`` and Enum members.

    :raises TypeError: if the Literal contains an illegal value, or ``value``
        is not among the declared literals
    """
    def flatten(literal):
        # typing's Literal exposes __args__; the typing_extensions instance
        # form (pre-3.7) stores its values on __values__ instead.
        args = getattr(literal, '__args__', None)
        if args is None:
            args = literal.__values__

        collected = []
        for arg in args:
            if isinstance(arg, Literal.__class__) or getattr(arg, '__origin__', None) is Literal:
                # Nested Literal: the first test matches on py3.6 and lower,
                # the second on py3.7+.
                collected.extend(flatten(arg))
            elif isinstance(arg, (int, str, bytes, bool, type(None), Enum)):
                collected.append(arg)
            else:
                raise TypeError('Illegal literal value: {}'.format(arg))

        return collected

    final_args = tuple(flatten(expected_type))
    if value not in final_args:
        raise TypeError('the value of {} must be one of {}; got {} instead'.
                        format(argname, final_args, value))
648 |
649 |
def check_number(argname: str, value, expected_type):
    """Verify a numeric value, honoring PEP 484's implicit numeric tower:
    an int is acceptable where float is expected, and int/float where
    complex is expected. Other expected types pass through unchecked.

    :raises TypeError: if the value is outside the accepted numeric kinds
    """
    if expected_type is complex:
        if not isinstance(value, (complex, float, int)):
            raise TypeError('type of {} must be either complex, float or int; got {} instead'.
                            format(argname, qualified_name(value.__class__)))
    elif expected_type is float:
        if not isinstance(value, (float, int)):
            raise TypeError('type of {} must be either float or int; got {} instead'.
                            format(argname, qualified_name(value.__class__)))
657 |
658 |
def check_io(argname: str, value, expected_type):
    """Verify a stream object against ``typing.IO``, ``TextIO`` or ``BinaryIO``
    by checking it against the corresponding :mod:`io` base classes.

    :raises TypeError: if the value is not the right kind of I/O object
    """
    if expected_type is TextIO:
        acceptable = isinstance(value, TextIOBase)
        message = 'type of {} must be a text based I/O object; got {} instead'
    elif expected_type is BinaryIO:
        acceptable = isinstance(value, (RawIOBase, BufferedIOBase))
        message = 'type of {} must be a binary I/O object; got {} instead'
    else:
        # Plain IO: any stream rooted at IOBase will do.
        acceptable = isinstance(value, IOBase)
        message = 'type of {} must be an I/O object; got {} instead'

    if not acceptable:
        raise TypeError(message.format(argname, qualified_name(value.__class__)))
671 |
672 |
def check_protocol(argname: str, value, expected_type):
    """Verify ``value`` against a runtime-checkable protocol via isinstance;
    non-runtime protocols are not checked at all.

    :raises TypeError: if a runtime protocol's isinstance check fails
    """
    # TODO: implement proper compatibility checking and support non-runtime protocols
    if not getattr(expected_type, '_is_runtime_protocol', False):
        return

    if not isinstance(value, expected_type):
        raise TypeError('type of {} ({}) is not compatible with the {} protocol'.
                        format(argname, type(value).__qualname__, expected_type.__qualname__))
679 |
680 |
# Equality checks are applied to these
# Dispatch table mapping a generic type's origin (and the bare builtin/typing
# aliases) to the specialized checker function that validates values of it.
origin_type_checkers = {
    AbstractSet: check_set,
    Callable: check_callable,
    collections.abc.Callable: check_callable,
    dict: check_dict,
    Dict: check_dict,
    list: check_list,
    List: check_list,
    Sequence: check_sequence,
    collections.abc.Sequence: check_sequence,
    collections.abc.Set: check_set,
    set: check_set,
    Set: check_set,
    tuple: check_tuple,
    Tuple: check_tuple,
    type: check_class,
    Type: check_class,
    Union: check_union
}
# True only on old Pythons where Union supports issubclass() via __union_set_params__.
_subclass_check_unions = hasattr(Union, '__union_set_params__')
# Literal may be None when neither typing nor typing_extensions provides it.
if Literal is not None:
    origin_type_checkers[Literal] = check_literal

# Origin types that identify a sync generator/iterator return annotation,
# used by typechecked() to decide whether to wrap the returned generator.
generator_origin_types = (Generator, collections.abc.Generator,
                          Iterator, collections.abc.Iterator,
                          Iterable, collections.abc.Iterable)
# Same for async generators; extended below with whichever async generator
# types this Python version actually provides.
asyncgen_origin_types = (AsyncIterator, collections.abc.AsyncIterator,
                         AsyncIterable, collections.abc.AsyncIterable)
if AsyncGenerator is not None:
    asyncgen_origin_types += (AsyncGenerator,)
if hasattr(collections.abc, 'AsyncGenerator'):
    asyncgen_origin_types += (collections.abc.AsyncGenerator,)
714 |
715 |
def check_type(argname: str, value, expected_type, memo: Optional[_TypeCheckMemo] = None, *,
               globals: Optional[Dict[str, Any]] = None,
               locals: Optional[Dict[str, Any]] = None) -> None:
    """
    Ensure that ``value`` matches ``expected_type``.

    The types from the :mod:`typing` module do not support :func:`isinstance` or :func:`issubclass`
    so a number of type specific checks are required. This function knows which checker to call
    for which type.

    :param argname: name of the argument to check; used for error messages
    :param value: value to be checked against ``expected_type``
    :param expected_type: a class or generic type instance
    :param globals: dictionary of global variables to use for resolving forward references
        (defaults to the calling frame's globals)
    :param locals: dictionary of local variables to use for resolving forward references
        (defaults to the calling frame's locals)
    :raises TypeError: if there is a type mismatch

    """
    # Any matches everything by definition; Mock objects are exempted so that
    # mocked dependencies don't trip type checks in tests.
    if expected_type is Any or isinstance(value, Mock):
        return

    if expected_type is None:
        # Only happens on < 3.6
        expected_type = type(None)

    # Build a memo from the caller's frame when none was supplied, so forward
    # references can be resolved in the caller's namespace.
    if memo is None:
        frame = sys._getframe(1)
        if globals is None:
            globals = frame.f_globals
        if locals is None:
            locals = frame.f_locals

        memo = _TypeCheckMemo(globals, locals)

    expected_type = resolve_forwardref(expected_type, memo)
    # Parametrized generics dispatch through origin_type_checkers; an origin
    # without a registered checker falls back to checking the origin itself.
    origin_type = getattr(expected_type, '__origin__', None)
    if origin_type is not None:
        checker_func = origin_type_checkers.get(origin_type)
        if checker_func:
            checker_func(argname, value, expected_type, memo)
        else:
            check_type(argname, value, origin_type, memo)
    elif isclass(expected_type):
        # Plain classes still need special handling for tuples, the numeric
        # tower, IO types, TypedDicts and protocols before a bare isinstance.
        if issubclass(expected_type, Tuple):
            check_tuple(argname, value, expected_type, memo)
        elif issubclass(expected_type, (float, complex)):
            check_number(argname, value, expected_type)
        elif _subclass_check_unions and issubclass(expected_type, Union):
            check_union(argname, value, expected_type, memo)
        elif isinstance(expected_type, TypeVar):
            check_typevar(argname, value, expected_type, memo)
        elif issubclass(expected_type, IO):
            check_io(argname, value, expected_type)
        elif is_typeddict(expected_type):
            check_typed_dict(argname, value, expected_type, memo)
        elif getattr(expected_type, '_is_protocol', False):
            check_protocol(argname, value, expected_type)
        else:
            expected_type = (getattr(expected_type, '__extra__', None) or origin_type or
                             expected_type)

            if expected_type is bytes:
                # As per https://github.com/python/typing/issues/552
                if not isinstance(value, (bytearray, bytes, memoryview)):
                    raise TypeError('type of {} must be bytes-like; got {} instead'
                                    .format(argname, qualified_name(value)))
            elif not isinstance(value, expected_type):
                raise TypeError(
                    'type of {} must be {}; got {} instead'.
                    format(argname, qualified_name(expected_type), qualified_name(value)))
    elif isinstance(expected_type, TypeVar):
        # Only happens on < 3.6
        check_typevar(argname, value, expected_type, memo)
    elif isinstance(expected_type, Literal.__class__):
        # Only happens on < 3.7 when using Literal from typing_extensions
        check_literal(argname, value, expected_type, memo)
    elif expected_type.__class__ is NewType:
        # typing.NewType on Python 3.10+
        return check_type(argname, value, expected_type.__supertype__, memo)
    elif (isfunction(expected_type) and
            getattr(expected_type, "__module__", None) == "typing" and
            getattr(expected_type, "__qualname__", None).startswith("NewType.") and
            hasattr(expected_type, "__supertype__")):
        # typing.NewType on Python 3.9 and below
        return check_type(argname, value, expected_type.__supertype__, memo)
803 |
804 |
def check_return_type(retval, memo: Optional[_CallMemo] = None) -> bool:
    """
    Check that the return value is compatible with the return value annotation in the function.

    Intended to be called from inside the checked function (``return
    check_return_type(value) and value`` style) or with an explicit memo built
    by :func:`typechecked`.

    :param retval: the value about to be returned from the call
    :return: ``True``
    :raises TypeError: if there is a type mismatch

    """
    if memo is None:
        # faster than inspect.currentframe(), but not officially
        # supported in all python implementations
        frame = sys._getframe(1)

        try:
            func = find_function(frame)
        except LookupError:
            return True  # This can happen with the Pydev/PyCharm debugger extension installed

        memo = _CallMemo(func, frame.f_locals)

    if 'return' in memo.type_hints:
        if memo.type_hints['return'] is NoReturn:
            raise TypeError('{}() was declared never to return but it did'.format(memo.func_name))

        try:
            check_type('the return value', retval, memo.type_hints['return'], memo)
        except TypeError as exc:   # suppress unnecessarily long tracebacks
            # Allow NotImplemented if this is a binary magic method (__eq__() et al)
            if retval is NotImplemented and memo.type_hints['return'] is bool:
                # This does (and cannot) not check if it's actually a method
                func_name = memo.func_name.rsplit('.', 1)[-1]
                if len(memo.arguments) == 2 and func_name in BINARY_MAGIC_METHODS:
                    return True

            # Re-raise with only the message, dropping the internal call chain.
            raise TypeError(*exc.args) from None

    return True
843 |
844 |
def check_argument_types(memo: Optional[_CallMemo] = None) -> bool:
    """
    Check that the argument values match the annotated types.

    Unless both ``args`` and ``kwargs`` are provided, the information will be retrieved from
    the previous stack frame (ie. from the function that called this).

    :return: ``True``
    :raises TypeError: if there is an argument type mismatch

    """
    if memo is None:
        # faster than inspect.currentframe(), but not officially
        # supported in all python implementations
        frame = sys._getframe(1)

        try:
            func = find_function(frame)
        except LookupError:
            return True  # This can happen with the Pydev/PyCharm debugger extension installed

        memo = _CallMemo(func, frame.f_locals)

    # Validate every annotated parameter that was actually bound in this call;
    # the 'return' annotation is handled by check_return_type() instead.
    for argname, expected_type in memo.type_hints.items():
        if argname != 'return' and argname in memo.arguments:
            value = memo.arguments[argname]
            description = 'argument "{}"'.format(argname)
            try:
                check_type(description, value, expected_type, memo)
            except TypeError as exc:  # suppress unnecessarily long tracebacks
                # Re-raise with only the message, dropping the internal call chain.
                raise TypeError(*exc.args) from None

    return True
878 |
879 |
class TypeCheckedGenerator:
    """Wraps a generator so that every yielded value, every value sent in, and the
    final return value are type checked against the ``Generator[Y, S, R]``
    annotation recorded in ``memo``. Missing type arguments default to ``Any``.
    Unknown attribute access is delegated to the wrapped generator.
    """

    def __init__(self, wrapped: Generator, memo: _CallMemo):
        # Extract (yield, send, return) type args from the return annotation,
        # falling back to Any for any that are absent.
        rtype_args = []
        if hasattr(memo.type_hints['return'], "__args__"):
            rtype_args = memo.type_hints['return'].__args__

        self.__wrapped = wrapped
        self.__memo = memo
        self.__yield_type = rtype_args[0] if rtype_args else Any
        self.__send_type = rtype_args[1] if len(rtype_args) > 1 else Any
        self.__return_type = rtype_args[2] if len(rtype_args) > 2 else Any
        # The first send() must be None (generator priming), so it is not checked.
        self.__initialized = False

    def __iter__(self):
        return self

    def __next__(self):
        return self.send(None)

    def __getattr__(self, name: str) -> Any:
        # Delegate anything we don't define to the underlying generator.
        return getattr(self.__wrapped, name)

    def throw(self, *args):
        return self.__wrapped.throw(*args)

    def close(self):
        self.__wrapped.close()

    def send(self, obj):
        # Skip the check for the initial priming send(None).
        if self.__initialized:
            check_type('value sent to generator', obj, self.__send_type, memo=self.__memo)
        else:
            self.__initialized = True

        try:
            value = self.__wrapped.send(obj)
        except StopIteration as exc:
            # Generator finished: validate its return value, then propagate.
            check_type('return value', exc.value, self.__return_type, memo=self.__memo)
            raise

        check_type('value yielded from generator', value, self.__yield_type, memo=self.__memo)
        return value
922 |
923 |
class TypeCheckedAsyncGenerator:
    """Async counterpart of :class:`TypeCheckedGenerator`: wraps an async
    generator so that yielded and sent values are checked against the
    ``AsyncGenerator[Y, S]`` annotation in ``memo`` (async generators have
    no return value). Unknown attribute access is delegated to the wrapped
    generator.
    """

    def __init__(self, wrapped: AsyncGenerator, memo: _CallMemo):
        rtype_args = memo.type_hints['return'].__args__
        self.__wrapped = wrapped
        self.__memo = memo
        self.__yield_type = rtype_args[0]
        self.__send_type = rtype_args[1] if len(rtype_args) > 1 else Any
        # The first asend() must be None (priming), so it is not checked.
        self.__initialized = False

    def __aiter__(self):
        return self

    def __anext__(self):
        return self.asend(None)

    def __getattr__(self, name: str) -> Any:
        # Delegate anything we don't define to the underlying async generator.
        return getattr(self.__wrapped, name)

    def athrow(self, *args):
        return self.__wrapped.athrow(*args)

    def aclose(self):
        return self.__wrapped.aclose()

    async def asend(self, obj):
        # Skip the check for the initial priming asend(None).
        if self.__initialized:
            check_type('value sent to generator', obj, self.__send_type, memo=self.__memo)
        else:
            self.__initialized = True

        value = await self.__wrapped.asend(obj)
        check_type('value yielded from generator', value, self.__yield_type, memo=self.__memo)
        return value
957 |
958 |
# Typing overloads for typechecked(): the decorator-factory form
# (@typechecked(always=True)) and the bare-decorator form (@typechecked).
@overload
def typechecked(*, always: bool = False) -> Callable[[T_CallableOrType], T_CallableOrType]:
    ...


@overload
def typechecked(func: T_CallableOrType, *, always: bool = False) -> T_CallableOrType:
    ...
967 |
968 |
def typechecked(func=None, *, always=False, _localns: Optional[Dict[str, Any]] = None):
    """
    Perform runtime type checking on the arguments that are passed to the wrapped function.

    The return value is also checked against the return annotation if any.

    If the ``__debug__`` global variable is set to ``False``, no wrapping and therefore no type
    checking is done, unless ``always`` is ``True``.

    This can also be used as a class decorator. This will wrap all type annotated methods,
    including ``@classmethod``, ``@staticmethod``, and ``@property`` decorated methods,
    in the class with the ``@typechecked`` decorator.

    :param func: the function or class to enable type checking for
    :param always: ``True`` to enable type checks even in optimized mode
    :param _localns: internal: local namespace for forward reference resolution
        (defaults to the caller's frame locals)

    """
    # Called as @typechecked(...) with arguments: return the real decorator.
    if func is None:
        return partial(typechecked, always=always, _localns=_localns)

    if not __debug__ and not always:  # pragma: no cover
        return func

    # Class decorator: recursively wrap the class's own annotated callables.
    if isclass(func):
        prefix = func.__qualname__ + '.'
        for key, attr in func.__dict__.items():
            if inspect.isfunction(attr) or inspect.ismethod(attr) or inspect.isclass(attr):
                # The qualname prefix check skips callables merely assigned into
                # the class body but defined elsewhere.
                if attr.__qualname__.startswith(prefix) and getattr(attr, '__annotations__', None):
                    setattr(func, key, typechecked(attr, always=always, _localns=func.__dict__))
            elif isinstance(attr, (classmethod, staticmethod)):
                if getattr(attr.__func__, '__annotations__', None):
                    wrapped = typechecked(attr.__func__, always=always, _localns=func.__dict__)
                    setattr(func, key, type(attr)(wrapped))
            elif isinstance(attr, property):
                # Rebuild the property with each annotated accessor wrapped.
                kwargs = dict(doc=attr.__doc__)
                for name in ("fset", "fget", "fdel"):
                    property_func = kwargs[name] = getattr(attr, name)
                    if property_func is not None and getattr(property_func, '__annotations__', ()):
                        kwargs[name] = typechecked(
                            property_func, always=always, _localns=func.__dict__
                        )

                setattr(func, key, attr.__class__(**kwargs))

        return func

    if not getattr(func, '__annotations__', None):
        warn('no type annotations present -- not typechecking {}'.format(function_name(func)))
        return func

    # Find the frame in which the function was declared, for resolving forward references later
    if _localns is None:
        _localns = sys._getframe(1).f_locals

    # Find either the first Python wrapper or the actual function
    python_func = inspect.unwrap(func, stop=lambda f: hasattr(f, '__code__'))

    if not getattr(python_func, '__code__', None):
        warn('no code associated -- not typechecking {}'.format(function_name(func)))
        return func

    def wrapper(*args, **kwargs):
        memo = _CallMemo(python_func, _localns, args=args, kwargs=kwargs)
        check_argument_types(memo)
        retval = func(*args, **kwargs)
        try:
            check_return_type(retval, memo)
        except TypeError as exc:
            # Re-raise with only the message to keep the traceback short.
            raise TypeError(*exc.args) from None

        # If a generator is returned, wrap it if its yield/send/return types can be checked
        if inspect.isgenerator(retval) or isasyncgen(retval):
            return_type = memo.type_hints.get('return')
            if return_type:
                origin = getattr(return_type, '__origin__', None)
                if origin in generator_origin_types:
                    return TypeCheckedGenerator(retval, memo)
                elif origin is not None and origin in asyncgen_origin_types:
                    return TypeCheckedAsyncGenerator(retval, memo)

        return retval

    async def async_wrapper(*args, **kwargs):
        memo = _CallMemo(python_func, _localns, args=args, kwargs=kwargs)
        check_argument_types(memo)
        retval = await func(*args, **kwargs)
        check_return_type(retval, memo)
        return retval

    # Comparing __code__ objects guards against double-wrapping a function that
    # was already decorated with @typechecked.
    if inspect.iscoroutinefunction(func):
        if python_func.__code__ is not async_wrapper.__code__:
            return wraps(func)(async_wrapper)
    else:
        if python_func.__code__ is not wrapper.__code__:
            return wraps(func)(wrapper)

    # the target callable was already wrapped
    return func
1067 |
1068 |
class TypeWarning(UserWarning):
    """
    A warning that is emitted when a type check fails.

    :ivar str event: ``call`` or ``return``
    :ivar Callable func: the function in which the violation occurred (the called function if event
        is ``call``, or the function where a value of the wrong type was returned from if event is
        ``return``)
    :ivar str error: the error message contained by the caught :class:`TypeError`
    :ivar frame: the frame in which the violation occurred
    """

    __slots__ = ('func', 'event', 'message', 'frame')

    def __init__(self, memo: Optional[_CallMemo], event: str, frame,
                 exception: Union[str, TypeError]):  # pragma: no cover
        self.func = memo.func
        self.event = event
        self.error = str(exception)
        self.frame = frame

        # Describe where the violation happened: for a call, point at the call
        # site (one frame up); for a return, point at the returning function.
        if self.event == 'call':
            caller_frame = self.frame.f_back
            event = 'call to {}() from {}:{}'.format(
                function_name(self.func), caller_frame.f_code.co_filename, caller_frame.f_lineno)
        else:
            event = 'return from {}() at {}:{}'.format(
                function_name(self.func), self.frame.f_code.co_filename, self.frame.f_lineno)

        super().__init__('[{thread_name}] {event}: {self.error}'.format(
            thread_name=threading.current_thread().name, event=event, self=self))

    @property
    def stack(self):
        """Return the stack where the last frame is from the target function."""
        return extract_stack(self.frame)

    def print_stack(self, file: Optional[TextIO] = None, limit: Optional[int] = None) -> None:
        """
        Print the traceback from the stack frame where the target function was run.

        :param file: an open file to print to (prints to stdout if omitted)
        :param limit: the maximum number of stack frames to print

        """
        print_stack(self.frame, limit, file)
1115 |
1116 |
class TypeChecker:
    """
    A type checker that collects type violations by hooking into :func:`sys.setprofile`.

    :param packages: list of top level modules and packages or modules to include for type checking
    :param all_threads: ``True`` to check types in all threads created while the checker is
        running, ``False`` to only check in the current one
    :param forward_refs_policy: how to handle unresolvable forward references in annotations

    .. deprecated:: 2.6
       Use :func:`~.importhook.install_import_hook` instead. This class will be removed in v3.0.
    """

    def __init__(self, packages: Union[str, Sequence[str]], *, all_threads: bool = True,
                 forward_refs_policy: ForwardRefPolicy = ForwardRefPolicy.ERROR):
        assert check_argument_types()
        warn('TypeChecker has been deprecated and will be removed in v3.0. '
             'Use install_import_hook() or the pytest plugin instead.', DeprecationWarning)
        self.all_threads = all_threads
        self.annotation_policy = forward_refs_policy
        # Per-frame call memos, created on 'call' events and consumed on 'return'.
        self._call_memos = {}  # type: Dict[Any, _CallMemo]
        # Previously installed profiler hooks, restored on stop().
        self._previous_profiler = None
        self._previous_thread_profiler = None
        self._active = False

        if isinstance(packages, str):
            self._packages = (packages,)
        else:
            self._packages = tuple(packages)

    @property
    def active(self) -> bool:
        """Return ``True`` if currently collecting type violations."""
        return self._active

    def should_check_type(self, func: Callable) -> bool:
        # Decide whether a function is eligible for checking at all.
        if not func.__annotations__:
            # No point in checking if there are no type hints
            return False
        elif isasyncgenfunction(func):
            # Async generators cannot be supported because the return arg is of an opaque builtin
            # type (async_generator_wrapped_value)
            return False
        else:
            # Check types if the module matches any of the package prefixes
            return any(func.__module__ == package or func.__module__.startswith(package + '.')
                       for package in self._packages)

    def start(self):
        """Begin collecting violations by installing this object as the profiler."""
        if self._active:
            raise RuntimeError('type checker already running')

        self._active = True

        # Install this instance as the current profiler
        self._previous_profiler = sys.getprofile()
        sys.setprofile(self)

        # If requested, set this instance as the default profiler for all future threads
        # (does not affect existing threads)
        if self.all_threads:
            self._previous_thread_profiler = threading._profile_hook
            threading.setprofile(self)

    def stop(self):
        """Stop collecting violations and restore the previous profiler hooks."""
        if self._active:
            if sys.getprofile() is self:
                sys.setprofile(self._previous_profiler)
            else:  # pragma: no cover
                warn('the system profiling hook has changed unexpectedly')

            if self.all_threads:
                if threading._profile_hook is self:
                    threading.setprofile(self._previous_thread_profiler)
                else:  # pragma: no cover
                    warn('the threading profiling hook has changed unexpectedly')

            self._active = False

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()

    def __call__(self, frame, event: str, arg) -> None:  # pragma: no cover
        # Profiler callback: invoked by the interpreter on 'call'/'return' events.
        if not self._active:
            # This happens if all_threads was enabled and a thread was created when the checker was
            # running but was then stopped. The thread's profiler callback can't be reset any other
            # way but this.
            sys.setprofile(self._previous_thread_profiler)
            return

        # If an actual profiler is running, don't include the type checking times in its results
        if event == 'call':
            try:
                func = find_function(frame)
            except Exception:
                func = None

            if func is not None and self.should_check_type(func):
                memo = self._call_memos[frame] = _CallMemo(
                    func, frame.f_locals, forward_refs_policy=self.annotation_policy)
                if memo.is_generator:
                    return_type_hint = memo.type_hints['return']
                    if return_type_hint is not None:
                        origin = getattr(return_type_hint, '__origin__', None)
                        if origin in generator_origin_types:
                            # Check the types of the yielded values
                            memo.type_hints['return'] = return_type_hint.__args__[0]
                else:
                    try:
                        check_argument_types(memo)
                    except TypeError as exc:
                        # Violations are reported as warnings, not raised.
                        warn(TypeWarning(memo, event, frame, exc))

            if self._previous_profiler is not None:
                self._previous_profiler(frame, event, arg)
        elif event == 'return':
            if self._previous_profiler is not None:
                self._previous_profiler(frame, event, arg)

            if arg is None:
                # a None return value might mean an exception is being raised but we have no way of
                # checking
                return

            memo = self._call_memos.get(frame)
            if memo is not None:
                try:
                    if memo.is_generator:
                        check_type('yielded value', arg, memo.type_hints['return'], memo)
                    else:
                        check_return_type(arg, memo)
                except TypeError as exc:
                    warn(TypeWarning(memo, event, frame, exc))

                # Generator frames return repeatedly (once per yield), so their
                # memo is kept until the generator is exhausted.
                if not memo.is_generator:
                    del self._call_memos[frame]
        elif self._previous_profiler is not None:
            self._previous_profiler(frame, event, arg)
1259 |
--------------------------------------------------------------------------------