├── tests ├── __init__.py ├── test_indexed_dataset.py ├── test_dataset.py └── test_type_validation.py ├── strictly_typed_pandas ├── py.typed ├── _vendor │ ├── __init__.py │ ├── typeguard │ │ ├── py.typed │ │ ├── pytest_plugin.py │ │ ├── importhook.py │ │ └── __init__.py │ └── typeguard-2.13.3.dist-info │ │ ├── REQUESTED │ │ ├── INSTALLER │ │ ├── top_level.txt │ │ ├── entry_points.txt │ │ ├── WHEEL │ │ ├── LICENSE │ │ ├── RECORD │ │ └── METADATA ├── __init__.py ├── create_empty_dataframe.py ├── immutable.py ├── pandas_types.py ├── pytest_plugin.py ├── validate_schema.py ├── typeguard.py └── dataset.py ├── requirements.txt ├── docs ├── requirements.txt ├── source │ ├── api.rst │ ├── stubs │ │ ├── strictly_typed_pandas.DataSet.rst │ │ └── strictly_typed_pandas.IndexedDataSet.rst │ ├── index.rst │ ├── contributing.rst │ ├── conf.py │ ├── typeguard.rst │ ├── advanced.ipynb │ ├── deepdive_into_dtypes.ipynb │ └── getting_started.ipynb ├── Makefile └── make.bat ├── vendorize.toml ├── tox.ini ├── .gitignore ├── requirements-dev.txt ├── .readthedocs.yaml ├── pyproject.toml ├── .github ├── dependabot.yml └── workflows │ ├── autoapprove.yml │ ├── publish.yml │ └── build.yml ├── .pre-commit-config.yaml ├── LICENSE ├── setup.py └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /strictly_typed_pandas/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard/py.typed: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/REQUESTED: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/INSTALLER: -------------------------------------------------------------------------------- 1 | pip 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy<=2.2.4 2 | pandas<=2.2.3 3 | pandas-stubs<=2.2.3.250308 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==8.2.3 2 | sphinx_rtd_theme 3 | nbsphinx 4 | jupyter 5 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/top_level.txt: -------------------------------------------------------------------------------- 1 | typeguard 2 | -------------------------------------------------------------------------------- /vendorize.toml: -------------------------------------------------------------------------------- 1 | target = "strictly_typed_pandas/_vendor" 2 | packages = [ 3 | "typeguard==2.13.3", 4 | ] 5 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [pytest11] 2 | typeguard = typeguard.pytest_plugin 3 | 4 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 
| 3 | # Hard line limit 4 | max-line-length = 120 5 | 6 | extend-exclude = strictly_typed_pandas/_vendor/ 7 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.37.0) 3 | Root-Is-Purelib: true 4 | Tag: py3-none-any 5 | 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .mypy_cache 2 | .pytest_chache 3 | **/__pycache__ 4 | .vscode 5 | *.pyc 6 | *.ipynb_checkpoints* 7 | .DS_Store 8 | .coverage 9 | .cache 10 | *.egg* 11 | docs/build 12 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | API documentation 3 | ================= 4 | 5 | .. toctree:: 6 | 7 | stubs/strictly_typed_pandas.DataSet 8 | stubs/strictly_typed_pandas.IndexedDataSet 9 | -------------------------------------------------------------------------------- /strictly_typed_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | from strictly_typed_pandas.dataset import DataSet, IndexedDataSet # isort: skip 2 | import strictly_typed_pandas.typeguard # noqa: F401 3 | 4 | __all__ = ["DataSet", "IndexedDataSet"] 5 | -------------------------------------------------------------------------------- /docs/source/stubs/strictly_typed_pandas.DataSet.rst: -------------------------------------------------------------------------------- 1 | strictly\_typed\_pandas.DataSet 2 | =============================== 3 | 4 | .. currentmodule:: strictly_typed_pandas 5 | 6 | .. autoclass:: DataSet 7 | 8 | .. automethod:: __init__ 9 | .. automethod:: to_dataframe 10 | .. 
automethod:: to_frame 11 | -------------------------------------------------------------------------------- /docs/source/stubs/strictly_typed_pandas.IndexedDataSet.rst: -------------------------------------------------------------------------------- 1 | strictly\_typed\_pandas.IndexedDataSet 2 | ====================================== 3 | 4 | .. currentmodule:: strictly_typed_pandas 5 | 6 | .. autoclass:: IndexedDataSet 7 | 8 | .. automethod:: __init__ 9 | .. automethod:: to_dataframe 10 | .. automethod:: to_frame 11 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | mypy<=1.17.1 2 | flake8<=7.3.0 3 | black[jupyter]<=25.1.0 4 | docformatter<=1.7.7 5 | isort<=6.0.1 6 | coverage<=7.10.6 7 | pytest<=8.4.1 8 | nbconvert==7.16.6 9 | jupyter==1.1.1 10 | sphinx<=8.2.3 11 | sphinx_rtd_theme==3.0.2 12 | nbsphinx==0.9.7 13 | pre-commit<=4.3.0 14 | types-setuptools<=80.9.0.20250822 15 | pyarrow<=21.0.0 16 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | 13 | sphinx: 14 | configuration: docs/source/conf.py 15 | 16 | python: 17 | install: 18 | - method: pip 19 | path: . 20 | - requirements: requirements-dev.txt -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Strictly Typed Pandas documentation master file, created by 2 | sphinx-quickstart on Wed Jul 14 17:15:09 2021. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: ../../README.rst 7 | 8 | .. toctree:: 9 | :hidden: 10 | 11 | getting_started 12 | advanced 13 | deepdive_into_dtypes 14 | typeguard 15 | api 16 | contributing 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | force-exclude = ["strictly_typed_pandas/_vendor/*"] 4 | 5 | [tool.isort] 6 | profile = "black" 7 | line_length = 100 8 | extend_skip_glob = ["strictly_typed_pandas/_vendor/*"] 9 | 10 | [tool.mypy] 11 | exclude = ['strictly_typed_pandas/_vendor/.*'] 12 | 13 | [[tool.mypy.overrides]] 14 | module="strictly_typed_pandas._vendor.*" 15 | follow_imports = 'skip' 16 | 17 | [[tool.mypy.overrides]] 18 | module="typeguard" 19 | ignore_missing_imports = true 20 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/autoapprove.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot auto-approve 2 | on: pull_request 3 | 4 | permissions: 5 | pull-requests: write 6 | 7 | jobs: 8 | dependabot: 9 | runs-on: ubuntu-latest 10 | if: ${{ github.actor == 'dependabot[bot]' }} 11 | steps: 12 | - name: Dependabot metadata 13 | id: metadata 14 | uses: dependabot/fetch-metadata@v1 15 | with: 16 | github-token: "${{ secrets.AUTOAPPROVE_TOKEN }}" 17 | - name: Approve a PR 18 | run: gh pr review --approve "$PR_URL" 19 | env: 20 | PR_URL: ${{github.event.pull_request.html_url}} 21 | GITHUB_TOKEN: ${{secrets.AUTOAPPROVE_TOKEN}} 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^strictly_typed_pandas/_vendor' 2 | repos: 3 | - repo: local 4 | hooks: 5 | - id: flake8 6 | name: flake8 7 | entry: flake8 8 | language: system 9 | types: [python] 10 | - id: mypy 11 | name: mypy 12 | entry: mypy 13 | language: system 14 | types: [python] 15 | - id: black 16 | name: black 17 | description: "Black: The uncompromising Python code formatter" 18 | entry: black 19 | language: system 20 | require_serial: true 21 | files: \.(py|ipynb)$ 22 | - id: isort 23 | name: isort 24 | entry: isort 25 | language: system 26 | types: [python] 27 | - id: pytest 28 | name: pytest 29 | entry: coverage run -m pytest --typeguard-packages=tests 30 | language: system 31 | types: [python] 32 | pass_filenames: false 33 | -------------------------------------------------------------------------------- /docs/make.bat: 
-------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | We welcome contributions! To set up your development environment, we recommend using pyenv. You can find more on how to install ``pyenv`` and ``pyenv-virtualen`` here: 6 | 7 | * https://github.com/pyenv/pyenv 8 | * https://github.com/pyenv/pyenv-virtualenv 9 | 10 | To set up the environment, run: 11 | 12 | .. code-block:: bash 13 | 14 | pyenv install 3.11 15 | pyenv virtualenv 3.11 strictly_typed_pandas 16 | pyenv activate strictly_typed_pandas 17 | pip install -r requirements.txt 18 | pip install -r requirements-dev.txt 19 | 20 | For a list of currently supported Python versions, we refer to ``.github/workflows/build.yml``. 
21 | 22 | --------------- 23 | Pre-commit hook 24 | --------------- 25 | We use ``pre-commit`` to run a number of checks on the code before it is committed. To install the pre-commit hook, run: 26 | 27 | .. code-block:: bash 28 | 29 | pre-commit install 30 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard/pytest_plugin.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from ..typeguard.importhook import install_import_hook 4 | 5 | 6 | def pytest_addoption(parser): 7 | group = parser.getgroup('typeguard') 8 | group.addoption('--typeguard-packages', action='store', 9 | help='comma separated name list of packages and modules to instrument for ' 10 | 'type checking') 11 | 12 | 13 | def pytest_configure(config): 14 | value = config.getoption("typeguard_packages") 15 | if not value: 16 | return 17 | 18 | packages = [pkg.strip() for pkg in value.split(",")] 19 | 20 | already_imported_packages = sorted( 21 | package for package in packages if package in sys.modules 22 | ) 23 | if already_imported_packages: 24 | message = ( 25 | "typeguard cannot check these packages because they " 26 | "are already imported: {}" 27 | ) 28 | raise RuntimeError(message.format(", ".join(already_imported_packages))) 29 | 30 | install_import_hook(packages=packages) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nanne Aben 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to 
whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/LICENSE: -------------------------------------------------------------------------------- 1 | This is the MIT license: http://www.opensource.org/licenses/mit-license.php 2 | 3 | Copyright (c) Alex Grönholm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 9 | to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or 12 | substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/RECORD: -------------------------------------------------------------------------------- 1 | typeguard-2.13.3.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 2 | typeguard-2.13.3.dist-info/LICENSE,sha256=YWP3mH37ONa8MgzitwsvArhivEESZRbVUu8c1DJH51g,1130 3 | typeguard-2.13.3.dist-info/METADATA,sha256=rrszCBWMnpJt2j9D8QqPgS1kQUFdTu5exwvCVkB0cIY,3591 4 | typeguard-2.13.3.dist-info/RECORD,, 5 | typeguard-2.13.3.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 6 | typeguard-2.13.3.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92 7 | typeguard-2.13.3.dist-info/entry_points.txt,sha256=uBVT0tmiav9LH4v6cq0GIl7TYz07TqFHniXP6zCfHbY,48 8 | typeguard-2.13.3.dist-info/top_level.txt,sha256=4z28AhuDodwRS_c1J_l8H51t5QuwfTseskYzlxp6grs,10 9 | typeguard/__init__.py,sha256=7LyyccpyAXgyd3WO2j1GXCWDdyasGjmA9v9DeydHR70,49186 10 | typeguard/__pycache__/__init__.cpython-311.pyc,, 11 | typeguard/__pycache__/importhook.cpython-311.pyc,, 12 | typeguard/__pycache__/pytest_plugin.cpython-311.pyc,, 13 | typeguard/importhook.py,sha256=nv3-M2SZ4cHxJBakslR_7w73YpT6Lit67txi7H7-xGM,5601 14 | typeguard/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 15 | typeguard/pytest_plugin.py,sha256=T1wfao9RMZ-fQ31bA_gmkoOtHEmXk3o1s0Nty5ZrFnw,917 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | def get_requirements(): 5 | with 
open("requirements.txt") as f: 6 | return f.read().splitlines() 7 | 8 | 9 | def get_long_description(): 10 | with open("README.rst", encoding="utf-8") as f: 11 | return f.read() 12 | 13 | 14 | setup( 15 | name="strictly_typed_pandas", 16 | url="https://github.com/nanne-aben/strictly_typed_pandas", 17 | license="MIT", 18 | author="Nanne Aben", 19 | author_email="nanne.aben@gmail.com", 20 | description="Static type checking of pandas DataFrames", 21 | keywords="typing type checking pandas mypy linting", 22 | long_description=get_long_description(), 23 | long_description_content_type="text/x-rst", 24 | packages=find_packages(include=["strictly_typed_pandas", "strictly_typed_pandas.*"]), 25 | install_requires=get_requirements(), 26 | python_requires=">=3.8.0", 27 | classifiers=["Typing :: Typed"], 28 | setuptools_git_versioning={"enabled": True}, 29 | setup_requires=["setuptools-git-versioning"], 30 | package_data={"strictly_typed_pandas": ["py.typed"]}, 31 | entry_points={ 32 | "pytest11": [ 33 | "strictly_typed_pandas = strictly_typed_pandas.pytest_plugin", 34 | ], 35 | }, 36 | ) 37 | -------------------------------------------------------------------------------- /strictly_typed_pandas/create_empty_dataframe.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict 2 | 3 | import numpy as np # type: ignore 4 | import pandas as pd 5 | from pandas.api.extensions import ExtensionDtype 6 | 7 | from strictly_typed_pandas.pandas_types import StringDtype 8 | 9 | 10 | def create_empty_dataframe(schema: Dict[str, Any]) -> pd.DataFrame: 11 | res = dict() 12 | for name, dtype in schema.items(): 13 | if dtype == Any: 14 | dtype = object 15 | 16 | if isinstance(dtype, Callable) and isinstance(dtype(), ExtensionDtype): # type: ignore 17 | dtype = dtype.name 18 | 19 | if isinstance(dtype, ExtensionDtype): 20 | dtype = dtype.name 21 | 22 | if dtype == np.datetime64: 23 | dtype = "datetime64[ns]" 24 | 25 | if 
dtype == np.timedelta64: 26 | dtype = "timedelta64[ns]" 27 | 28 | if dtype == str: 29 | dtype = StringDtype.name 30 | 31 | res[name] = pd.Series([], dtype=dtype) 32 | 33 | return pd.DataFrame(res) 34 | 35 | 36 | def create_empty_indexed_dataframe( 37 | index_schema: Dict[str, Any], data_schema: Dict[str, Any] 38 | ) -> pd.DataFrame: 39 | df_index = create_empty_dataframe(index_schema) 40 | df_data = create_empty_dataframe(data_schema) 41 | return pd.concat([df_index, df_data], axis=1).set_index(list(index_schema.keys())) 42 | -------------------------------------------------------------------------------- /strictly_typed_pandas/immutable.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Any, Callable, Optional 3 | 4 | from pandas.core.indexing import _iLocIndexer, _LocIndexer 5 | 6 | immutable_error_msg = ( 7 | "To ensure that the DataSet adheres to its schema, you cannot perform inplace modifications. You can either use " 8 | + "dataset.to_dataframe() to cast the DataSet to a DataFrame, or use operations that return a DataFrame, e.g. " 9 | + "df = df.assign(...)." 
10 | ) 11 | 12 | 13 | class _ImmutableiLocIndexer(_iLocIndexer): 14 | def __setitem__(self, key: Any, value: Any) -> None: 15 | raise NotImplementedError(immutable_error_msg) 16 | 17 | 18 | class _ImmutableLocIndexer(_LocIndexer): 19 | def __setitem__(self, key: Any, value: Any) -> None: 20 | raise NotImplementedError(immutable_error_msg) 21 | 22 | 23 | def _get_index_of_inplace_in_args(call: Callable) -> Optional[int]: 24 | signature = inspect.signature(call) 25 | parameters = signature.parameters.keys() 26 | 27 | if "inplace" in parameters: 28 | return [i for i, v in enumerate(parameters) if v == "inplace"][0] 29 | else: 30 | return None 31 | 32 | 33 | def inplace_argument_interceptor(call: Callable) -> Callable: 34 | inplace_ind = _get_index_of_inplace_in_args(call) 35 | 36 | def func(*args, **kwargs): 37 | if inplace_ind is not None and inplace_ind < len(args) and args[inplace_ind]: 38 | raise NotImplementedError(immutable_error_msg) 39 | 40 | if "inplace" in kwargs and kwargs["inplace"]: 41 | raise NotImplementedError(immutable_error_msg) 42 | 43 | return call(*args, **kwargs) 44 | 45 | return func 46 | -------------------------------------------------------------------------------- /strictly_typed_pandas/pandas_types.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | # for backward compatability with pandas 0.23 - 0.25 5 | class BackwardCompatibility(pd.api.extensions.ExtensionDtype): 6 | name = "object" 7 | 8 | def __init__(self, *args, **kwargs) -> None: 9 | pass # pragma: no cover 10 | 11 | 12 | if hasattr(pd, "StringDtype"): 13 | StringDtype = pd.StringDtype 14 | else: # pragma: no cover 15 | 16 | class StringDtype(BackwardCompatibility): # type: ignore 17 | pass 18 | 19 | 20 | if hasattr(pd, "DatetimeTZDtype"): 21 | DatetimeTZDtype = pd.DatetimeTZDtype 22 | else: # pragma: no cover 23 | 24 | class DatetimeTZDtype(BackwardCompatibility): # type: ignore 25 | pass 26 | 27 | 28 | if 
hasattr(pd, "CategoricalDtype"): 29 | CategoricalDtype = pd.CategoricalDtype 30 | else: # pragma: no cover 31 | 32 | class CategoricalDtype(BackwardCompatibility): # type: ignore 33 | pass 34 | 35 | 36 | if hasattr(pd, "PeriodDtype"): 37 | PeriodDtype = pd.PeriodDtype 38 | else: # pragma: no cover 39 | 40 | class PeriodDtype(BackwardCompatibility): # type: ignore 41 | pass 42 | 43 | 44 | if hasattr(pd, "SparseDtype"): 45 | SparseDtype = pd.SparseDtype 46 | else: # pragma: no cover 47 | 48 | class SparseDtype(BackwardCompatibility): # type: ignore 49 | pass 50 | 51 | 52 | if hasattr(pd, "IntervalDtype"): 53 | IntervalDtype = pd.IntervalDtype 54 | else: # pragma: no cover 55 | 56 | class IntervalDtype(BackwardCompatibility): # type: ignore 57 | pass 58 | 59 | 60 | if hasattr(pd, "Int64Dtype"): 61 | Int64Dtype = pd.Int64Dtype 62 | else: # pragma: no cover 63 | 64 | class Int64Dtype(BackwardCompatibility): # type: ignore 65 | pass 66 | 67 | 68 | if hasattr(pd, "BooleanDtype"): 69 | BooleanDtype = pd.BooleanDtype 70 | else: # pragma: no cover 71 | 72 | class BooleanDtype(BackwardCompatibility): # type: ignore 73 | pass 74 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [pull_request, workflow_dispatch] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -r requirements.txt 23 | pip install -r requirements-dev.txt 24 | pip install -e . 25 | - name: Lint with isort 26 | run: isort --check . 
27 | - name: Lint with black 28 | run: black --check . 29 | - name: Lint with docformatter 30 | run: docformatter --black -c **/*.py 31 | - name: Lint with flake8 32 | run: flake8 33 | - name: Lint with mypy 34 | run: mypy . 35 | - name: Test with pytest 36 | run: | 37 | coverage run -m pytest --stp-typeguard-packages=tests 38 | coverage report -m 39 | - name: Run notebooks 40 | run: | 41 | for FILE in docs/source/*.ipynb; do 42 | BASE=$(basename $FILE) 43 | cp $FILE . 44 | jupyter nbconvert --to notebook $BASE --execute 45 | done 46 | - name: Run pytest with --typeguard-packages for backwards compatibility 47 | run: | 48 | pytest --typeguard-packages=tests 49 | - name: Run pytest with compatible typeguard installed 50 | run: | 51 | pip install typeguard==2.13.2 52 | pytest --typeguard-packages=tests 53 | pytest --stp-typeguard-packages=tests 54 | - name: Run pytest with incompatible typeguard installed 55 | run: | 56 | pip install typeguard==4.1.5 57 | pytest --stp-typeguard-packages=tests 58 | -------------------------------------------------------------------------------- /tests/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np # type: ignore 2 | import pandas as pd 3 | import pytest 4 | 5 | from strictly_typed_pandas import IndexedDataSet 6 | from strictly_typed_pandas.pandas_types import StringDtype 7 | 8 | 9 | class IndexSchema: 10 | a: int 11 | b: str 12 | 13 | 14 | class DataSchema: 15 | c: int 16 | d: str 17 | 18 | 19 | class AlternativeIndexSchema: 20 | a: int 21 | 22 | 23 | class AlternativeDataSchema: 24 | f: int 25 | 26 | 27 | def test_empty_indexed_dataset() -> None: 28 | df = IndexedDataSet[IndexSchema, DataSchema]() 29 | 30 | assert df.shape[0] == 0 31 | assert np.all(df.index.names == ["a", "b"]) 32 | assert np.all(df.columns == ["c", "d"]) 33 | 34 | assert df.index.get_level_values(0).dtype == int 35 | assert df.index.get_level_values(1).dtype == object or isinstance( 
36 | df.index.get_level_values(1).dtype, StringDtype 37 | ) 38 | 39 | assert df.dtypes.iloc[0] == int 40 | assert df.dtypes.iloc[1] == object or isinstance(df.dtypes.iloc[1], StringDtype) 41 | 42 | 43 | def test_indexed_dataset() -> None: 44 | ( 45 | pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1, 2, 3], "d": ["a", "b", "c"]}) 46 | .set_index(["a", "b"]) 47 | .pipe(IndexedDataSet[IndexSchema, DataSchema]) 48 | ) 49 | 50 | 51 | def test_missing_index(): 52 | with pytest.raises(TypeError, match="No named columns in index"): 53 | pd.DataFrame({"a": [1, 2, 3]}).pipe(IndexedDataSet[IndexSchema, DataSchema]) 54 | 55 | 56 | def test_overlapping_columns(): 57 | with pytest.raises(TypeError): 58 | IndexedDataSet[IndexSchema, IndexSchema]() 59 | 60 | 61 | def foo(df: IndexedDataSet[IndexSchema, DataSchema]) -> IndexedDataSet[IndexSchema, DataSchema]: 62 | return df 63 | 64 | 65 | def test_typeguard_indexed_dataset() -> None: 66 | foo(IndexedDataSet[IndexSchema, DataSchema]()) 67 | 68 | with pytest.raises(TypeError): 69 | foo(IndexedDataSet[AlternativeIndexSchema, AlternativeDataSchema]()) # type: ignore 70 | 71 | with pytest.raises(TypeError): 72 | foo(pd.DataFrame()) # type: ignore 73 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | # This is a configuration file we shouldn't be checking it. 18 | # mypy: ignore-errors 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "Strictly Typed Pandas" 23 | copyright = "2021, Nanne Aben" 24 | author = "Nanne Aben" 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = ["sphinx.ext.autodoc", "sphinx_rtd_theme", "nbsphinx"] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ["_templates"] 36 | 37 | # List of patterns, relative to source directory, that match files and 38 | # directories to ignore when looking for source files. 39 | # This pattern also affects html_static_path and html_extra_path. 40 | exclude_patterns = [] 41 | 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = "sphinx_rtd_theme" 49 | 50 | # Add any paths that contain custom static files (such as style sheets) here, 51 | # relative to this directory. They are copied after the builtin static files, 52 | # so a file named "default.css" will overwrite the builtin "default.css". 
53 | html_static_path = ["_static"] 54 | -------------------------------------------------------------------------------- /strictly_typed_pandas/pytest_plugin.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import List 3 | 4 | from ._vendor.typeguard.importhook import install_import_hook 5 | 6 | try: 7 | import typeguard 8 | 9 | del typeguard 10 | TYPEGUARD_INSTALLED = True 11 | except ImportError: 12 | TYPEGUARD_INSTALLED = False 13 | 14 | 15 | def pytest_addoption(parser): 16 | group = parser.getgroup("stp_typeguard") 17 | group.addoption( 18 | "--stp-typeguard-packages", 19 | action="store", 20 | help=( 21 | "comma separated name list of packages and modules to " 22 | "instrument for type checking by strictly typed pandas" 23 | ), 24 | ) 25 | if not TYPEGUARD_INSTALLED: 26 | group = parser.getgroup("typeguard") 27 | group.addoption( 28 | "--typeguard-packages", 29 | action="store", 30 | help=( 31 | "comma separated name list of packages and modules to " 32 | "instrument for type checking" 33 | ), 34 | ) 35 | 36 | 37 | def _parse_packages(val: str) -> List[str]: 38 | if val is None or not val.strip(): 39 | return [] 40 | return [pkg.strip() for pkg in val.split(",")] 41 | 42 | 43 | def pytest_configure(config): 44 | packages = _parse_packages(config.getoption("stp_typeguard_packages")) 45 | typeguard_packages = _parse_packages(config.getoption("typeguard_packages")) 46 | 47 | packages_in_both = set(packages) & set(typeguard_packages) 48 | if packages_in_both: 49 | raise RuntimeError( 50 | "If you are going to use both --stp-typeguard-packages " 51 | "and --typeguard-packages at the same time, " 52 | "please don't list the same package in both options: " 53 | f"{', '.join(packages_in_both)}" 54 | ) 55 | 56 | if not TYPEGUARD_INSTALLED: 57 | packages.extend(typeguard_packages) 58 | 59 | if not packages: 60 | return 61 | 62 | already_imported_packages = sorted(p for p in packages if p in 
sys.modules) 63 | if already_imported_packages: 64 | message = ( 65 | "strictly_typed_pandas cannot check these packages because " 66 | "they are already imported: {}" 67 | ) 68 | raise RuntimeError(message.format(", ".join(already_imported_packages))) 69 | 70 | install_import_hook(packages=packages) 71 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================================================ 2 | Strictly Typed Pandas: static type checking of pandas DataFrames 3 | ================================================================ 4 | 5 | I love Pandas! But in production code I’m always a bit wary when I see: 6 | 7 | .. code-block:: python 8 | 9 | import pandas as pd 10 | 11 | def foo(df: pd.DataFrame) -> pd.DataFrame: 12 | # do stuff 13 | return df 14 | 15 | Because… How do I know which columns are supposed to be in `df`? 16 | 17 | Using `strictly_typed_pandas`, we can be more explicit about what these data should look like. 18 | 19 | .. code-block:: python 20 | 21 | from strictly_typed_pandas import DataSet 22 | 23 | class Schema: 24 | id: int 25 | name: str 26 | 27 | def foo(df: DataSet[Schema]) -> DataSet[Schema]: 28 | # do stuff 29 | return df 30 | 31 | Where `DataSet`: 32 | * is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`. 33 | * validates whether the data adheres to the provided schema upon its initialization. 34 | * is immutable, so its schema cannot be changed using inplace modifications. 35 | 36 | The `DataSet[Schema]` annotations are compatible with: 37 | * `mypy` for type checking during linting-time (i.e. while you write your code). 38 | * `typeguard` (`_. 55 | 56 | FAQ 57 | === 58 | 59 | | **Do you know of something similar for pyspark?** 60 | | Yes! Check out our package `typedspark `_. 
61 | | 62 | | **Why use Python if you want static typing?** 63 | | There are just so many good packages for data science in Python. Rather than sacrificing all of that by moving to a different language, I'd like to make the Pythonverse a little bit better. 64 | | 65 | | **I found a bug! What should I do?** 66 | | Great! Contact me and I'll look into it. 67 | | 68 | | **I have a great idea to improve strictly_typed_pandas! How can we make this work?** 69 | | Awesome, drop me a line! 70 | -------------------------------------------------------------------------------- /strictly_typed_pandas/validate_schema.py: -------------------------------------------------------------------------------- 1 | from typing import Any, ClassVar, Dict, Set, get_origin 2 | 3 | import numpy as np # type: ignore 4 | from pandas.api.extensions import ExtensionDtype 5 | from pandas.core.dtypes.common import is_dtype_equal 6 | 7 | from strictly_typed_pandas.pandas_types import StringDtype 8 | 9 | 10 | def check_for_duplicate_columns(names_index: Set[str], names_data: Set[str]) -> None: 11 | intersection = names_index & names_data 12 | if len(intersection) > 0: 13 | msg = "The following column is present in both the index schema and the data schema: {}" 14 | raise TypeError(msg.format(intersection)) 15 | 16 | 17 | def remove_classvars(schema_expected: Dict[str, Any]) -> Dict[str, Any]: 18 | return { 19 | key: value 20 | for key, value in schema_expected.items() if get_origin(value) is not ClassVar 21 | } 22 | 23 | 24 | def validate_schema(schema_expected: Dict[str, Any], schema_observed: Dict[str, Any]): 25 | schema_expected = remove_classvars(schema_expected) 26 | _check_names(set(schema_expected.keys()), set(schema_observed.keys())) 27 | _check_dtypes(schema_expected, schema_observed) 28 | 29 | 30 | def _check_names(names_expected: Set[str], names_observed: Set[str]) -> None: 31 | diff = names_observed - names_expected 32 | if diff: 33 | raise TypeError( 34 | "Data contains the following 
columns not present in schema: {diff}".format(diff=diff) 35 | ) 36 | 37 | diff = names_expected - names_observed 38 | if diff: 39 | raise TypeError( 40 | "Schema contains the following columns not present in data: {diff}".format(diff=diff) 41 | ) 42 | 43 | 44 | def _check_dtypes(schema_expected: Dict[str, Any], schema_observed: Dict[str, Any]) -> None: 45 | for name, dtype_expected in schema_expected.items(): 46 | dtype_observed = schema_observed[name] 47 | 48 | if dtype_expected in [object, np.object_, Any]: 49 | continue 50 | 51 | if dtype_expected == str and dtype_observed == object: 52 | continue # pandas stores strings as objects by default 53 | 54 | if dtype_expected == str and isinstance(dtype_observed, StringDtype): 55 | continue # since np.int64 == int, I'd say we should also support pd.StringDtype == str 56 | 57 | if isinstance(dtype_observed, np.dtype) and dtype_observed != np.object_: 58 | if dtype_observed == dtype_expected or np.issubdtype(dtype_observed, dtype_expected): 59 | continue 60 | 61 | if isinstance(dtype_expected, ExtensionDtype) and is_dtype_equal( 62 | dtype_expected, dtype_observed 63 | ): 64 | continue 65 | 66 | if dtype_observed != object and isinstance(dtype_observed, dtype_expected): 67 | continue 68 | 69 | msg = "Column {name} is of type {dtype_observed}, but the schema suggests {dtype_expected}" 70 | 71 | if isinstance(dtype_observed, np.dtype): 72 | dtype_observed = "numpy." 
+ str(dtype_observed) 73 | 74 | raise TypeError( 75 | msg.format(name=name, dtype_observed=dtype_observed, dtype_expected=dtype_expected) 76 | ) 77 | -------------------------------------------------------------------------------- /strictly_typed_pandas/typeguard.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import PackageNotFoundError, version 2 | 3 | from strictly_typed_pandas import DataSet, IndexedDataSet 4 | from strictly_typed_pandas._vendor import typeguard 5 | 6 | try: 7 | COMPATIBLE_EXTERNAL_TYPEGUARD_EXISTS = version("typeguard").startswith("2.") 8 | except PackageNotFoundError: 9 | COMPATIBLE_EXTERNAL_TYPEGUARD_EXISTS = False 10 | 11 | if COMPATIBLE_EXTERNAL_TYPEGUARD_EXISTS: 12 | import typeguard as external_typeguard 13 | else: 14 | external_typeguard = None 15 | 16 | 17 | def check_dataset(argname: str, value, expected_type, memo: typeguard._TypeCheckMemo) -> None: 18 | schema_expected = expected_type.__args__[0] 19 | if not isinstance(value, DataSet): 20 | msg = "Type of {argname} must be a DataSet[{schema_expected}]; got {class_observed} instead" 21 | raise TypeError( 22 | msg.format( 23 | argname=argname, 24 | schema_expected=typeguard.qualified_name(schema_expected), 25 | class_observed=typeguard.qualified_name(value), 26 | ) 27 | ) 28 | 29 | schema_observed = value.__orig_class__.__args__[0] 30 | if schema_observed != schema_expected: 31 | msg = "Type of {argname} must be a DataSet[{schema_expected}]; got DataSet[{schema_observed}] instead" 32 | raise TypeError( 33 | msg.format( 34 | argname=argname, 35 | schema_expected=typeguard.qualified_name(schema_expected), 36 | schema_observed=typeguard.qualified_name(schema_observed), 37 | ) 38 | ) 39 | 40 | 41 | def check_indexed_dataset(argname: str, value, expected_type, memo: typeguard._TypeCheckMemo): 42 | schema_index_expected = expected_type.__args__[0] 43 | schema_data_expected = expected_type.__args__[1] 44 | if not 
isinstance(value, IndexedDataSet): 45 | msg = ( 46 | "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];" 47 | + "got {class_observed} instead" 48 | ) 49 | raise TypeError( 50 | msg.format( 51 | argname=argname, 52 | schema_index_expected=typeguard.qualified_name(schema_index_expected), 53 | schema_data_expected=typeguard.qualified_name(schema_data_expected), 54 | class_observed=typeguard.qualified_name(value), 55 | ) 56 | ) 57 | 58 | schema_index_observed = value.__orig_class__.__args__[0] 59 | schema_data_observed = value.__orig_class__.__args__[1] 60 | if ( 61 | schema_index_observed != schema_index_expected 62 | or schema_data_observed != schema_data_expected 63 | ): 64 | msg = ( 65 | "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];" 66 | + "got IndexedDataSet[{schema_index_observed},{schema_data_observed}] instead" 67 | ) 68 | raise TypeError( 69 | msg.format( 70 | argname=argname, 71 | schema_index_expected=typeguard.qualified_name(schema_index_expected), 72 | schema_data_expected=typeguard.qualified_name(schema_data_expected), 73 | schema_index_observed=typeguard.qualified_name(schema_index_observed), 74 | schema_data_observed=typeguard.qualified_name(schema_data_observed), 75 | ) 76 | ) 77 | 78 | 79 | typeguard.origin_type_checkers[DataSet] = check_dataset 80 | typeguard.origin_type_checkers[IndexedDataSet] = check_indexed_dataset 81 | typechecked = typeguard.typechecked 82 | 83 | if external_typeguard is not None: 84 | external_typeguard.origin_type_checkers[DataSet] = check_dataset 85 | external_typeguard.origin_type_checkers[IndexedDataSet] = check_indexed_dataset 86 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard-2.13.3.dist-info/METADATA: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: typeguard 3 | Version: 2.13.3 4 | 
Summary: Run-time type checker for Python 5 | Home-page: UNKNOWN 6 | Author: Alex Grönholm 7 | Author-email: alex.gronholm@nextday.fi 8 | License: MIT 9 | Project-URL: Documentation, https://typeguard.readthedocs.io/en/latest/ 10 | Project-URL: Change log, https://typeguard.readthedocs.io/en/latest/versionhistory.html 11 | Project-URL: Source code, https://github.com/agronholm/typeguard 12 | Project-URL: Issue tracker, https://github.com/agronholm/typeguard/issues 13 | Platform: UNKNOWN 14 | Classifier: Development Status :: 5 - Production/Stable 15 | Classifier: Intended Audience :: Developers 16 | Classifier: License :: OSI Approved :: MIT License 17 | Classifier: Programming Language :: Python 18 | Classifier: Programming Language :: Python :: 3 19 | Classifier: Programming Language :: Python :: 3.5 20 | Classifier: Programming Language :: Python :: 3.6 21 | Classifier: Programming Language :: Python :: 3.7 22 | Classifier: Programming Language :: Python :: 3.8 23 | Classifier: Programming Language :: Python :: 3.9 24 | Classifier: Programming Language :: Python :: 3.10 25 | Requires-Python: >=3.5.3 26 | License-File: LICENSE 27 | Provides-Extra: doc 28 | Requires-Dist: sphinx-rtd-theme ; extra == 'doc' 29 | Requires-Dist: sphinx-autodoc-typehints (>=1.2.0) ; extra == 'doc' 30 | Provides-Extra: test 31 | Requires-Dist: pytest ; extra == 'test' 32 | Requires-Dist: typing-extensions ; extra == 'test' 33 | Requires-Dist: mypy ; (platform_python_implementation != "PyPy") and extra == 'test' 34 | 35 | .. image:: https://travis-ci.com/agronholm/typeguard.svg?branch=master 36 | :target: https://travis-ci.com/agronholm/typeguard 37 | :alt: Build Status 38 | .. image:: https://coveralls.io/repos/agronholm/typeguard/badge.svg?branch=master&service=github 39 | :target: https://coveralls.io/github/agronholm/typeguard?branch=master 40 | :alt: Code Coverage 41 | .. 
image:: https://readthedocs.org/projects/typeguard/badge/?version=latest 42 | :target: https://typeguard.readthedocs.io/en/latest/?badge=latest 43 | 44 | This library provides run-time type checking for functions defined with 45 | `PEP 484 `_ argument (and return) type annotations. 46 | 47 | Four principal ways to do type checking are provided, each with its pros and cons: 48 | 49 | #. the ``check_argument_types()`` and ``check_return_type()`` functions: 50 | 51 | * debugger friendly (except when running with the pydev debugger with the C extension installed) 52 | * does not work reliably with dynamically defined type hints (e.g. in nested functions) 53 | #. the ``@typechecked`` decorator: 54 | 55 | * automatically type checks yields and sends of returned generators (regular and async) 56 | * adds an extra frame to the call stack for every call to a decorated function 57 | #. the stack profiler hook (``with TypeChecker('packagename'):``) (deprecated): 58 | 59 | * emits warnings instead of raising ``TypeError`` 60 | * requires very few modifications to the code 61 | * multiple TypeCheckers can be stacked/nested 62 | * does not work reliably with dynamically defined type hints (e.g. in nested functions) 63 | * may cause problems with badly behaving debuggers or profilers 64 | * cannot distinguish between an exception being raised and a ``None`` being returned 65 | #. the import hook (``typeguard.importhook.install_import_hook()``): 66 | 67 | * automatically annotates classes and functions with ``@typechecked`` on import 68 | * no code changes required in target modules 69 | * requires imports of modules you need to check to be deferred until after the import hook has 70 | been installed 71 | * may clash with other import hooks 72 | 73 | See the documentation_ for further instructions. 74 | 75 | .. 
_documentation: https://typeguard.readthedocs.io/en/latest/


--------------------------------------------------------------------------------
/docs/source/typeguard.rst:
--------------------------------------------------------------------------------
Typeguard
=========

We use typeguard in strictly typed pandas as an additional runtime check, as described in earlier sections. As per typeguard 3.0.0, a number of breaking changes were introduced, which we couldn't reconcile with strictly typed pandas. Other packages that depend on typeguard 2.13.3 are in a similar situation.

However, the ``typeguard<=2.13.3`` requirement became problematic over time, as it meant people could not use strictly typed pandas together with packages that depend on ``typeguard>=3.0.0``. For this reason, we have decided to vendor typeguard in ``strictly_typed_pandas==0.2.0``, meaning that we include typeguard within the strictly typed pandas code base, rather than having it as a dependency.

In this document, we outline how you can use typeguard with ``strictly_typed_pandas>=0.2.0``.

With typeguard 2.13.3 (backwards compatibility)
-----------------------------------------------

To support backwards compatibility, we allow you to use typeguard with ``strictly_typed_pandas>=0.2.0`` by simply installing ``typeguard==2.13.3``, without any other changes required. This can be done by running:

.. code-block:: bash

    pip install typeguard==2.13.3

You can use all functionality from typeguard as before:

Decorator
^^^^^^^^^

.. code-block:: python

    from typeguard import typechecked

    @typechecked
    def foo(df: DataSet[Person]) -> DataSet[Person]:
        ...

Import hook
^^^^^^^^^^^

..
code-block:: python 36 | 37 | from typeguard import install_import_hook 38 | 39 | install_import_hook('my_app') 40 | from my_app import some_module # import only AFTER installing the hook, or it won't take effect 41 | 42 | Pytest plugin 43 | ^^^^^^^^^^^^^ 44 | 45 | .. code-block:: bash 46 | 47 | pytest --typeguard-packages=my_app 48 | 49 | With the vendored typeguard version (recommended) 50 | ------------------------------------------------- 51 | 52 | We recommend that you use the vendored typeguard version, as it is the most future-proof solution. 53 | 54 | Decorator 55 | ^^^^^^^^^ 56 | 57 | You can use the vendored version as follows: 58 | 59 | .. code-block:: python 60 | 61 | from strictly_typed_pandas.typeguard import typechecked 62 | 63 | @typechecked 64 | def foo(df: DataSet[Person]) -> DataSet[Person]: 65 | ... 66 | 67 | If you also want to use a second typeguard version in your project (e.g. ``typeguard>=3.0.0``), you can pip install that version and then you can use the following: 68 | 69 | .. code-block:: python 70 | 71 | from typeguard import typechecked as typechecked_vanilla 72 | 73 | @typechecked_vanilla 74 | def foo(a: int) -> int: 75 | ... 76 | 77 | Note that ``@typechecked_vanilla`` will not work with strictly typed pandas types; you can only use it for projects that do not use strictly typed pandas. 78 | 79 | Import hook 80 | ^^^^^^^^^^^ 81 | 82 | The import hook is currently not supported in the vendored version. It should be possible to add support for this, but we have not done so yet. If you would like to use the import hook, please open an issue. 83 | 84 | Of course, you can still use the import hook with the vanilla version, as follows: 85 | 86 | .. 
code-block:: python 87 | 88 | from typeguard import install_import_hook 89 | 90 | install_import_hook('my_app') 91 | from my_app import some_module # import only AFTER installing the hook, or it won't take effect 92 | 93 | Pytest plugin 94 | ^^^^^^^^^^^^^ 95 | 96 | To use the vendored version of the pytest plugin, you can use the following: 97 | 98 | .. code-block:: bash 99 | 100 | pytest --stp-typeguard-packages=my_app 101 | 102 | If you also want to use a second typeguard version in your project (e.g. ``typeguard>=3.0.0``), you can pip install that version and then you can use the following: 103 | 104 | .. code-block:: bash 105 | 106 | pytest --typeguard-packages=my_other_app 107 | 108 | You can also use them at the same time: 109 | 110 | .. code-block:: bash 111 | 112 | pytest --stp-typeguard-packages=my_app --typeguard-packages=my_other_app 113 | 114 | Please don't define the same package in both flags, this will raise an error. 115 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import tempfile 3 | from typing import ClassVar 4 | 5 | import numpy as np # type: ignore 6 | import pandas as pd 7 | import pytest 8 | 9 | from strictly_typed_pandas import DataSet 10 | from strictly_typed_pandas.pandas_types import StringDtype 11 | 12 | 13 | class Schema: 14 | a: int 15 | b: str 16 | 17 | 18 | class AlternativeSchema: 19 | a: int 20 | 21 | 22 | class SchemaWithClassVar: 23 | a: int 24 | b: ClassVar[str] = "abc" 25 | 26 | 27 | dictionary = {"a": [1, 2, 3], "b": ["a", "b", "c"]} 28 | 29 | 30 | def test_empty_dataset() -> None: 31 | df = DataSet[Schema]() 32 | 33 | assert df.shape[0] == 0 34 | assert np.all(df.columns == ["a", "b"]) 35 | 36 | assert df.dtypes.iloc[0] == int 37 | assert df.dtypes.iloc[1] == object or isinstance(df.dtypes.iloc[1], StringDtype) 38 | 39 | 40 | def test_dataset() -> None: 41 | 
DataSet[Schema](dictionary) 42 | 43 | 44 | def test_dataset_missing_colnames() -> None: 45 | with pytest.raises(TypeError): 46 | DataSet[Schema]({"a": []}) 47 | 48 | 49 | def test_dataset_too_many_colnames() -> None: 50 | with pytest.raises(TypeError): 51 | DataSet[Schema]({"a": [], "b": [], "c": []}) 52 | 53 | 54 | def test_dataset_check_types() -> None: 55 | with pytest.raises(TypeError): 56 | DataSet[Schema]({"a": ["1", "2", "3"], "b": ""}) 57 | 58 | 59 | def test_dataset_immutable() -> None: 60 | df = DataSet[Schema](dictionary) 61 | strings = ["1", "2", "3"] 62 | 63 | with pytest.raises(NotImplementedError): 64 | df["a"] = strings 65 | 66 | with pytest.raises(NotImplementedError): 67 | df.a = strings 68 | 69 | with pytest.raises(NotImplementedError): 70 | df.loc[:, "a"] = strings 71 | 72 | with pytest.raises(NotImplementedError): 73 | df.iloc[:, 0] = strings 74 | 75 | with pytest.raises(NotImplementedError): 76 | df.assign(a=strings, inplace=True) 77 | 78 | with pytest.raises(NotImplementedError): 79 | # 4th argument is inplace 80 | df.set_index(["a"], True, False, True) # type: ignore 81 | 82 | assert isinstance(df.assign(a=strings), pd.DataFrame) 83 | 84 | 85 | def test_dataset_to_dataframe() -> None: 86 | df = DataSet[Schema](dictionary) 87 | assert isinstance(df.to_dataframe(), pd.DataFrame) 88 | assert isinstance(df.to_frame(), pd.DataFrame) 89 | 90 | 91 | def foo(df: DataSet[Schema]) -> DataSet[Schema]: 92 | return df 93 | 94 | 95 | def test_typeguard_dataset() -> None: 96 | foo(DataSet[Schema]()) 97 | 98 | with pytest.raises(TypeError): 99 | foo(DataSet[AlternativeSchema]()) # type: ignore 100 | 101 | with pytest.raises(TypeError): 102 | foo(pd.DataFrame()) # type: ignore 103 | 104 | 105 | def test_duplicates() -> None: 106 | with pytest.raises(TypeError): 107 | DataSet[AlternativeSchema]([[1, 1]], columns=["a", "a"]) 108 | 109 | 110 | def test_pickle(): 111 | df = DataSet[Schema](dictionary) 112 | 113 | with tempfile.TemporaryDirectory() as tmpdir: 114 
| pickle.dump(df, open(f"{tmpdir}/test.pkl", "wb")) 115 | loaded = pickle.load(open(f"{tmpdir}/test.pkl", "rb")) 116 | 117 | assert (df == loaded).all().all() 118 | 119 | 120 | def test_classvar_colum_not_allowed(): 121 | with pytest.raises(TypeError): 122 | DataSet[SchemaWithClassVar](dictionary) 123 | 124 | 125 | def test_classvar_colum_not_required(): 126 | DataSet[SchemaWithClassVar]({"a": [1, 2, 3]}) 127 | 128 | 129 | class A: 130 | a: int 131 | 132 | 133 | class B: 134 | a: int 135 | 136 | 137 | def test_resetting_of_schema_annotations(): 138 | df = DataSet[A]() 139 | 140 | a: pd.DataFrame 141 | 142 | # if no schema is specified, the annotation should be None 143 | a = DataSet(df) 144 | assert a._schema_annotations is None 145 | 146 | # when we specify a schema, the class variable will be set to A, but afterwards it should be 147 | # reset to None again when we initialize a new object without specifying a schema 148 | DataSet[A] 149 | a = DataSet(df) 150 | assert a._schema_annotations is None 151 | 152 | # and then to B 153 | a = DataSet[B](df) 154 | 155 | # and then to None again 156 | a = DataSet(df) 157 | assert a._schema_annotations is None 158 | -------------------------------------------------------------------------------- /tests/test_type_validation.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Union 2 | 3 | import numpy as np # type: ignore 4 | import pandas as pd 5 | 6 | from strictly_typed_pandas import DataSet, IndexedDataSet 7 | from strictly_typed_pandas.pandas_types import ( 8 | BackwardCompatibility, 9 | BooleanDtype, 10 | CategoricalDtype, 11 | DatetimeTZDtype, 12 | Int64Dtype, 13 | IntervalDtype, 14 | PeriodDtype, 15 | SparseDtype, 16 | StringDtype, 17 | ) 18 | 19 | 20 | def is_backward_compatibility_type(dtype) -> bool: 21 | if isinstance(dtype, BackwardCompatibility): 22 | return True 23 | 24 | if dtype != Any: 25 | if isinstance(dtype, Callable) and 
isinstance(dtype(), BackwardCompatibility): # type: ignore 26 | return True 27 | 28 | return False 29 | 30 | 31 | def are_they_equal(observed, expected) -> Union[bool, float]: 32 | if is_backward_compatibility_type(observed) or is_backward_compatibility_type(expected): 33 | return np.nan 34 | 35 | class SchemaExpected: 36 | a: expected 37 | 38 | class SchemaObserved: 39 | a: observed 40 | 41 | df = DataSet[SchemaObserved]() 42 | 43 | try: 44 | DataSet[SchemaExpected](df) 45 | except TypeError: 46 | return False 47 | 48 | return True 49 | 50 | 51 | def check_list_of_types(observed, expected_to_match, expected_to_fail): 52 | expected_to_match += [object, np.object_, Any] 53 | matches = pd.Series([are_they_equal(observed, expected) for expected in expected_to_match]) 54 | assert matches.dropna().all() 55 | 56 | fails = pd.Series([are_they_equal(observed, expected) for expected in expected_to_fail]) 57 | assert not fails.dropna().any() 58 | 59 | 60 | def test_numeric_base_python_types(): 61 | check_list_of_types(int, [np.int64, np.int_, int], [float, np.float64]) 62 | check_list_of_types(float, [np.float64, float], [int, np.int_]) 63 | check_list_of_types(bool, [np.bool_, bool], [int, np.int_]) 64 | 65 | 66 | def test_numpy_types(): 67 | check_list_of_types(np.int64, [np.int64, np.int_, int], [float, np.float64]) 68 | check_list_of_types(np.float64, [np.float64, float], [int, np.int_]) 69 | check_list_of_types(np.bool_, [np.bool_, bool], [int, np.int_]) 70 | check_list_of_types( 71 | np.datetime64, [np.datetime64], [np.timedelta64, DatetimeTZDtype(tz="UTC"), np.int_] 72 | ) 73 | check_list_of_types(np.timedelta64, [np.timedelta64], [np.datetime64, np.int64]) 74 | 75 | 76 | def test_pandas_types(): 77 | check_list_of_types( 78 | DatetimeTZDtype(tz="UTC"), 79 | [DatetimeTZDtype(tz="UTC")], 80 | [np.datetime64, DatetimeTZDtype(tz="GMT"), np.int_], 81 | ) 82 | check_list_of_types(CategoricalDtype, [CategoricalDtype], [Int64Dtype, np.int_, int]) 83 | check_list_of_types( 84 
| PeriodDtype(freq="D"), 85 | [PeriodDtype(freq="D")], 86 | [np.datetime64, PeriodDtype(freq="W"), np.int_], 87 | ) 88 | check_list_of_types( 89 | SparseDtype(dtype=np.int64), 90 | [SparseDtype(dtype=np.int64)], 91 | [np.int64, SparseDtype(dtype=np.float64), int], 92 | ) 93 | check_list_of_types(IntervalDtype, [IntervalDtype], [Int64Dtype, np.int_, int]) 94 | check_list_of_types(Int64Dtype, [Int64Dtype], [IntervalDtype, np.int64, int]) 95 | check_list_of_types(BooleanDtype, [BooleanDtype], [IntervalDtype, np.bool_, bool]) 96 | 97 | 98 | def test_strings(): 99 | check_list_of_types(str, [str, StringDtype], [int, np.int_]) 100 | check_list_of_types(StringDtype, [str, StringDtype], [int, np.int_]) 101 | 102 | # as long as this is true 103 | df = pd.DataFrame({"a": ["a", "b", "c"]}) 104 | assert df.dtypes.iloc[0] == object 105 | # we'll need to do this 106 | check_list_of_types(object, [str], [StringDtype]) 107 | 108 | 109 | def test_any(): 110 | check_list_of_types(Any, [], [int, np.int_]) 111 | check_list_of_types(object, [], [int, np.int_]) 112 | check_list_of_types(np.object_, [], [int, np.int_]) 113 | 114 | 115 | class DataSchema: 116 | b: str 117 | 118 | 119 | def test_supported_index_data_type(): 120 | dtypes = [ 121 | DatetimeTZDtype(tz="UTC"), 122 | CategoricalDtype, 123 | PeriodDtype(freq="D"), 124 | IntervalDtype, 125 | str, 126 | int, 127 | float, 128 | np.int_, 129 | np.float64, 130 | np.datetime64, 131 | np.timedelta64, 132 | Any, 133 | object, 134 | np.object_, 135 | SparseDtype(dtype=np.int64), 136 | np.bool_, 137 | Int64Dtype, 138 | BooleanDtype, 139 | StringDtype, 140 | ] 141 | for dtype in dtypes: 142 | if is_backward_compatibility_type(dtype): 143 | continue 144 | 145 | class IndexSchema: 146 | a: dtype # type: ignore 147 | 148 | IndexedDataSet[IndexSchema, DataSchema]() 149 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard/importhook.py: 
import ast
import sys
from importlib.abc import MetaPathFinder
from importlib.machinery import SourceFileLoader
from importlib.util import cache_from_source, decode_source
from inspect import isclass
from typing import Iterable, Type
from unittest.mock import patch


# The name of this function is magical: CPython's import machinery strips
# frames carrying this name from tracebacks raised during import.
def _call_with_frames_removed(f, *args, **kwargs):
    return f(*args, **kwargs)


def optimized_cache_from_source(path, debug_override=None):
    """Return the ``.pyc`` path for *path*, tagged with a ``typeguard``
    optimization marker so instrumented bytecode is cached separately from
    (and never shadows) regular bytecode."""
    return cache_from_source(path, debug_override, optimization='typeguard')


class TypeguardTransformer(ast.NodeVisitor):
    """AST visitor that injects ``@typeguard.typechecked`` instrumentation.

    It inserts ``from strictly_typed_pandas._vendor import typeguard`` at the
    top of the module, decorates every class, and decorates every annotated
    module-level function.
    """

    def __init__(self) -> None:
        # Stack of enclosing Module/ClassDef/FunctionDef nodes, used to tell
        # methods (handled by the class decorator) from free functions.
        self._parents = []

    def visit_Module(self, node: ast.Module):
        # Insert "from strictly_typed_pandas._vendor import typeguard" after the
        # module docstring and any "from __future__ ..." imports (both must
        # stay first for the module to remain valid).
        for i, child in enumerate(node.body):
            if isinstance(child, ast.ImportFrom) and child.module == '__future__':
                continue
            elif (isinstance(child, ast.Expr)
                    and isinstance(child.value, ast.Constant)
                    and isinstance(child.value.value, str)):
                # Module docstring. NOTE: ast.Str was deprecated in Python 3.8
                # and removed in 3.12; ast.Constant works on all supported
                # versions since ast.parse has emitted Constant nodes from 3.8.
                continue
            else:
                node.body.insert(
                    i,
                    ast.ImportFrom(
                        module="strictly_typed_pandas._vendor",
                        names=[ast.alias(name="typeguard", asname=None)],
                        level=0,
                    ),
                )
                break

        self._parents.append(node)
        self.generic_visit(node)
        self._parents.pop()
        return node

    def visit_ClassDef(self, node: ast.ClassDef):
        # Decorating the class instruments all of its methods at once.
        node.decorator_list.append(
            ast.Attribute(ast.Name(id='typeguard', ctx=ast.Load()), 'typechecked', ast.Load())
        )
        self._parents.append(node)
        self.generic_visit(node)
        self._parents.pop()
        return node

    def visit_FunctionDef(self, node: ast.FunctionDef):
        # Let the class level decorator handle the methods of a class
        if isinstance(self._parents[-1], ast.ClassDef):
            return node

        # Only instrument functions that actually carry annotations; the
        # decorator would be a no-op (with call overhead) otherwise.
        has_annotated_args = any(arg for arg in node.args.args if arg.annotation)
        has_annotated_return = bool(node.returns)
        if has_annotated_args or has_annotated_return:
            node.decorator_list.insert(
                0,
                ast.Attribute(ast.Name(id='typeguard', ctx=ast.Load()), 'typechecked', ast.Load())
            )

        self._parents.append(node)
        self.generic_visit(node)
        self._parents.pop()
        return node


class TypeguardLoader(SourceFileLoader):
    """Source loader that rewrites modules through :class:`TypeguardTransformer`."""

    def source_to_code(self, data, path, *, _optimize=-1):
        source = decode_source(data)
        tree = _call_with_frames_removed(compile, source, path, 'exec', ast.PyCF_ONLY_AST,
                                         dont_inherit=True, optimize=_optimize)
        tree = TypeguardTransformer().visit(tree)
        # Nodes inserted by the transformer have no line/column info yet.
        ast.fix_missing_locations(tree)
        return _call_with_frames_removed(compile, tree, path, 'exec',
                                         dont_inherit=True, optimize=_optimize)

    def exec_module(self, module):
        # Use a custom optimization marker – the import lock should make this monkey patch safe
        with patch('importlib._bootstrap_external.cache_from_source', optimized_cache_from_source):
            return super().exec_module(module)


class TypeguardFinder(MetaPathFinder):
    """
    Wraps another path finder and instruments the module with ``@typechecked`` if
    :meth:`should_instrument` returns ``True``.

    Should not be used directly, but rather via :func:`~.install_import_hook`.

    .. versionadded:: 2.6

    """

    def __init__(self, packages, original_pathfinder):
        self.packages = packages
        self._original_pathfinder = original_pathfinder

    def find_spec(self, fullname, path=None, target=None):
        if self.should_instrument(fullname):
            spec = self._original_pathfinder.find_spec(fullname, path, target)
            # Only source modules can be instrumented; extension modules and
            # namespace packages pass through with their original loader.
            if spec is not None and isinstance(spec.loader, SourceFileLoader):
                spec.loader = TypeguardLoader(spec.loader.name, spec.loader.path)
            return spec

        return None

    def should_instrument(self, module_name: str) -> bool:
        """
        Determine whether the module with the given name should be instrumented.

        :param module_name: full name of the module that is about to be imported (e.g. ``xyz.abc``)

        """
        for package in self.packages:
            if module_name == package or module_name.startswith(package + '.'):
                return True

        return False


class ImportHookManager:
    """Handle returned by :func:`install_import_hook`; uninstalls the hook on
    context-manager exit or via :meth:`uninstall`."""

    def __init__(self, hook: MetaPathFinder):
        self.hook = hook

    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.uninstall()

    def uninstall(self):
        try:
            sys.meta_path.remove(self.hook)
        except ValueError:
            pass  # already removed


def install_import_hook(packages: Iterable[str], *,
                        cls: Type[TypeguardFinder] = TypeguardFinder) -> ImportHookManager:
    """
    Install an import hook that decorates classes and functions with ``@typechecked``.

    This only affects modules loaded **after** this hook has been installed.

    :return: a context manager that uninstalls the hook on exit (or when you call ``.uninstall()``)

    .. versionadded:: 2.6

    """
    if isinstance(packages, str):
        packages = [packages]

    # Wrap the stock PathFinder so ordinary source resolution still happens;
    # only the loader is swapped for instrumented modules.
    for i, finder in enumerate(sys.meta_path):
        if isclass(finder) and finder.__name__ == 'PathFinder' and hasattr(finder, 'find_spec'):
            break
    else:
        raise RuntimeError('Cannot find a PathFinder in sys.meta_path')

    hook = cls(packages, finder)
    sys.meta_path.insert(0, hook)
    return ImportHookManager(hook)
29 | """ 30 | super().__init__(*args, **kwargs) 31 | 32 | if self.columns.duplicated().any(): 33 | msg = "DataSet has duplicate columns: {cols}".format( 34 | cols=self.columns[self.columns.duplicated()] 35 | ) 36 | raise TypeError(msg) 37 | 38 | def __setattr__(self, name: str, value: Any) -> None: 39 | object.__setattr__(self, name, value) 40 | 41 | if name in self.columns and name not in dataframe_member_names: 42 | raise NotImplementedError(immutable_error_msg) 43 | 44 | def __setitem__(self, key: Any, value: Any): 45 | raise NotImplementedError(immutable_error_msg) 46 | 47 | def __getattribute__(self, name: str) -> Any: 48 | if name in dataframe_functions: 49 | attribute = dataframe_functions[name].__get__(self, type(self)) 50 | return inplace_argument_interceptor(attribute) 51 | else: 52 | return object.__getattribute__(self, name) 53 | 54 | @property 55 | def iloc(self) -> _ImmutableiLocIndexer: # type: ignore 56 | return _ImmutableiLocIndexer("iloc", self) # type: ignore 57 | 58 | @property 59 | def loc(self) -> _ImmutableLocIndexer: # type: ignore 60 | return _ImmutableLocIndexer("loc", self) # type: ignore 61 | 62 | def to_dataframe(self) -> pd.DataFrame: 63 | """Converts the object to a pandas `DataFrame`.""" 64 | return pd.DataFrame(self) 65 | 66 | def to_frame(self) -> pd.DataFrame: 67 | """Synonym of to to_dataframe(): converts the object to a pandas `DataFrame`.""" 68 | return self.to_dataframe() 69 | 70 | 71 | T = TypeVar("T") 72 | V = TypeVar("V") 73 | 74 | 75 | class DataSet(Generic[T], DataSetBase): 76 | """`DataSet` allows for static type checking of pandas DataFrames, for example: 77 | 78 | .. code-block:: python 79 | 80 | class Schema: 81 | a: int 82 | 83 | DataSet[Schema]({"a": [1, 2, 3]}) 84 | 85 | Where `DataSet`: 86 | * is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`. 87 | * validates whether the data adheres to the provided schema upon its initialization. 
88 | * is immutable, so its schema cannot be changed using inplace modifications. 89 | 90 | The `DataSet[Schema]` annotations are compatible with: 91 | * `mypy` for type checking during linting-time (i.e. while you write your code). 92 | * `typeguard` (<3.0) for type checking during run-time (i.e. while you run your unit tests). 93 | """ 94 | 95 | _schema_annotations = None 96 | 97 | def __class_getitem__(cls, item): 98 | """Allows us to define a schema for the ``DataSet``.""" 99 | cls = super().__class_getitem__(item) 100 | cls._schema_annotations = item 101 | return cls 102 | 103 | def __init__(self, *args, **kwargs): 104 | super().__init__(*args, **kwargs) 105 | 106 | if DataSet._schema_annotations is None: 107 | return 108 | 109 | schema_expected = get_type_hints(DataSet._schema_annotations) 110 | DataSet._schema_annotations = None 111 | 112 | if self.shape == (0, 0): 113 | df = create_empty_dataframe(schema_expected) 114 | super().__init__(df) 115 | else: 116 | schema_observed = dict(zip(self.columns, self.dtypes)) 117 | validate_schema(schema_expected, schema_observed) 118 | 119 | 120 | class IndexedDataSet(Generic[T, V], DataSetBase): 121 | """`IndexedDataSet` allows for static type checking of indexed pandas DataFrames, 122 | for example: 123 | 124 | .. code-block:: text 125 | 126 | class IndexSchema: 127 | a: int 128 | 129 | class DataSchema: 130 | b: str 131 | 132 | df = ( 133 | pd.DataFrame( 134 | { 135 | "a": [1, 2, 3], 136 | "b": ["1", "2", "3"] 137 | } 138 | ) 139 | .set_index(["a"]) 140 | .pipe(IndexedDataSet[IndexSchema, DataSchema]) 141 | ) 142 | 143 | Where `IndexedDataSet`: 144 | * is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`. 145 | * validates whether the data adheres to the provided schema upon its initialization. 146 | * is immutable, so its schema cannot be changed using inplace modifications. 
147 | 148 | The `IndexedDataSet[Schema]` annotations are compatible with: 149 | * `mypy` for type checking during linting-time (i.e. while you write your code). 150 | * `typeguard` (<3.0) for type checking during run-time (i.e. while you run your unit tests). 151 | """ 152 | 153 | _schema_index = None 154 | _schema_data = None 155 | 156 | def __class_getitem__(cls, item): 157 | """Allows us to define a schema for the ``DataSet``.""" 158 | cls = super().__class_getitem__(item) 159 | cls._schema_index = item[0] 160 | cls._schema_data = item[1] 161 | return cls 162 | 163 | def __init__(self, *args, **kwargs): 164 | super().__init__(*args, **kwargs) 165 | 166 | if IndexedDataSet._schema_index is None or IndexedDataSet._schema_data is None: 167 | return 168 | 169 | schema_index_expected = get_type_hints(IndexedDataSet._schema_index) 170 | schema_data_expected = get_type_hints(IndexedDataSet._schema_data) 171 | IndexedDataSet._schema_index = None 172 | IndexedDataSet._schema_data = None 173 | 174 | check_for_duplicate_columns( 175 | set(schema_index_expected.keys()), set(schema_data_expected.keys()) 176 | ) 177 | 178 | if self.shape == (0, 0) and self.index.shape == (0,): 179 | df = create_empty_indexed_dataframe(schema_index_expected, schema_data_expected) 180 | super().__init__(df) 181 | else: 182 | schema_data_observed = dict(zip(self.columns, self.dtypes)) 183 | schema_index_observed = { 184 | name: self.index.get_level_values(i).dtype 185 | for i, name in enumerate(self.index.names) 186 | } 187 | 188 | if all(name is None for name in self.index.names): 189 | raise TypeError("No named columns in index. 
Did you remember to set the index?") 190 | 191 | validate_schema(schema_index_expected, schema_index_observed) 192 | validate_schema(schema_data_expected, schema_data_observed) 193 | -------------------------------------------------------------------------------- /docs/source/advanced.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Advanced\n", 8 | "\n", 9 | "## Subclassing schemas\n", 10 | "\n", 11 | "Subclassing schemas is a useful pattern for pipelines where every next function adds a few columns." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from strictly_typed_pandas import DataSet\n", 21 | "\n", 22 | "\n", 23 | "class SchemaA:\n", 24 | " name: str\n", 25 | "\n", 26 | "\n", 27 | "class SchemaB(SchemaA):\n", 28 | " id: int\n", 29 | "\n", 30 | "\n", 31 | "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n", 32 | "\n", 33 | "\n", 34 | "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n", 35 | " return df.assign(\n", 36 | " id=lambda df: range(df.shape[0]),\n", 37 | " ).pipe(DataSet[SchemaB])" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Similarly, you can use it when merging (or joining or concatenating) two datasets together." 
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "class SchemaA:\n", 54 | " id: int\n", 55 | " name: str\n", 56 | "\n", 57 | "\n", 58 | "class SchemaB:\n", 59 | " id: int\n", 60 | " job: str\n", 61 | "\n", 62 | "\n", 63 | "class SchemaAB(SchemaA, SchemaB):\n", 64 | " pass\n", 65 | "\n", 66 | "\n", 67 | "df1 = DataSet[SchemaA](\n", 68 | " {\n", 69 | " \"id\": [1, 2, 3],\n", 70 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 71 | " }\n", 72 | ")\n", 73 | "df2 = DataSet[SchemaB](\n", 74 | " {\n", 75 | " \"id\": [1, 2, 3],\n", 76 | " \"job\": \"Data Scientist\",\n", 77 | " }\n", 78 | ")\n", 79 | "df1.merge(df2, on=\"id\").pipe(DataSet[SchemaAB])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Creating an empty DataSet\n", 87 | "Sometimes it's useful to create a DataSet without any rows. This can be easily done as follows:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "class Schema:\n", 97 | " id: int\n", 98 | " name: str\n", 99 | "\n", 100 | "\n", 101 | "DataSet[Schema]()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## Support for numpy and pandas data types\n", 109 | "We also support using numpy types and pandas types, as well as `typing.Any`. If you miss support for any other data type, drop us a line and we'll see if we can add it!" 
110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "import numpy as np\n", 119 | "import pandas as pd\n", 120 | "from typing import Any\n", 121 | "\n", 122 | "\n", 123 | "class Schema:\n", 124 | " name: pd.StringDtype\n", 125 | " money: np.float64\n", 126 | " eggs: np.int64\n", 127 | " potatoes: Any\n", 128 | "\n", 129 | "\n", 130 | "df = DataSet[Schema](\n", 131 | " {\n", 132 | " \"name\": pd.Series([\"John\", \"Jane\", \"Jack\"], dtype=\"string\"),\n", 133 | " \"money\": pd.Series([100.50, 1000.23, 123.45], dtype=np.float64),\n", 134 | " \"eggs\": pd.Series([1, 2, 3], dtype=np.int64),\n", 135 | " \"potatoes\": [\"1\", 0, np.nan],\n", 136 | " }\n", 137 | ")\n", 138 | "\n", 139 | "df.dtypes" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "# ClassVar variables\n", 147 | "\n", 148 | "Variables annotated with `typing.ClassVar` variables are not included in the schema, so these can be used for example to store metadata about the DataSet." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "from typing import ClassVar\n", 158 | "\n", 159 | "class Schema:\n", 160 | " id: int\n", 161 | " name: str\n", 162 | " file_name: ClassVar[str] = \"schema_data.csv\"\n", 163 | "\n", 164 | "df1 = DataSet[Schema](\n", 165 | " {\n", 166 | " \"id\": [1, 2, 3],\n", 167 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 168 | " }\n", 169 | ")\n", 170 | "\n", 171 | "print(Schema.file_name)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## IndexedDataSet\n", 179 | "\n", 180 | "If you'd like to also strictly type the index, you can use the IndexedDataSet class." 
181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "from strictly_typed_pandas import IndexedDataSet\n", 190 | "\n", 191 | "\n", 192 | "class IndexSchema:\n", 193 | " id: int\n", 194 | " job: str\n", 195 | "\n", 196 | "\n", 197 | "class DataSchema:\n", 198 | " name: str\n", 199 | "\n", 200 | "\n", 201 | "df = (\n", 202 | " pd.DataFrame(\n", 203 | " {\n", 204 | " \"id\": [1, 2, 3],\n", 205 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 206 | " \"job\": \"Data Scientist\",\n", 207 | " }\n", 208 | " )\n", 209 | " .set_index([\"id\", \"job\"])\n", 210 | " .pipe(IndexedDataSet[IndexSchema, DataSchema])\n", 211 | ")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## Reusing a variable (e.g. `df`) with different schemas\n", 219 | "Sometimes when building a pipeline, it's useful to reuse a variable (e.g. `df`) with different schemas. If we do that in the following way however, we'll get a mypy error." 
"To avoid this error, we need to declare that `df` will be of the type `DataSet` (implying that the schema may be different at different points)"
"Pandas stores all numeric data using numpy data types. For example, if we make the following `DataFrame` (where we explicitly define the data types using base python types):"
72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "assert df.dtypes[\"a\"] == int\n", 81 | "assert df.dtypes[\"b\"] == float\n", 82 | "assert df.dtypes[\"c\"] == bool" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "Following this mindset, we allow the schemas to be defined using either numpy or base python data types." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "class Schema:\n", 99 | " a: int\n", 100 | " b: float\n", 101 | " c: bool\n", 102 | "\n", 103 | "\n", 104 | "df = DataSet[Schema]()\n", 105 | "df.dtypes" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "class Schema:\n", 115 | " a: np.int64\n", 116 | " b: np.float64\n", 117 | " c: np.bool_\n", 118 | "\n", 119 | "\n", 120 | "df = DataSet[Schema]()\n", 121 | "df.dtypes" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "You can also define your schema with superclasses (e.g. `np.integer`) instead of specific classes (e.g. `np.int64`)." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "class Schema:\n", 138 | " a: np.integer\n", 139 | "\n", 140 | "\n", 141 | "df = DataSet[Schema](\n", 142 | " {\n", 143 | " \"a\": pd.Series([1, 2, 3], dtype=np.int64),\n", 144 | " }\n", 145 | ")\n", 146 | "df.dtypes" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Datetime and timedelta\n", 154 | "These too are defined using numpy.\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "class Schema:\n", 164 | " a: np.datetime64\n", 165 | " b: np.timedelta64\n", 166 | "\n", 167 | "\n", 168 | "df = DataSet[Schema]()\n", 169 | "df.dtypes" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Pandas data types\n", 177 | "Pandas has a number of its own data types, to allow for things like:\n", 178 | "\n", 179 | "* Timezones\n", 180 | "\n", 181 | "* Categorical values\n", 182 | "\n", 183 | "* Sparse data" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "class Schema:\n", 193 | " a: pd.DatetimeTZDtype(tz=\"UTC\") # type: ignore # noqa: F821\n", 194 | " b: pd.CategoricalDtype\n", 195 | " c: pd.PeriodDtype(freq=\"D\") # type: ignore # noqa: F821\n", 196 | " d: pd.SparseDtype(dtype=np.int64) # type: ignore\n", 197 | " e: pd.IntervalDtype\n", 198 | " f: pd.Int64Dtype\n", 199 | " h: pd.BooleanDtype\n", 200 | "\n", 201 | "\n", 202 | "df = DataSet[Schema]()\n", 203 | "df.dtypes" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "Some of these types accept arguments (e.g. `pd.DatetimeTZDtype(tz=\"UTC\")`). 
While this works perfectly well during run-time, it does result in linting errors. You can suppress these without any problems by using `# type: ignore # noqa: F821`.\n", 211 | "\n", 212 | "Note that the pandas data types are not considered equivalent to their numpy or base python equivalents." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "class SchemaA:\n", 222 | " a: pd.Int64Dtype\n", 223 | "\n", 224 | "\n", 225 | "class SchemaB:\n", 226 | " a: np.int64\n", 227 | "\n", 228 | "\n", 229 | "try:\n", 230 | " DataSet[SchemaA]().pipe(DataSet[SchemaB])\n", 231 | "except TypeError as e:\n", 232 | " print(e)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## Strings\n", 240 | "String types are complicated business in pandas. From pandas 1.0.0 and higher, we suggest using the `string` (i.e. `pd.StringDtype`) data type. When defining a schema, this data type is compatible with both the base python `str` annotation and the pandas `pd.StringDtype` annotation." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "class Schema:\n", 250 | " a: str\n", 251 | " b: pd.StringDtype\n", 252 | "\n", 253 | "\n", 254 | "df = DataSet[Schema](\n", 255 | " {\n", 256 | " \"a\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\"),\n", 257 | " \"b\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\"),\n", 258 | " }\n", 259 | ")\n", 260 | "df.dtypes" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "Unfortunately, `pd.StringDtype` has only been around briefly: it isn't available in older versions of python, and as of yet it is still not used by default when creating a DataFrame with strings. Instead, strings are by default stored as the non-descript `object` type." 
"To be consistent, we have decided to set `str == object` when checking the schema, at least until `pd.StringDtype` becomes the default data type for strings in pandas."
327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "class Schema:\n", 336 | " a: pd.StringDtype\n", 337 | "\n", 338 | "\n", 339 | "df = DataSet[Schema]({\"a\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\")})" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "try:\n", 349 | " DataSet[Schema]({\"a\": [None, 42, lambda x: x]})\n", 350 | "except TypeError as e:\n", 351 | " print(e)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "## The `Any` type\n", 359 | "\n", 360 | "In some cases it is useful to be able to define that a column can have `Any` type. This can either be a column of a specific type (e.g. `int64`) or a mix of data types (i.e. an `object`)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "class Schema:\n", 370 | " a: Any\n", 371 | " b: Any\n", 372 | "\n", 373 | "\n", 374 | "df = DataSet[Schema](\n", 375 | " {\n", 376 | " \"a\": [1, 2, 3],\n", 377 | " \"b\": [\"1\", 2, None],\n", 378 | " }\n", 379 | ")\n", 380 | "df.dtypes" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "## Anything missing?\n", 388 | "There's a zoo of data types used in pandas. Is anything missing? Contact me and I'll look into it!" 
389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": "stp", 402 | "language": "python", 403 | "name": "python3" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.8.13" 416 | }, 417 | "orig_nbformat": 2, 418 | "vscode": { 419 | "interpreter": { 420 | "hash": "0785e816af5df78c77a9de5b5385808c06b955fe7dba50fa53415245f1f2e5ee" 421 | } 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /docs/source/getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting started\n", 8 | "\n", 9 | "## The problem\n", 10 | "\n", 11 | "I love Pandas! 
But in production code I’m always a bit wary when I see:" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "\n", 22 | "\n", 23 | "def foo(df: pd.DataFrame) -> pd.DataFrame:\n", 24 | " # do stuff\n", 25 | " return df" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "Because… How do I know which columns are supposed to be in `df`?\n", 33 | "\n", 34 | "Sure, in a notebook this is often not a big problem, because we'll likely have\n", 35 | "\n", 36 | "* a few hundred lines of code\n", 37 | "\n", 38 | "* that you're working on alone\n", 39 | "\n", 40 | "* over a limited amount of time\n", 41 | "\n", 42 | "But what if this is production code, where we have:\n", 43 | "\n", 44 | "* \\>1000 lines of code\n", 45 | "\n", 46 | "* that we are maintaining for years to come\n", 47 | "\n", 48 | "* potentially by colleagues who haven't even been hired yet\n", 49 | "\n", 50 | "You'll probably want to be a bit more explicit about what these DataFrames should look like!\n", 51 | "\n", 52 | "## The solution: static type checking of pandas DataFrames\n", 53 | "\n", 54 | "Suppose we know that our DataFrame has two columns: `id` (an int) and `name` (a str). Using `strictly_typed_pandas`, we may write that down as follows." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "from strictly_typed_pandas import DataSet\n", 64 | "\n", 65 | "\n", 66 | "class Schema:\n", 67 | " id: int\n", 68 | " name: str\n", 69 | "\n", 70 | "\n", 71 | "def foo(df: DataSet[Schema]) -> DataSet[Schema]:\n", 72 | " # do stuff\n", 73 | " return df" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "These type definitions can now be checked using `mypy`, a linter for static type checking. 
The big benefit of `mypy` is that the type checking doesn't happen during run-time, but rather during linting time (so while you're coding), saving you precious time. If you haven't already, you should really check out how to set up `mypy` for your IDE.\n", 81 | "\n", 82 | "Let's consider an example of how this works. First, we'll create some data. Since `DataSet` is a subclass of `pd.DataFrame`, it has (nearly) all the functionality of a `DataFrame`, including:" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "df = DataSet[Schema](\n", 92 | " {\n", 93 | " \"id\": [1, 2, 3],\n", 94 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 95 | " }\n", 96 | ")\n", 97 | "df" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "We can now call `foo()` with our data. All types check out, so nothing special happens." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "res = foo(df)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "However, if we instead try to run `foo()` on a `DataFrame`, mypy will throw the following error.\n", 121 | "\n", 122 | "(Shown as a comment here, but it will show up in your IDE if you set up mypy.)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "df = pd.DataFrame(df)\n", 132 | "res = foo(df)\n", 133 | "# mypy(error): Argument 1 to \"foo\" has incompatible type \"DataFrame\"; expected \"DataSet[Schema]\"" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Likewise, if we call `foo()` on a `DataSet` with an alternative schema, mypy will throw the following error." 
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "class AlternativeSchema:\n", 150 | " id: int\n", 151 | " first_name: str\n", 152 | "\n", 153 | "\n", 154 | "df = DataSet[AlternativeSchema](\n", 155 | " {\n", 156 | " \"id\": [1, 2, 3],\n", 157 | " \"first_name\": [\"John\", \"Jane\", \"Jack\"],\n", 158 | " }\n", 159 | ")\n", 160 | "try:\n", 161 | " res = foo(df)\n", 162 | " # mypy(error): Argument 1 to \"foo\" has incompatible type \"DataSet[AlternativeSchema]\"; expected \"DataSet[Schema]\"\n", 163 | "except:\n", 164 | " pass" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## How can we be sure that a DataSet adheres to its schema?\n", 172 | "\n", 173 | "The above is great if everyone is meticulous in keeping the schema annotations correct and up-to-date. But shouldn't we be worried that these schema annotations get out of sync? For example:" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "class Schema:\n", 183 | " id: int\n", 184 | " name: str\n", 185 | "\n", 186 | "\n", 187 | "def foo() -> DataSet[Schema]:\n", 188 | " return DataSet[Schema](\n", 189 | " {\n", 190 | " \"id\": [1, 2, 3],\n", 191 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 192 | " \"job\": \"Data Scientist\",\n", 193 | " }\n", 194 | " )" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Fortunately, we have some extra precautions in place that prevent the above scenario:\n", 202 | "\n", 203 | "* The schema of the data is validated during the `DataSet` creation.\n", 204 | "\n", 205 | "* `DataSet` is immutable, so its schema cannot change due to inplace modifications.\n", 206 | "\n", 207 | "As we will see, this means that if your codebase (e.g. 
`foo()`) is unit tested, functions like the above will result in errors and hence they shouldn't make it to the master branch. As such, you will be able to trust the schema annotations in your code base.\n", 208 | "\n", 209 | "Let's have a look at these precautions in more detail. First, if the columns in the data do not correspond to the ones defined in the shema, we get a TypeError, for example:\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "try:\n", 219 | " df = DataSet[Schema]({\"id\": [1, 2, 3]})\n", 220 | "except TypeError as e:\n", 221 | " print(e)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Similarly, if the types defined in the schema don't match the types in the data, we again get a `TypeError`." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "try:\n", 238 | " df = DataSet[Schema](\n", 239 | " {\n", 240 | " \"id\": [1, 2, 3],\n", 241 | " \"name\": [1, 2, 3],\n", 242 | " }\n", 243 | " )\n", 244 | "except TypeError as e:\n", 245 | " print(e)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Hence, when we successsfully create our `DataSet[Schema]`, we can be certain that it adheres to the schema. \n", 253 | "\n", 254 | "Of course, for this to work, we do need to make sure that the `DataSet`'s columns and datatypes cannot be changed after its creation. 
This brings us to our second point: \n", 255 | "\n", 256 | "* `DataSet` is immutable, so its schema cannot change due to inplace modifications.\n", 257 | "\n", 258 | "To this end, we have disabled operations such as:" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "df = DataSet[Schema](\n", 268 | " {\n", 269 | " \"id\": [1, 2, 3],\n", 270 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 271 | " }\n", 272 | ")\n", 273 | "ids = [\"1\", \"2\", \"3\"]\n", 274 | "try:\n", 275 | " df[\"id\"] = ids\n", 276 | " df.id = ids\n", 277 | " df.loc[:, \"id\"] = ids\n", 278 | " df.iloc[:, 0] = ids\n", 279 | " df.assign(id=ids, inplace=True)\n", 280 | "except NotImplementedError as e:\n", 281 | " print(e)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "When you do need to make changes to the schema, you can either cast the `DataSet` back to a `DataFrame`." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "df = df.to_dataframe()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "Or you can perform the `assign()` in the following way, which also casts it to a `DataFrame`" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "df = df.assign(id=ids)\n", 314 | "assert type(df) == pd.DataFrame" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "In practice, this often means that functions have the following sequence:\n", 322 | "\n", 323 | "1. The input is a `DataSet[SchemaA]`\n", 324 | "\n", 325 | "2. The data is converted to a `DataFrame` so changes can be made\n", 326 | "\n", 327 | "3. 
The output is cast to `DataSet[SchemaB]`" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "class SchemaA:\n", 337 | " name: str\n", 338 | "\n", 339 | "\n", 340 | "class SchemaB:\n", 341 | " id: int\n", 342 | " name: str\n", 343 | "\n", 344 | "\n", 345 | "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n", 346 | "\n", 347 | "\n", 348 | "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n", 349 | " n = df.shape[0]\n", 350 | " ids = range(n)\n", 351 | " new_df = df.assign(id=ids)\n", 352 | " return DataSet[SchemaB](new_df)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "Or alternatively in the more compact version" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "def foo(data: DataSet[SchemaA]) -> DataSet[SchemaB]:\n", 369 | " return df.assign(\n", 370 | " id=lambda df: range(df.shape[0]),\n", 371 | " ).pipe(DataSet[SchemaB])" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "## What about functions that return `Any`?\n", 379 | "So far we've seen that we can strictly type check our pandas data using a combination of linting checks and runtime checks. So is there anything that we haven't covered yet? Well, it turns out there is. 
Consider the following example.\n" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "class Schema:\n", 389 | " id: int\n", 390 | " name: str\n", 391 | "\n", 392 | "\n", 393 | "def foo() -> DataSet[Schema]:\n", 394 | " return (\n", 395 | " DataSet[Schema](\n", 396 | " {\n", 397 | " \"id\": [1, 2, 3],\n", 398 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 399 | " }\n", 400 | " )\n", 401 | " .assign(job=\"Data Scientist\")\n", 402 | " .iloc[:3]\n", 403 | " )\n", 404 | "\n", 405 | "\n", 406 | "res = foo()" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "Now this is interesting: `foo()` clearly returns something that doesn't adhere to the schema, but the above gives neither a linting error nor a runtime error!\n", 414 | "\n", 415 | "It turns out that the above problem often happens with functions like `iloc`, `loc` and `pipe`, whose return type is `Any` (and when you think about it, these can indeed return any possible datatype). When mypy sees that the return type is `Any`, it reasons that that could still be a `DataSet[Schema]` object, so it doesn't raise an error. It's only during runtime that we find out here that the return type actually is a `DataFrame`, but `mypy` doesn't do any runtime checks.\n", 416 | "\n", 417 | "Fortunately, Python offers other ways to do type checking during runtime. Here, we will use the `typeguard` package. 
" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "from strictly_typed_pandas.typeguard import typechecked\n", 427 | "\n", 428 | "\n", 429 | "@typechecked\n", 430 | "def foo() -> DataSet[Schema]:\n", 431 | " return (\n", 432 | " DataSet[Schema](\n", 433 | " {\n", 434 | " \"id\": [1, 2, 3],\n", 435 | " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", 436 | " }\n", 437 | " )\n", 438 | " .assign(job=\"Data Scientist\")\n", 439 | " .iloc[:3]\n", 440 | " )\n", 441 | "\n", 442 | "\n", 443 | "try:\n", 444 | " res = foo()\n", 445 | "except TypeError as e:\n", 446 | " print(e)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "Alright, we now caught the error dead in its tracks! \n", 454 | "\n", 455 | "We can improve this with one more step: instead of adding the `@typechecked` decorator to every function by hand (which could be error prone), `typeguard` can do this automatically when running the unit tests. 
To do this, simply run your unit tests using `pytest --stp-typeguard-packages=foo.bar` (where `foo.bar` is your package name)\n", 456 | "\n", 457 | "## Conclusions\n", 458 | "\n", 459 | "We can statically type check pandas in the following way:" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "from strictly_typed_pandas import DataSet\n", 469 | "\n", 470 | "\n", 471 | "class Schema:\n", 472 | " id: int\n", 473 | " name: str\n", 474 | "\n", 475 | "\n", 476 | "def foo(df: DataSet[Schema]) -> DataSet[Schema]:\n", 477 | " # do stuff\n", 478 | " return df" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "Where `DataSet`:\n", 486 | "\n", 487 | "* is a subclass of `pd.DataFrame` and hence has the same functionality as `DataFrame`.\n", 488 | "\n", 489 | "* validates whether the data adheres to the provided schema upon its initialization.\n", 490 | "\n", 491 | "* is immutable, so its schema cannot be changed using inplace modifications.\n", 492 | "\n", 493 | "The `DataSet[Schema]` annotations are compatible with:\n", 494 | "\n", 495 | "* `mypy` for type checking during linting-time (i.e. while you write your code).\n", 496 | "\n", 497 | "* `typeguard` for type checking during run-time (i.e. while you run your unit tests).\n", 498 | "\n", 499 | "To get the most out of `strictly_typed_pandas`, be sure to:\n", 500 | "\n", 501 | "* set up `mypy` in your IDE.\n", 502 | "\n", 503 | "* run your unit tests with `pytest --stp-typeguard-packages=foo.bar` (where `foo.bar` is your package name)." 
504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [] 510 | } 511 | ], 512 | "metadata": { 513 | "interpreter": { 514 | "hash": "21955bae40816b58329a864495bd83642121ab031d49eff86d34b7b0569c6cea" 515 | }, 516 | "kernelspec": { 517 | "display_name": "Python 3.8.5 64-bit ('base': conda)", 518 | "name": "python3" 519 | }, 520 | "language_info": { 521 | "name": "python", 522 | "version": "" 523 | }, 524 | "orig_nbformat": 2 525 | }, 526 | "nbformat": 4, 527 | "nbformat_minor": 2 528 | } 529 | -------------------------------------------------------------------------------- /strictly_typed_pandas/_vendor/typeguard/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ('ForwardRefPolicy', 'TypeHintWarning', 'typechecked', 'check_return_type', 2 | 'check_argument_types', 'check_type', 'TypeWarning', 'TypeChecker', 3 | 'typeguard_ignore') 4 | 5 | import collections.abc 6 | import gc 7 | import inspect 8 | import sys 9 | import threading 10 | from collections import OrderedDict 11 | from enum import Enum 12 | from functools import partial, wraps 13 | from inspect import Parameter, isclass, isfunction, isgeneratorfunction 14 | from io import BufferedIOBase, IOBase, RawIOBase, TextIOBase 15 | from traceback import extract_stack, print_stack 16 | from types import CodeType, FunctionType 17 | from typing import ( 18 | IO, TYPE_CHECKING, AbstractSet, Any, AsyncIterable, AsyncIterator, BinaryIO, Callable, Dict, 19 | Generator, Iterable, Iterator, List, NewType, Optional, Sequence, Set, TextIO, Tuple, Type, 20 | TypeVar, Union, get_type_hints, overload) 21 | from unittest.mock import Mock 22 | from warnings import warn 23 | from weakref import WeakKeyDictionary, WeakValueDictionary 24 | 25 | # Python 3.8+ 26 | try: 27 | from typing_extensions import Literal 28 | except ImportError: 29 | try: 30 | from typing import Literal 31 | except ImportError: 32 | Literal = None 33 | 34 | # Python 
3.5.4+ / 3.6.2+ 35 | try: 36 | from typing_extensions import NoReturn 37 | except ImportError: 38 | try: 39 | from typing import NoReturn 40 | except ImportError: 41 | NoReturn = None 42 | 43 | # Python 3.6+ 44 | try: 45 | from inspect import isasyncgen, isasyncgenfunction 46 | from typing import AsyncGenerator 47 | except ImportError: 48 | AsyncGenerator = None 49 | 50 | def isasyncgen(obj): 51 | return False 52 | 53 | def isasyncgenfunction(func): 54 | return False 55 | 56 | # Python 3.8+ 57 | try: 58 | from typing import ForwardRef 59 | evaluate_forwardref = ForwardRef._evaluate 60 | except ImportError: 61 | from typing import _ForwardRef as ForwardRef 62 | evaluate_forwardref = ForwardRef._eval_type 63 | 64 | if sys.version_info >= (3, 10): 65 | from typing import is_typeddict 66 | else: 67 | _typed_dict_meta_types = () 68 | if sys.version_info >= (3, 8): 69 | from typing import _TypedDictMeta 70 | _typed_dict_meta_types += (_TypedDictMeta,) 71 | 72 | try: 73 | from typing_extensions import _TypedDictMeta 74 | _typed_dict_meta_types += (_TypedDictMeta,) 75 | except ImportError: 76 | pass 77 | 78 | def is_typeddict(tp) -> bool: 79 | return isinstance(tp, _typed_dict_meta_types) 80 | 81 | 82 | if TYPE_CHECKING: 83 | _F = TypeVar("_F") 84 | 85 | def typeguard_ignore(f: _F) -> _F: 86 | """This decorator is a noop during static type-checking.""" 87 | return f 88 | else: 89 | from typing import no_type_check as typeguard_ignore 90 | 91 | 92 | _type_hints_map = WeakKeyDictionary() # type: Dict[FunctionType, Dict[str, Any]] 93 | _functions_map = WeakValueDictionary() # type: Dict[CodeType, FunctionType] 94 | _missing = object() 95 | 96 | T_CallableOrType = TypeVar('T_CallableOrType', bound=Callable[..., Any]) 97 | 98 | # Lifted from mypy.sharedparse 99 | BINARY_MAGIC_METHODS = { 100 | "__add__", 101 | "__and__", 102 | "__cmp__", 103 | "__divmod__", 104 | "__div__", 105 | "__eq__", 106 | "__floordiv__", 107 | "__ge__", 108 | "__gt__", 109 | "__iadd__", 110 | "__iand__", 
111 | "__idiv__", 112 | "__ifloordiv__", 113 | "__ilshift__", 114 | "__imatmul__", 115 | "__imod__", 116 | "__imul__", 117 | "__ior__", 118 | "__ipow__", 119 | "__irshift__", 120 | "__isub__", 121 | "__itruediv__", 122 | "__ixor__", 123 | "__le__", 124 | "__lshift__", 125 | "__lt__", 126 | "__matmul__", 127 | "__mod__", 128 | "__mul__", 129 | "__ne__", 130 | "__or__", 131 | "__pow__", 132 | "__radd__", 133 | "__rand__", 134 | "__rdiv__", 135 | "__rfloordiv__", 136 | "__rlshift__", 137 | "__rmatmul__", 138 | "__rmod__", 139 | "__rmul__", 140 | "__ror__", 141 | "__rpow__", 142 | "__rrshift__", 143 | "__rshift__", 144 | "__rsub__", 145 | "__rtruediv__", 146 | "__rxor__", 147 | "__sub__", 148 | "__truediv__", 149 | "__xor__", 150 | } 151 | 152 | 153 | class ForwardRefPolicy(Enum): 154 | """Defines how unresolved forward references are handled.""" 155 | 156 | ERROR = 1 #: propagate the :exc:`NameError` from :func:`~typing.get_type_hints` 157 | WARN = 2 #: remove the annotation and emit a TypeHintWarning 158 | #: replace the annotation with the argument's class if the qualified name matches, else remove 159 | #: the annotation 160 | GUESS = 3 161 | 162 | 163 | class TypeHintWarning(UserWarning): 164 | """ 165 | A warning that is emitted when a type hint in string form could not be resolved to an actual 166 | type. 
167 | """ 168 | 169 | 170 | class _TypeCheckMemo: 171 | __slots__ = 'globals', 'locals' 172 | 173 | def __init__(self, globals: Dict[str, Any], locals: Dict[str, Any]): 174 | self.globals = globals 175 | self.locals = locals 176 | 177 | 178 | def _strip_annotation(annotation): 179 | if isinstance(annotation, str): 180 | return annotation.strip("'") 181 | else: 182 | return annotation 183 | 184 | 185 | class _CallMemo(_TypeCheckMemo): 186 | __slots__ = 'func', 'func_name', 'arguments', 'is_generator', 'type_hints' 187 | 188 | def __init__(self, func: Callable, frame_locals: Optional[Dict[str, Any]] = None, 189 | args: tuple = None, kwargs: Dict[str, Any] = None, 190 | forward_refs_policy=ForwardRefPolicy.ERROR): 191 | super().__init__(func.__globals__, frame_locals) 192 | self.func = func 193 | self.func_name = function_name(func) 194 | self.is_generator = isgeneratorfunction(func) 195 | signature = inspect.signature(func) 196 | 197 | if args is not None and kwargs is not None: 198 | self.arguments = signature.bind(*args, **kwargs).arguments 199 | else: 200 | assert frame_locals is not None, 'frame must be specified if args or kwargs is None' 201 | self.arguments = frame_locals 202 | 203 | self.type_hints = _type_hints_map.get(func) 204 | if self.type_hints is None: 205 | while True: 206 | if sys.version_info < (3, 5, 3): 207 | frame_locals = dict(frame_locals) 208 | 209 | try: 210 | hints = get_type_hints(func, localns=frame_locals) 211 | except NameError as exc: 212 | if forward_refs_policy is ForwardRefPolicy.ERROR: 213 | raise 214 | 215 | typename = str(exc).split("'", 2)[1] 216 | for param in signature.parameters.values(): 217 | if _strip_annotation(param.annotation) == typename: 218 | break 219 | else: 220 | raise 221 | 222 | func_name = function_name(func) 223 | if forward_refs_policy is ForwardRefPolicy.GUESS: 224 | if param.name in self.arguments: 225 | argtype = self.arguments[param.name].__class__ 226 | stripped = _strip_annotation(param.annotation) 227 | 
if stripped == argtype.__qualname__: 228 | func.__annotations__[param.name] = argtype 229 | msg = ('Replaced forward declaration {!r} in {} with {!r}' 230 | .format(stripped, func_name, argtype)) 231 | warn(TypeHintWarning(msg)) 232 | continue 233 | 234 | msg = 'Could not resolve type hint {!r} on {}: {}'.format( 235 | param.annotation, function_name(func), exc) 236 | warn(TypeHintWarning(msg)) 237 | del func.__annotations__[param.name] 238 | else: 239 | break 240 | 241 | self.type_hints = OrderedDict() 242 | for name, parameter in signature.parameters.items(): 243 | if name in hints: 244 | annotated_type = hints[name] 245 | 246 | # PEP 428 discourages it by MyPy does not complain 247 | if parameter.default is None: 248 | annotated_type = Optional[annotated_type] 249 | 250 | if parameter.kind == Parameter.VAR_POSITIONAL: 251 | self.type_hints[name] = Tuple[annotated_type, ...] 252 | elif parameter.kind == Parameter.VAR_KEYWORD: 253 | self.type_hints[name] = Dict[str, annotated_type] 254 | else: 255 | self.type_hints[name] = annotated_type 256 | 257 | if 'return' in hints: 258 | self.type_hints['return'] = hints['return'] 259 | 260 | _type_hints_map[func] = self.type_hints 261 | 262 | 263 | def resolve_forwardref(maybe_ref, memo: _TypeCheckMemo): 264 | if isinstance(maybe_ref, ForwardRef): 265 | if sys.version_info < (3, 9, 0): 266 | return evaluate_forwardref(maybe_ref, memo.globals, memo.locals) 267 | else: 268 | return evaluate_forwardref(maybe_ref, memo.globals, memo.locals, frozenset()) 269 | 270 | else: 271 | return maybe_ref 272 | 273 | 274 | def get_type_name(type_): 275 | name = (getattr(type_, '__name__', None) or getattr(type_, '_name', None) or 276 | getattr(type_, '__forward_arg__', None)) 277 | if name is None: 278 | origin = getattr(type_, '__origin__', None) 279 | name = getattr(origin, '_name', None) 280 | if name is None and not inspect.isclass(type_): 281 | name = type_.__class__.__name__.strip('_') 282 | 283 | args = getattr(type_, '__args__', 
()) or getattr(type_, '__values__', ()) 284 | if args != getattr(type_, '__parameters__', ()): 285 | if name == 'Literal': 286 | formatted_args = ', '.join(str(arg) for arg in args) 287 | else: 288 | formatted_args = ', '.join(get_type_name(arg) for arg in args) 289 | 290 | name = '{}[{}]'.format(name, formatted_args) 291 | 292 | module = getattr(type_, '__module__', None) 293 | if module not in (None, 'typing', 'typing_extensions', 'builtins'): 294 | name = module + '.' + name 295 | 296 | return name 297 | 298 | 299 | def find_function(frame) -> Optional[Callable]: 300 | """ 301 | Return a function object from the garbage collector that matches the frame's code object. 302 | 303 | This process is unreliable as several function objects could use the same code object. 304 | Fortunately the likelihood of this happening with the combination of the function objects 305 | having different type annotations is a very rare occurrence. 306 | 307 | :param frame: a frame object 308 | :return: a function object if one was found, ``None`` if not 309 | 310 | """ 311 | func = _functions_map.get(frame.f_code) 312 | if func is None: 313 | for obj in gc.get_referrers(frame.f_code): 314 | if inspect.isfunction(obj): 315 | if func is None: 316 | # The first match was found 317 | func = obj 318 | else: 319 | # A second match was found 320 | return None 321 | 322 | # Cache the result for future lookups 323 | if func is not None: 324 | _functions_map[frame.f_code] = func 325 | else: 326 | raise LookupError('target function not found') 327 | 328 | return func 329 | 330 | 331 | def qualified_name(obj) -> str: 332 | """ 333 | Return the qualified name (e.g. package.module.Type) for the given object. 334 | 335 | Builtins and types from the :mod:`typing` package get special treatment by having the module 336 | name stripped from the generated name. 
337 | 338 | """ 339 | type_ = obj if inspect.isclass(obj) else type(obj) 340 | module = type_.__module__ 341 | qualname = type_.__qualname__ 342 | return qualname if module in ('typing', 'builtins') else '{}.{}'.format(module, qualname) 343 | 344 | 345 | def function_name(func: Callable) -> str: 346 | """ 347 | Return the qualified name of the given function. 348 | 349 | Builtins and types from the :mod:`typing` package get special treatment by having the module 350 | name stripped from the generated name. 351 | 352 | """ 353 | # For partial functions and objects with __call__ defined, __qualname__ does not exist 354 | # For functions run in `exec` with a custom namespace, __module__ can be None 355 | module = getattr(func, '__module__', '') or '' 356 | qualname = (module + '.') if module not in ('builtins', '') else '' 357 | return qualname + getattr(func, '__qualname__', repr(func)) 358 | 359 | 360 | def check_callable(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None: 361 | if not callable(value): 362 | raise TypeError('{} must be a callable'.format(argname)) 363 | 364 | if getattr(expected_type, "__args__", None): 365 | try: 366 | signature = inspect.signature(value) 367 | except (TypeError, ValueError): 368 | return 369 | 370 | if hasattr(expected_type, '__result__'): 371 | # Python 3.5 372 | argument_types = expected_type.__args__ 373 | check_args = argument_types is not Ellipsis 374 | else: 375 | # Python 3.6 376 | argument_types = expected_type.__args__[:-1] 377 | check_args = argument_types != (Ellipsis,) 378 | 379 | if check_args: 380 | # The callable must not have keyword-only arguments without defaults 381 | unfulfilled_kwonlyargs = [ 382 | param.name for param in signature.parameters.values() if 383 | param.kind == Parameter.KEYWORD_ONLY and param.default == Parameter.empty] 384 | if unfulfilled_kwonlyargs: 385 | raise TypeError( 386 | 'callable passed as {} has mandatory keyword-only arguments in its ' 387 | 'declaration: 
{}'.format(argname, ', '.join(unfulfilled_kwonlyargs))) 388 | 389 | num_mandatory_args = len([ 390 | param.name for param in signature.parameters.values() 391 | if param.kind in (Parameter.POSITIONAL_ONLY, Parameter.POSITIONAL_OR_KEYWORD) and 392 | param.default is Parameter.empty]) 393 | has_varargs = any(param for param in signature.parameters.values() 394 | if param.kind == Parameter.VAR_POSITIONAL) 395 | 396 | if num_mandatory_args > len(argument_types): 397 | raise TypeError( 398 | 'callable passed as {} has too many arguments in its declaration; expected {} ' 399 | 'but {} argument(s) declared'.format(argname, len(argument_types), 400 | num_mandatory_args)) 401 | elif not has_varargs and num_mandatory_args < len(argument_types): 402 | raise TypeError( 403 | 'callable passed as {} has too few arguments in its declaration; expected {} ' 404 | 'but {} argument(s) declared'.format(argname, len(argument_types), 405 | num_mandatory_args)) 406 | 407 | 408 | def check_dict(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None: 409 | if not isinstance(value, dict): 410 | raise TypeError('type of {} must be a dict; got {} instead'. 
411 | format(argname, qualified_name(value))) 412 | 413 | if expected_type is not dict: 414 | if (hasattr(expected_type, "__args__") and 415 | expected_type.__args__ not in (None, expected_type.__parameters__)): 416 | key_type, value_type = expected_type.__args__ 417 | if key_type is not Any or value_type is not Any: 418 | for k, v in value.items(): 419 | check_type('keys of {}'.format(argname), k, key_type, memo) 420 | check_type('{}[{!r}]'.format(argname, k), v, value_type, memo) 421 | 422 | 423 | def check_typed_dict(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None: 424 | declared_keys = frozenset(expected_type.__annotations__) 425 | if hasattr(expected_type, '__required_keys__'): 426 | required_keys = expected_type.__required_keys__ 427 | else: # py3.8 and lower 428 | required_keys = declared_keys if expected_type.__total__ else frozenset() 429 | 430 | existing_keys = frozenset(value) 431 | extra_keys = existing_keys - declared_keys 432 | if extra_keys: 433 | keys_formatted = ', '.join('"{}"'.format(key) for key in sorted(extra_keys)) 434 | raise TypeError('extra key(s) ({}) in {}'.format(keys_formatted, argname)) 435 | 436 | missing_keys = required_keys - existing_keys 437 | if missing_keys: 438 | keys_formatted = ', '.join('"{}"'.format(key) for key in sorted(missing_keys)) 439 | raise TypeError('required key(s) ({}) missing from {}'.format(keys_formatted, argname)) 440 | 441 | for key, argtype in get_type_hints(expected_type).items(): 442 | argvalue = value.get(key, _missing) 443 | if argvalue is not _missing: 444 | check_type('dict item "{}" for {}'.format(key, argname), argvalue, argtype, memo) 445 | 446 | 447 | def check_list(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None: 448 | if not isinstance(value, list): 449 | raise TypeError('type of {} must be a list; got {} instead'. 
450 | format(argname, qualified_name(value))) 451 | 452 | if expected_type is not list: 453 | if hasattr(expected_type, "__args__") and expected_type.__args__ not in \ 454 | (None, expected_type.__parameters__): 455 | value_type = expected_type.__args__[0] 456 | if value_type is not Any: 457 | for i, v in enumerate(value): 458 | check_type('{}[{}]'.format(argname, i), v, value_type, memo) 459 | 460 | 461 | def check_sequence(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None: 462 | if not isinstance(value, collections.abc.Sequence): 463 | raise TypeError('type of {} must be a sequence; got {} instead'. 464 | format(argname, qualified_name(value))) 465 | 466 | if hasattr(expected_type, "__args__") and expected_type.__args__ not in \ 467 | (None, expected_type.__parameters__): 468 | value_type = expected_type.__args__[0] 469 | if value_type is not Any: 470 | for i, v in enumerate(value): 471 | check_type('{}[{}]'.format(argname, i), v, value_type, memo) 472 | 473 | 474 | def check_set(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None: 475 | if not isinstance(value, AbstractSet): 476 | raise TypeError('type of {} must be a set; got {} instead'. 
477 | format(argname, qualified_name(value))) 478 | 479 | if expected_type is not set: 480 | if hasattr(expected_type, "__args__") and expected_type.__args__ not in \ 481 | (None, expected_type.__parameters__): 482 | value_type = expected_type.__args__[0] 483 | if value_type is not Any: 484 | for v in value: 485 | check_type('elements of {}'.format(argname), v, value_type, memo) 486 | 487 | 488 | def check_tuple(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None: 489 | # Specialized check for NamedTuples 490 | is_named_tuple = False 491 | if sys.version_info < (3, 8, 0): 492 | is_named_tuple = hasattr(expected_type, '_field_types') # deprecated since python 3.8 493 | else: 494 | is_named_tuple = hasattr(expected_type, '__annotations__') 495 | 496 | if is_named_tuple: 497 | if not isinstance(value, expected_type): 498 | raise TypeError('type of {} must be a named tuple of type {}; got {} instead'. 499 | format(argname, qualified_name(expected_type), qualified_name(value))) 500 | 501 | if sys.version_info < (3, 8, 0): 502 | field_types = expected_type._field_types 503 | else: 504 | field_types = expected_type.__annotations__ 505 | 506 | for name, field_type in field_types.items(): 507 | check_type('{}.{}'.format(argname, name), getattr(value, name), field_type, memo) 508 | 509 | return 510 | elif not isinstance(value, tuple): 511 | raise TypeError('type of {} must be a tuple; got {} instead'. 
                            format(argname, qualified_name(value)))

    # NOTE(review): this file is vendored typeguard 2.13.3 (see vendorize.toml).
    # Keep it byte-identical to upstream; fix bugs upstream and re-vendor
    # rather than patching here.

    if getattr(expected_type, '__tuple_params__', None):
        # Python 3.5
        use_ellipsis = expected_type.__tuple_use_ellipsis__
        tuple_params = expected_type.__tuple_params__
    elif getattr(expected_type, '__args__', None):
        # Python 3.6+
        use_ellipsis = expected_type.__args__[-1] is Ellipsis
        tuple_params = expected_type.__args__[:-1 if use_ellipsis else None]
    else:
        # Unparametrized Tuple or plain tuple
        return

    if use_ellipsis:
        # Tuple[X, ...]: every element must match the single declared type
        element_type = tuple_params[0]
        for i, element in enumerate(value):
            check_type('{}[{}]'.format(argname, i), element, element_type, memo)
    elif tuple_params == ((),):
        # Tuple[()] is the empty-tuple type
        if value != ():
            raise TypeError('{} is not an empty tuple but one was expected'.format(argname))
    else:
        # Fixed-length tuple: lengths must match, then each position is checked
        if len(value) != len(tuple_params):
            raise TypeError('{} has wrong number of elements (expected {}, got {} instead)'
                            .format(argname, len(tuple_params), len(value)))

        for i, (element, element_type) in enumerate(zip(value, tuple_params)):
            check_type('{}[{}]'.format(argname, i), element, element_type, memo)


# A value matches a Union if it matches at least one member type; TypeErrors
# from non-matching members are swallowed until all members are exhausted.
def check_union(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    if hasattr(expected_type, '__union_params__'):
        # Python 3.5
        union_params = expected_type.__union_params__
    else:
        # Python 3.6+
        union_params = expected_type.__args__

    for type_ in union_params:
        try:
            check_type(argname, value, type_, memo)
            return
        except TypeError:
            pass

    typelist = ', '.join(get_type_name(t) for t in union_params)
    raise TypeError('type of {} must be one of ({}); got {} instead'.
                    format(argname, typelist, qualified_name(value)))


# Validates a value annotated as Type[...] / type[...]: the value must itself
# be a class, and (when parametrized) a subclass of the declared class.
def check_class(argname: str, value, expected_type, memo: _TypeCheckMemo) -> None:
    if not isclass(value):
        raise TypeError('type of {} must be a type; got {} instead'.format(
            argname, qualified_name(value)))

    # Needed on Python 3.7+
    if expected_type is Type:
        return

    if getattr(expected_type, '__origin__', None) in (Type, type):
        expected_class = expected_type.__args__[0]
    else:
        expected_class = expected_type

    if expected_class is Any:
        return
    elif isinstance(expected_class, TypeVar):
        # Type[T]: check the class itself against the TypeVar (subclass_check=True)
        check_typevar(argname, value, expected_class, memo, True)
    elif getattr(expected_class, '__origin__', None) is Union:
        # Type[Union[A, B]]: accept if the class matches any union member
        for arg in expected_class.__args__:
            try:
                check_class(argname, value, arg, memo)
                break
            except TypeError:
                pass
        else:
            formatted_args = ', '.join(get_type_name(arg) for arg in expected_class.__args__)
            raise TypeError('{} must match one of the following: ({}); got {} instead'.format(
                argname, formatted_args, qualified_name(value)
            ))
    elif not issubclass(value, expected_class):
        raise TypeError('{} must be a subclass of {}; got {} instead'.format(
            argname, qualified_name(expected_class), qualified_name(value)))


# Checks a value against a TypeVar's bound or constraints (which are mutually
# exclusive per the typing spec — hence the if/elif). With subclass_check=True
# the value is itself a class (Type[T] case) rather than an instance.
def check_typevar(argname: str, value, typevar: TypeVar, memo: _TypeCheckMemo,
                  subclass_check: bool = False) -> None:
    value_type = value if subclass_check else type(value)
    subject = argname if subclass_check else 'type of ' + argname

    if typevar.__bound__ is not None:
        bound_type = resolve_forwardref(typevar.__bound__, memo)
        if not issubclass(value_type, bound_type):
            raise TypeError(
                '{} must be {} or one of its subclasses; got {} instead'
                .format(subject, qualified_name(bound_type), qualified_name(value_type)))
    elif typevar.__constraints__:
        constraints = [resolve_forwardref(c, memo) for c in typevar.__constraints__]
        for constraint in constraints:
            try:
                check_type(argname, value, constraint, memo)
            except TypeError:
                pass
            else:
                break
        else:
            # for/else: no constraint matched
            formatted_constraints = ', '.join(get_type_name(constraint)
                                              for constraint in constraints)
            raise TypeError('{} must match one of the constraints ({}); got {} instead'
                            .format(subject, formatted_constraints, qualified_name(value_type)))


# Checks a value against a Literal[...] type. Nested Literals are flattened
# recursively; only int/str/bytes/bool/None/Enum members are legal per PEP 586.
def check_literal(argname: str, value, expected_type, memo: _TypeCheckMemo):
    def get_args(literal):
        try:
            args = literal.__args__
        except AttributeError:
            # Instance of Literal from typing_extensions
            args = literal.__values__

        retval = []
        for arg in args:
            if isinstance(arg, Literal.__class__) or getattr(arg, '__origin__', None) is Literal:
                # The first check works on py3.6 and lower, the second one on py3.7+
                retval.extend(get_args(arg))
            elif isinstance(arg, (int, str, bytes, bool, type(None), Enum)):
                retval.append(arg)
            else:
                raise TypeError('Illegal literal value: {}'.format(arg))

        return retval

    final_args = tuple(get_args(expected_type))
    if value not in final_args:
        raise TypeError('the value of {} must be one of {}; got {} instead'.
                        format(argname, final_args, value))


# Implements the PEP 484 numeric tower: int is acceptable where float is
# expected, and int/float are acceptable where complex is expected.
def check_number(argname: str, value, expected_type):
    if expected_type is complex and not isinstance(value, (complex, float, int)):
        raise TypeError('type of {} must be either complex, float or int; got {} instead'.
                        format(argname, qualified_name(value.__class__)))
    elif expected_type is float and not isinstance(value, (float, int)):
        raise TypeError('type of {} must be either float or int; got {} instead'.
                        format(argname, qualified_name(value.__class__)))


# Checks a value against the typing IO ABCs (IO / TextIO / BinaryIO) using the
# concrete io-module base classes.
def check_io(argname: str, value, expected_type):
    if expected_type is TextIO:
        if not isinstance(value, TextIOBase):
            raise TypeError('type of {} must be a text based I/O object; got {} instead'.
                            format(argname, qualified_name(value.__class__)))
    elif expected_type is BinaryIO:
        if not isinstance(value, (RawIOBase, BufferedIOBase)):
            raise TypeError('type of {} must be a binary I/O object; got {} instead'.
                            format(argname, qualified_name(value.__class__)))
    elif not isinstance(value, IOBase):
        raise TypeError('type of {} must be an I/O object; got {} instead'.
                        format(argname, qualified_name(value.__class__)))


# Structural check for typing.Protocol classes. Only runtime-checkable
# protocols are checked (via isinstance); others are silently accepted.
def check_protocol(argname: str, value, expected_type):
    # TODO: implement proper compatibility checking and support non-runtime protocols
    if getattr(expected_type, '_is_runtime_protocol', False):
        if not isinstance(value, expected_type):
            raise TypeError('type of {} ({}) is not compatible with the {} protocol'.
                            format(argname, type(value).__qualname__, expected_type.__qualname__))


# Equality checks are applied to these
# (dispatch table: generic origin type -> specialized checker function)
origin_type_checkers = {
    AbstractSet: check_set,
    Callable: check_callable,
    collections.abc.Callable: check_callable,
    dict: check_dict,
    Dict: check_dict,
    list: check_list,
    List: check_list,
    Sequence: check_sequence,
    collections.abc.Sequence: check_sequence,
    collections.abc.Set: check_set,
    set: check_set,
    Set: check_set,
    tuple: check_tuple,
    Tuple: check_tuple,
    type: check_class,
    Type: check_class,
    Union: check_union
}
# On some older Pythons, Union members can be tested via issubclass()
_subclass_check_unions = hasattr(Union, '__union_set_params__')
if Literal is not None:
    origin_type_checkers[Literal] = check_literal

# __origin__ values that identify (a)sync generator/iterator return annotations
generator_origin_types = (Generator, collections.abc.Generator,
                          Iterator, collections.abc.Iterator,
                          Iterable, collections.abc.Iterable)
asyncgen_origin_types = (AsyncIterator, collections.abc.AsyncIterator,
                         AsyncIterable, collections.abc.AsyncIterable)
if AsyncGenerator is not None:
    asyncgen_origin_types += (AsyncGenerator,)
if hasattr(collections.abc, 'AsyncGenerator'):
    asyncgen_origin_types += (collections.abc.AsyncGenerator,)


def check_type(argname: str, value, expected_type, memo: Optional[_TypeCheckMemo] = None, *,
               globals: Optional[Dict[str, Any]] = None,
               locals: Optional[Dict[str, Any]] = None) -> None:
    """
    Ensure that ``value`` matches ``expected_type``.

    The types from the :mod:`typing` module do not support :func:`isinstance` or :func:`issubclass`
    so a number of type specific checks are required. This function knows which checker to call
    for which type.

    :param argname: name of the argument to check; used for error messages
    :param value: value to be checked against ``expected_type``
    :param expected_type: a class or generic type instance
    :param globals: dictionary of global variables to use for resolving forward references
        (defaults to the calling frame's globals)
    :param locals: dictionary of local variables to use for resolving forward references
        (defaults to the calling frame's locals)
    :raises TypeError: if there is a type mismatch

    """
    # Mock objects pass every check by design
    if expected_type is Any or isinstance(value, Mock):
        return

    if expected_type is None:
        # Only happens on < 3.6
        expected_type = type(None)

    if memo is None:
        # No memo supplied: capture the caller's frame so forward references
        # can be resolved against the caller's namespaces
        frame = sys._getframe(1)
        if globals is None:
            globals = frame.f_globals
        if locals is None:
            locals = frame.f_locals

        memo = _TypeCheckMemo(globals, locals)

    expected_type = resolve_forwardref(expected_type, memo)
    origin_type = getattr(expected_type, '__origin__', None)
    if origin_type is not None:
        # Parametrized generic: dispatch to the specialized checker, or fall
        # back to checking against the bare origin type
        checker_func = origin_type_checkers.get(origin_type)
        if checker_func:
            checker_func(argname, value, expected_type, memo)
        else:
            check_type(argname, value, origin_type, memo)
    elif isclass(expected_type):
        if issubclass(expected_type, Tuple):
            check_tuple(argname, value, expected_type, memo)
        elif issubclass(expected_type, (float, complex)):
            check_number(argname, value, expected_type)
        elif _subclass_check_unions and issubclass(expected_type, Union):
            check_union(argname, value, expected_type, memo)
        elif isinstance(expected_type, TypeVar):
            check_typevar(argname, value, expected_type, memo)
        elif issubclass(expected_type, IO):
            check_io(argname, value, expected_type)
        elif is_typeddict(expected_type):
            check_typed_dict(argname, value, expected_type, memo)
        elif getattr(expected_type, '_is_protocol', False):
            check_protocol(argname, value, expected_type)
        else:
            expected_type = (getattr(expected_type, '__extra__', None) or origin_type or
                             expected_type)

            if expected_type is bytes:
                # As per https://github.com/python/typing/issues/552
                if not isinstance(value, (bytearray, bytes, memoryview)):
                    raise TypeError('type of {} must be bytes-like; got {} instead'
                                    .format(argname, qualified_name(value)))
            elif not isinstance(value, expected_type):
                raise TypeError(
                    'type of {} must be {}; got {} instead'.
                    format(argname, qualified_name(expected_type), qualified_name(value)))
    elif isinstance(expected_type, TypeVar):
        # Only happens on < 3.6
        check_typevar(argname, value, expected_type, memo)
    elif isinstance(expected_type, Literal.__class__):
        # Only happens on < 3.7 when using Literal from typing_extensions
        check_literal(argname, value, expected_type, memo)
    elif expected_type.__class__ is NewType:
        # typing.NewType on Python 3.10+
        return check_type(argname, value, expected_type.__supertype__, memo)
    elif (isfunction(expected_type) and
          getattr(expected_type, "__module__", None) == "typing" and
          getattr(expected_type, "__qualname__", None).startswith("NewType.") and
          hasattr(expected_type, "__supertype__")):
        # typing.NewType on Python 3.9 and below
        return check_type(argname, value, expected_type.__supertype__, memo)


def check_return_type(retval, memo: Optional[_CallMemo] = None) -> bool:
    """
    Check that the return value is compatible with the return value annotation in the function.

    :param retval: the value about to be returned from the call
    :return: ``True``
    :raises TypeError: if there is a type mismatch

    """
    if memo is None:
        # faster than inspect.currentframe(), but not officially
        # supported in all python implementations
        frame = sys._getframe(1)

        try:
            func = find_function(frame)
        except LookupError:
            return True  # This can happen with the Pydev/PyCharm debugger extension installed

        memo = _CallMemo(func, frame.f_locals)

    if 'return' in memo.type_hints:
        if memo.type_hints['return'] is NoReturn:
            raise TypeError('{}() was declared never to return but it did'.format(memo.func_name))

        try:
            check_type('the return value', retval, memo.type_hints['return'], memo)
        except TypeError as exc:  # suppress unnecessarily long tracebacks
            # Allow NotImplemented if this is a binary magic method (__eq__() et al)
            if retval is NotImplemented and memo.type_hints['return'] is bool:
                # This does (and cannot) not check if it's actually a method
                func_name = memo.func_name.rsplit('.', 1)[-1]
                if len(memo.arguments) == 2 and func_name in BINARY_MAGIC_METHODS:
                    return True

            raise TypeError(*exc.args) from None

    return True


def check_argument_types(memo: Optional[_CallMemo] = None) -> bool:
    """
    Check that the argument values match the annotated types.

    Unless both ``args`` and ``kwargs`` are provided, the information will be retrieved from
    the previous stack frame (ie. from the function that called this).

    :return: ``True``
    :raises TypeError: if there is an argument type mismatch

    """
    if memo is None:
        # faster than inspect.currentframe(), but not officially
        # supported in all python implementations
        frame = sys._getframe(1)

        try:
            func = find_function(frame)
        except LookupError:
            return True  # This can happen with the Pydev/PyCharm debugger extension installed

        memo = _CallMemo(func, frame.f_locals)

    for argname, expected_type in memo.type_hints.items():
        if argname != 'return' and argname in memo.arguments:
            value = memo.arguments[argname]
            description = 'argument "{}"'.format(argname)
            try:
                check_type(description, value, expected_type, memo)
            except TypeError as exc:  # suppress unnecessarily long tracebacks
                raise TypeError(*exc.args) from None

    return True


# Wraps a generator returned from a @typechecked function so that yielded,
# sent and returned values are checked against Generator[Y, S, R].
# Unknown attributes are delegated to the wrapped generator via __getattr__.
class TypeCheckedGenerator:
    def __init__(self, wrapped: Generator, memo: _CallMemo):
        rtype_args = []
        if hasattr(memo.type_hints['return'], "__args__"):
            rtype_args = memo.type_hints['return'].__args__

        self.__wrapped = wrapped
        self.__memo = memo
        # Unparametrized annotations degrade gracefully to Any
        self.__yield_type = rtype_args[0] if rtype_args else Any
        self.__send_type = rtype_args[1] if len(rtype_args) > 1 else Any
        self.__return_type = rtype_args[2] if len(rtype_args) > 2 else Any
        self.__initialized = False

    def __iter__(self):
        return self

    def __next__(self):
        return self.send(None)

    def __getattr__(self, name: str) -> Any:
        return getattr(self.__wrapped, name)

    def throw(self, *args):
        return self.__wrapped.throw(*args)

    def close(self):
        self.__wrapped.close()

    def send(self, obj):
        # The very first send() merely primes the generator (must be None),
        # so only subsequent sends are checked against the declared send type
        if self.__initialized:
            check_type('value sent to generator', obj, self.__send_type, memo=self.__memo)
        else:
            self.__initialized = True

        try:
            value = self.__wrapped.send(obj)
        except StopIteration as exc:
            # Generator finished: its return value rides on the exception
            check_type('return value', exc.value, self.__return_type, memo=self.__memo)
            raise

        check_type('value yielded from generator', value, self.__yield_type, memo=self.__memo)
        return value


# Async counterpart of TypeCheckedGenerator for AsyncGenerator[Y, S]
# (async generators have no return value, so only yield/send are checked).
class TypeCheckedAsyncGenerator:
    def __init__(self, wrapped: AsyncGenerator, memo: _CallMemo):
        rtype_args = memo.type_hints['return'].__args__
        self.__wrapped = wrapped
        self.__memo = memo
        self.__yield_type = rtype_args[0]
        self.__send_type = rtype_args[1] if len(rtype_args) > 1 else Any
        self.__initialized = False

    def __aiter__(self):
        return self

    def __anext__(self):
        return self.asend(None)

    def __getattr__(self, name: str) -> Any:
        return getattr(self.__wrapped, name)

    def athrow(self, *args):
        return self.__wrapped.athrow(*args)

    def aclose(self):
        return self.__wrapped.aclose()

    async def asend(self, obj):
        if self.__initialized:
            check_type('value sent to generator', obj, self.__send_type, memo=self.__memo)
        else:
            self.__initialized = True

        value = await self.__wrapped.asend(obj)
        check_type('value yielded from generator', value, self.__yield_type, memo=self.__memo)
        return value


@overload
def typechecked(*, always: bool = False) -> Callable[[T_CallableOrType], T_CallableOrType]:
    ...


@overload
def typechecked(func: T_CallableOrType, *, always: bool = False) -> T_CallableOrType:
    ...


def typechecked(func=None, *, always=False, _localns: Optional[Dict[str, Any]] = None):
    """
    Perform runtime type checking on the arguments that are passed to the wrapped function.

    The return value is also checked against the return annotation if any.

    If the ``__debug__`` global variable is set to ``False``, no wrapping and therefore no type
    checking is done, unless ``always`` is ``True``.

    This can also be used as a class decorator. This will wrap all type annotated methods,
    including ``@classmethod``, ``@staticmethod``, and ``@property`` decorated methods,
    in the class with the ``@typechecked`` decorator.

    :param func: the function or class to enable type checking for
    :param always: ``True`` to enable type checks even in optimized mode

    """
    # Called as @typechecked(...) with arguments: return the real decorator
    if func is None:
        return partial(typechecked, always=always, _localns=_localns)

    if not __debug__ and not always:  # pragma: no cover
        return func

    if isclass(func):
        # Class decoration path: recursively wrap annotated methods,
        # classmethods/staticmethods (rewrapping preserves their descriptor
        # type) and property accessors defined directly on this class.
        prefix = func.__qualname__ + '.'
        for key, attr in func.__dict__.items():
            if inspect.isfunction(attr) or inspect.ismethod(attr) or inspect.isclass(attr):
                if attr.__qualname__.startswith(prefix) and getattr(attr, '__annotations__', None):
                    setattr(func, key, typechecked(attr, always=always, _localns=func.__dict__))
            elif isinstance(attr, (classmethod, staticmethod)):
                if getattr(attr.__func__, '__annotations__', None):
                    wrapped = typechecked(attr.__func__, always=always, _localns=func.__dict__)
                    setattr(func, key, type(attr)(wrapped))
            elif isinstance(attr, property):
                kwargs = dict(doc=attr.__doc__)
                for name in ("fset", "fget", "fdel"):
                    property_func = kwargs[name] = getattr(attr, name)
                    if property_func is not None and getattr(property_func, '__annotations__', ()):
                        kwargs[name] = typechecked(
                            property_func, always=always, _localns=func.__dict__
                        )

                setattr(func, key, attr.__class__(**kwargs))

        return func

    if not getattr(func, '__annotations__', None):
        warn('no type annotations present -- not typechecking {}'.format(function_name(func)))
        return func

    # Find the frame in which the function was declared, for resolving forward references later
    if _localns is None:
        _localns = sys._getframe(1).f_locals

    # Find either the first Python wrapper or the actual function
    python_func = inspect.unwrap(func, stop=lambda f: hasattr(f, '__code__'))

    if not getattr(python_func, '__code__', None):
        warn('no code associated -- not typechecking {}'.format(function_name(func)))
        return func

    def wrapper(*args, **kwargs):
        # Synchronous wrapper: check arguments, call, then check the return value
        memo = _CallMemo(python_func, _localns, args=args, kwargs=kwargs)
        check_argument_types(memo)
        retval = func(*args, **kwargs)
        try:
            check_return_type(retval, memo)
        except TypeError as exc:
            raise TypeError(*exc.args) from None

        # If a generator is returned, wrap it if its yield/send/return types can be checked
        if inspect.isgenerator(retval) or isasyncgen(retval):
            return_type = memo.type_hints.get('return')
            if return_type:
                origin = getattr(return_type, '__origin__', None)
                if origin in generator_origin_types:
                    return TypeCheckedGenerator(retval, memo)
                elif origin is not None and origin in asyncgen_origin_types:
                    return TypeCheckedAsyncGenerator(retval, memo)

        return retval

    async def async_wrapper(*args, **kwargs):
        # Coroutine wrapper: same as wrapper() but awaits the call
        memo = _CallMemo(python_func, _localns, args=args, kwargs=kwargs)
        check_argument_types(memo)
        retval = await func(*args, **kwargs)
        check_return_type(retval, memo)
        return retval

    # Comparing __code__ objects guards against double-wrapping the same target
    if inspect.iscoroutinefunction(func):
        if python_func.__code__ is not async_wrapper.__code__:
            return wraps(func)(async_wrapper)
    else:
        if python_func.__code__ is not wrapper.__code__:
            return wraps(func)(wrapper)

    # the target callable was already wrapped
    return func


class TypeWarning(UserWarning):
    """
    A warning that is emitted when a type check fails.

    :ivar str event: ``call`` or ``return``
    :ivar Callable func: the function in which the violation occurred (the called function if event
        is ``call``, or the function where a value of the wrong type was returned from if event is
        ``return``)
    :ivar str error: the error message contained by the caught :class:`TypeError`
    :ivar frame: the frame in which the violation occurred
    """

    __slots__ = ('func', 'event', 'message', 'frame')

    def __init__(self, memo: Optional[_CallMemo], event: str, frame,
                 exception: Union[str, TypeError]):  # pragma: no cover
        self.func = memo.func
        self.event = event
        self.error = str(exception)
        self.frame = frame

        # Build a human-readable location string for the warning message
        if self.event == 'call':
            caller_frame = self.frame.f_back
            event = 'call to {}() from {}:{}'.format(
                function_name(self.func), caller_frame.f_code.co_filename, caller_frame.f_lineno)
        else:
            event = 'return from {}() at {}:{}'.format(
                function_name(self.func), self.frame.f_code.co_filename, self.frame.f_lineno)

        super().__init__('[{thread_name}] {event}: {self.error}'.format(
            thread_name=threading.current_thread().name, event=event, self=self))

    @property
    def stack(self):
        """Return the stack where the last frame is from the target function."""
        return extract_stack(self.frame)

    def print_stack(self, file: TextIO = None, limit: int = None) -> None:
        """
        Print the traceback from the stack frame where the target function was run.

        :param file: an open file to print to (prints to stdout if omitted)
        :param limit: the maximum number of stack frames to print

        """
        print_stack(self.frame, limit, file)


class TypeChecker:
    """
    A type checker that collects type violations by hooking into :func:`sys.setprofile`.

    :param packages: list of top level modules and packages or modules to include for type checking
    :param all_threads: ``True`` to check types in all threads created while the checker is
        running, ``False`` to only check in the current one
    :param forward_refs_policy: how to handle unresolvable forward references in annotations

    .. deprecated:: 2.6
       Use :func:`~.importhook.install_import_hook` instead. This class will be removed in v3.0.
    """

    def __init__(self, packages: Union[str, Sequence[str]], *, all_threads: bool = True,
                 forward_refs_policy: ForwardRefPolicy = ForwardRefPolicy.ERROR):
        assert check_argument_types()
        warn('TypeChecker has been deprecated and will be removed in v3.0. '
             'Use install_import_hook() or the pytest plugin instead.', DeprecationWarning)
        self.all_threads = all_threads
        self.annotation_policy = forward_refs_policy
        self._call_memos = {}  # type: Dict[Any, _CallMemo]
        self._previous_profiler = None
        self._previous_thread_profiler = None
        self._active = False

        if isinstance(packages, str):
            self._packages = (packages,)
        else:
            self._packages = tuple(packages)

    @property
    def active(self) -> bool:
        """Return ``True`` if currently collecting type violations."""
        return self._active

    def should_check_type(self, func: Callable) -> bool:
        # Decide whether a profiled function falls within the monitored packages
        if not func.__annotations__:
            # No point in checking if there are no type hints
            return False
        elif isasyncgenfunction(func):
            # Async generators cannot be supported because the return arg is of an opaque builtin
            # type (async_generator_wrapped_value)
            return False
        else:
            # Check types if the module matches any of the package prefixes
            return any(func.__module__ == package or func.__module__.startswith(package + '.')
                       for package in self._packages)

    def start(self):
        if self._active:
            raise RuntimeError('type checker already running')

        self._active = True

        # Install this instance as the current profiler
        self._previous_profiler = sys.getprofile()
        sys.setprofile(self)

        # If requested, set this instance as the default profiler for all future threads
        # (does not affect existing threads)
        if self.all_threads:
            self._previous_thread_profiler = threading._profile_hook
            threading.setprofile(self)

    def stop(self):
        # Restore the previous profiler hooks only if they are still ours
        if self._active:
            if sys.getprofile() is self:
                sys.setprofile(self._previous_profiler)
            else:  # pragma: no cover
                warn('the system profiling hook has changed unexpectedly')

            if self.all_threads:
                if threading._profile_hook is self:
                    threading.setprofile(self._previous_thread_profiler)
                else:  # pragma: no cover
                    warn('the threading profiling hook has changed unexpectedly')

            self._active = False

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()

    def __call__(self, frame, event: str, arg) -> None:  # pragma: no cover
        # Profiler callback: inspects 'call' and 'return' events, chaining to
        # any previously installed profiler so its results stay usable
        if not self._active:
            # This happens if all_threads was enabled and a thread was created when the checker was
            # running but was then stopped. The thread's profiler callback can't be reset any other
            # way but this.
            sys.setprofile(self._previous_thread_profiler)
            return

        # If an actual profiler is running, don't include the type checking times in its results
        if event == 'call':
            try:
                func = find_function(frame)
            except Exception:
                func = None

            if func is not None and self.should_check_type(func):
                memo = self._call_memos[frame] = _CallMemo(
                    func, frame.f_locals, forward_refs_policy=self.annotation_policy)
                if memo.is_generator:
                    return_type_hint = memo.type_hints['return']
                    if return_type_hint is not None:
                        origin = getattr(return_type_hint, '__origin__', None)
                        if origin in generator_origin_types:
                            # Check the types of the yielded values
                            memo.type_hints['return'] = return_type_hint.__args__[0]
                else:
                    try:
                        check_argument_types(memo)
                    except TypeError as exc:
                        warn(TypeWarning(memo, event, frame, exc))

            if self._previous_profiler is not None:
                self._previous_profiler(frame, event, arg)
        elif event == 'return':
            if self._previous_profiler is not None:
                self._previous_profiler(frame, event, arg)

            if arg is None:
                # a None return value might mean an exception is being raised but we have no way of
                # checking
                return

            memo = self._call_memos.get(frame)
            if memo is not None:
                try:
                    if memo.is_generator:
                        check_type('yielded value', arg, memo.type_hints['return'], memo)
                    else:
                        check_return_type(arg, memo)
                except TypeError as exc:
                    warn(TypeWarning(memo, event, frame, exc))

                # Generator frames return repeatedly (one per yield), so their
                # memos are kept until the frame is done
                if not memo.is_generator:
                    del self._call_memos[frame]
        elif self._previous_profiler is not None:
            self._previous_profiler(frame, event, arg)