├── .github └── workflows │ └── ci-test.yaml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── code.rst │ ├── conf.py │ ├── index.rst │ ├── logo-1.png │ ├── logo-1.svg │ ├── logo-2-dark.svg │ ├── logo-2-gray.svg │ ├── logo-2-white.svg │ ├── logo-2.png │ └── logo-2.svg ├── requirements.txt ├── setup.py ├── tests ├── __init__.py ├── test_pandas_typedframe.py └── test_polars_typedframe.py └── typedframe ├── __init__.py ├── base.py ├── pandas_.py └── polars_.py /.github/workflows/ci-test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | test_pandas: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.9"] 13 | pandas-version: ["1.2", "1.3", "1.4"] 14 | numpy-version: ["1.20", "1.21", "1.22"] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install numpy==${{ matrix.numpy-version }} 26 | python -m pip install pandas==${{ matrix.pandas-version }} 27 | python -m pip install flake8 pytest==6.2.4 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | - name: Test with pytest 35 | run: | 36 | pytest tests/test_pandas_typedframe.py 37 | 38 | test_polars: 39 | 40 | runs-on: ubuntu-latest 41 | strategy: 42 | fail-fast: false 43 | matrix: 44 | python-version: [ "3.9" ] 45 | polars-version: [ "0.15"] 46 | pyarrow-version: [ "10.0" ] 47 | 48 | steps: 49 | - uses: actions/checkout@v2 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v2 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | python -m pip install --upgrade pip 57 | python -m pip install polars==${{ matrix.polars-version }} 58 | python -m pip install pyarrow==${{ matrix.pyarrow-version }} 59 | python -m pip install pytest==6.2.4 60 | - name: Test with pytest 61 | run: | 62 | pytest tests/test_polars_typedframe.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # VS Code 105 | .vscode 106 | 107 | # Idea 108 | .idea 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # Mac 138 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Alexander Reshytko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![logo](./docs/source/logo-2-dark.svg#gh-dark-mode-only)![logo](./docs/source/logo-2.svg#gh-light-mode-only) 3 | 4 | --- 5 | 6 | # typedframe 7 | 8 | **Typed wrappers over pandas DataFrames with schema validation.** 9 | 10 | [![Tests](https://github.com/areshytko/typedframe/actions/workflows/ci-test.yaml/badge.svg)](https://github.com/areshytko/typedframe/actions/workflows/ci-test.yaml) 11 | 12 | `TypedDataFrame` is a lightweight wrapper over pandas `DataFrame` that provides runtime schema validation and can be used to establish strong data contracts between interfaces in your Python code. 13 | 14 | The goal of the library is to reveal and make explicit all unclear or forgotten assumptions about your DataFrame. 15 | 16 | Check [the Official Documentation](https://typedframe.readthedocs.io/en/latest/). 17 | 18 | ### Quickstart 19 | 20 | Install the typedframe library: 21 | ``` 22 | pip install typedframe 23 | ``` 24 | Assume an overly simplified preprocessing function like this: 25 | ```python 26 | def preprocess(df: pd.DataFrame) -> pd.DataFrame: 27 | df = df.copy() 28 | c1_min, c1_max = df['col1'].min(), df['col1'].max() 29 | df['col1'] = 0 if c1_min == c1_max else (df['col1'] - c1_min) / (c1_max - c1_min) 30 | df['month'] = df['date'].dt.month 31 | df['comment'] = df['comment'].str.lower() 32 | return df 33 | ``` 34 | To add `typedframe` schema support for this transformation we will define two schema classes - one for the input and one for the output: 35 | ```python 36 | import numpy as np 37 | from typedframe import TypedDataFrame, DATE_TIME_DTYPE 38 | 39 | class MyRawData(TypedDataFrame): 40 | schema = { 41 | 'col1': np.float64, 42 | 'date': DATE_TIME_DTYPE, 43 | 'comment': str, 44 | } 45 | 46 | 47 | class PreprocessedData(MyRawData): 48 | schema = { 49 | 'month': np.int8 50 | } 51 | ``` 52 | 53 | Then let's modify the `preprocess` function to take the typed wrapper `MyRawData` as input and return `PreprocessedData`: 54 | ```python 55 | def preprocess(data: MyRawData) -> PreprocessedData: 56 | df = data.df.copy() 57 | c1_min, c1_max = df['col1'].min(), df['col1'].max() 58 | df['col1'] = 0 if c1_min == c1_max else (df['col1'] - c1_min) / (c1_max - c1_min) 59 | df['month'] = df['date'].dt.month 60 | df['comment'] = df['comment'].str.lower() 61 | return PreprocessedData.convert(df) 62 | ``` 63 | 64 | As you can see, the actual DataFrame can be accessed via the `.df` attribute of the Typed DataFrame. 65 | 66 | Now clients of the `preprocess` function can easily see what the inputs and outputs are without looking at its internals. 67 | And if there are unforeseen changes in the data, an exception will be thrown before the actual function is invoked. 68 | 69 | Let's check: 70 | 71 | ```python 72 | import pandas as pd 73 | 74 | df = pd.DataFrame({ 75 | 'col1': [0.1, 0.2], 76 | 'date': ['2021-01-01', '2022-01-01'], 77 | 'comment': ['foo', 'bar'] 78 | }) 79 | df.date = pd.to_datetime(df.date) 80 | 81 | bad_df = pd.DataFrame({ 82 | 'col1': [1, 2], 83 | 'comment': ['foo', 'bar'] 84 | }) 85 | 86 | df2 = preprocess(MyRawData(df)) 87 | df3 = preprocess(MyRawData(bad_df)) 88 | ``` 89 | 90 | The first call was successful.
91 | But when we tried to pass a wrong dataframe as input, we got the following error: 92 | 93 | ``` 94 | AssertionError: Dataframe doesn't match schema 95 | Actual: {'col1': dtype('int64'), 'comment': dtype('O')} 96 | Expected: {'col1': <class 'numpy.float64'>, 'date': dtype('<M8[ns]'), 'comment': <class 'str'>} 97 | Difference: {('col1', <class 'numpy.float64'>), ('date', dtype('<M8[ns]'))} 98 | ``` -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS    ?= 7 | SPHINXBUILD   ?= sphinx-build 8 | SOURCEDIR     = source 9 | BUILDDIR      = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-rtd-theme==1.0.0 2 | sphinx==4.4.0 -------------------------------------------------------------------------------- /docs/source/code.rst: -------------------------------------------------------------------------------- 1 | 2 | Docstrings 3 | ============ 4 | 5 | .. automodule:: typedframe.base 6 | :members: 7 | 8 | .. automodule:: typedframe.pandas_ 9 | :members: 10 | 11 | .. automodule:: typedframe.polars_ 12 | :members: -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath(os.path.join('..', '..'))) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Typedframe' 21 | copyright = '2022, Alexander Reshytko' 22 | author = 'Alexander Reshytko' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon' 31 | ] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ['_templates'] 35 | 36 | # List of patterns, relative to source directory, that match files and 37 | # directories to ignore when looking for source files. 38 | # This pattern also affects html_static_path and html_extra_path. 39 | exclude_patterns = [] 40 | 41 | 42 | # -- Options for HTML output ------------------------------------------------- 43 | 44 | # The theme to use for HTML and HTML Help pages. See the documentation for 45 | # a list of builtin themes.
46 | # 47 | html_theme = 'sphinx_rtd_theme' 48 | 49 | # Add any paths that contain custom static files (such as style sheets) here, 50 | # relative to this directory. They are copied after the builtin static files, 51 | # so a file named "default.css" will overwrite the builtin "default.css". 52 | html_static_path = ['_static'] 53 | 54 | html_logo = "logo-2-white.svg" 55 | html_theme_options = { 56 | 'logo_only': True, 57 | 'display_version': False, 58 | } -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Typedframe documentation master file, created by 2 | sphinx-quickstart on Sat Mar 5 19:33:48 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: logo-2.svg 7 | :alt: logo 8 | 9 | ---- 10 | 11 | Typedframe 12 | ============ 13 | 14 | **Typed wrappers over pandas DataFrames with schema validation.** 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :caption: Contents: 19 | 20 | ``TypedDataFrame`` is a lightweight wrapper over pandas ``DataFrame`` 21 | that provides runtime schema validation and can be used to establish 22 | strong data contracts between interfaces in your Python code. 23 | 24 | The goal of the library is to reveal and make explicit all unclear or 25 | forgotten assumptions about your DataFrame. 26 | 27 | Quickstart 28 | ~~~~~~~~~~ 29 | 30 | Install the typedframe library: 31 | 32 | :: 33 | 34 | pip install typedframe 35 | 36 | Assume an overly simplified preprocessing function like this: 37 | 38 | .. code:: python 39 | 40 | def preprocess(df: pd.DataFrame) -> pd.DataFrame: 41 | df = df.copy() 42 | c1_min, c1_max = df['col1'].min(), df['col1'].max() 43 | df['col1'] = 0 if c1_min == c1_max else (df['col1'] - c1_min) / (c1_max - c1_min) 44 | df['month'] = df['date'].dt.month 45 | df['comment'] = df['comment'].str.lower() 46 | return df 47 | 48 | To add ``typedframe`` schema support for this transformation we will 49 | define two schema classes - one for the input and one for the output: 50 | 51 | .. code:: python 52 | 53 | import numpy as np 54 | from typedframe import TypedDataFrame, DATE_TIME_DTYPE 55 | 56 | class MyRawData(TypedDataFrame): 57 | schema = { 58 | 'col1': np.float64, 59 | 'date': DATE_TIME_DTYPE, 60 | 'comment': str, 61 | } 62 | 63 | 64 | class PreprocessedData(MyRawData): 65 | schema = { 66 | 'month': np.int8 67 | } 68 | 69 | Then let's modify the ``preprocess`` function to take the typed wrapper 70 | ``MyRawData`` as input and return ``PreprocessedData``: 71 | 72 | .. code:: python 73 | 74 | def preprocess(data: MyRawData) -> PreprocessedData: 75 | df = data.df.copy() 76 | c1_min, c1_max = df['col1'].min(), df['col1'].max() 77 | df['col1'] = 0 if c1_min == c1_max else (df['col1'] - c1_min) / (c1_max - c1_min) 78 | df['month'] = df['date'].dt.month 79 | df['comment'] = df['comment'].str.lower() 80 | return PreprocessedData.convert(df) 81 | 82 | As you can see, the actual DataFrame can be accessed via the ``.df`` 83 | attribute of the Typed DataFrame. 84 | 85 | Now clients of the ``preprocess`` function can easily see what the 86 | inputs and outputs are without looking at its internals. And if 87 | there are unforeseen changes in the data, an exception will be thrown 88 | before the actual function is invoked. 89 | 90 | Let's check: 91 | 92 | ..
code:: python 93 | 94 | import pandas as pd 95 | 96 | df = pd.DataFrame({ 97 | 'col1': [0.1, 0.2], 98 | 'date': ['2021-01-01', '2022-01-01'], 99 | 'comment': ['foo', 'bar'] 100 | }) 101 | df.date = pd.to_datetime(df.date) 102 | 103 | bad_df = pd.DataFrame({ 104 | 'col1': [1, 2], 105 | 'comment': ['foo', 'bar'] 106 | }) 107 | 108 | df2 = preprocess(MyRawData(df)) 109 | df3 = preprocess(MyRawData(bad_df)) 110 | 111 | The first call was successful. But when we tried to pass a wrong 112 | dataframe as input, we got the following error: 113 | 114 | :: 115 | 116 | AssertionError: Dataframe doesn't match schema 117 | Actual: {'col1': dtype('int64'), 'comment': dtype('O')} 118 | Expected: {'col1': <class 'numpy.float64'>, 'date': dtype('<M8[ns]'), 'comment': <class 'str'>} 119 | Difference: {('col1', <class 'numpy.float64'>), ('date', dtype('<M8[ns]'))} 120 | 121 | Motivation 122 | ~~~~~~~~~~ 123 | 124 | Consider a function with the following signature: 125 | 126 | .. code:: python 127 | 128 | def preprocess(df: pd.DataFrame) -> pd.DataFrame: 129 | 130 | Even when we have added type hints to our function, the user doesn't 131 | really know how to use it. They must dig inside the code of the 132 | function to find out things like expected columns and their types. This 133 | violates one of the core software development principles - 134 | encapsulation. 135 | 136 | Pandas DataFrame is an open data type. It introduces a lot of implicit 137 | assumptions about the data. Let's explore some examples where one can 138 | easily overlook these implicit assumptions: 139 | 140 | Required columns and data types: 141 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 142 | 143 | .. code:: python 144 | 145 | df.groupby('state')['income'].mean() 146 | 147 | The dataframe is expected to have ``state`` and ``income`` columns. The 148 | ``income`` column must have a numeric type. 149 | 150 | Index name and type 151 | ^^^^^^^^^^^^^^^^^^^ 152 | 153 | .. code:: python 154 | 155 | df.reset_index(inplace=True) 156 | x = df['my_index'] 157 | 158 | It is expected that the dataframe has an index with the name 159 | ``my_index``. 160 | 161 | Categorical columns 162 | ^^^^^^^^^^^^^^^^^^^ 163 | 164 | .. code:: python 165 | 166 | df3 = pd.merge(df1, df2, on='categorical_col') 167 | 168 | The result above will differ based on whether ``categorical_col`` in 169 | ``df1`` and ``df2`` has exactly the same set of categories or not. 170 | 171 | All these scenarios can lead to a variety of subtle bugs in our 172 | pipeline. 173 | 174 | The concept of Typed DataFrame 175 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 176 | 177 | A Typed DataFrame is a minimalistic wrapper on top of your pandas 178 | DataFrame. You create it by creating a subclass of ``TypedDataFrame`` 179 | and defining a ``schema`` static variable. Then you can wrap your 180 | DataFrame in it by passing it to your Typed DataFrame constructor. The 181 | constructor will perform runtime schema validation, and the original 182 | dataframe can be accessed through the ``df`` attribute of the wrapper. 183 | 184 | This wrapper serves two purposes: 185 | 186 | - Formal explicit documentation about dataframe assumptions. 187 | You can use your Typed DataFrame schema definition as a form 188 | of documentation to communicate your data interfaces to others. 189 | This works especially well in combination with Python type hints. 190 | 191 | - Runtime schema validation. In case of any data contract violation 192 | you'll get an exception explaining the exact reason. If you guard 193 | your pipeline with such Typed DataFrames you'll be able to catch 194 | errors early - closer to the root causes.
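For example, a minimal sketch of wrapping a frame (reusing the ``MyRawData``
schema from the Quickstart above; the column values here are arbitrary):

.. code:: python

    import pandas as pd

    df = pd.DataFrame({
        'col1': [0.1, 0.2],
        'date': pd.to_datetime(['2021-01-01', '2022-01-01']),
        'comment': ['foo', 'bar']
    })

    data = MyRawData(df)   # runtime schema validation happens here
    assert data.df is df   # the original DataFrame is accessible via .df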
195 | 196 | Features 197 | ~~~~~~~~ 198 | 199 | Required Schema 200 | ^^^^^^^^^^^^^^^ 201 | 202 | You can define the required schema by passing a dictionary to a static 203 | variable ``schema`` of a ``TypedDataFrame`` subclass. The dictionary defines 204 | the mapping from a column name to a dtype: 205 | 206 | .. code:: python 207 | 208 | class MyTable(TypedDataFrame): 209 | schema = { 210 | "col1": str, 211 | "col2": np.int32, 212 | "col3": ('foo', 'bar') 213 | } 214 | 215 | Schema Inheritance 216 | ^^^^^^^^^^^^^^^^^^ 217 | 218 | You can inherit one Typed DataFrame from another one. 219 | 220 | The semantics of the inheritance relation is the same as with class 221 | methods and attributes in classic OOP. I.e. if Typed DataFrame A is a 222 | subclass of Typed DataFrame B, all the schema requirements for B must 223 | also hold for A. In case of any conflicts, the schema defined in A 224 | takes precedence. 225 | 226 | .. code:: python 227 | 228 | class MyDataFrame(TypedDataFrame): 229 | schema = { 230 | 'int_field': np.int16, 231 | 'float_field': np.float64, 232 | 'bool_field': bool, 233 | 'str_field': str, 234 | 'obj_field': object 235 | } 236 | 237 | 238 | class InheritedDataFrame(MyDataFrame): 239 | schema = { 240 | 'new_field': np.int64 241 | } 242 | 243 | Multiple Inheritance 244 | '''''''''''''''''''' 245 | 246 | Multiple inheritance is allowed. It has "union" semantics. 247 | 248 | .. code:: python 249 | 250 | class Root(TypedDataFrame): 251 | 252 | schema = { 253 | 'root': bool 254 | } 255 | 256 | 257 | class Left(Root): 258 | schema = { 259 | 'left': bool 260 | } 261 | 262 | 263 | class Right(Root): 264 | schema = { 265 | 'root': object, 266 | 'right': bool 267 | } 268 | 269 | 270 | class Down(Left, Right): 271 | pass 272 | 273 | Index Schema 274 | ^^^^^^^^^^^^ 275 | 276 | You can specify a schema for the index of the DataFrame. It's defined as a 277 | tuple of a name and a dtype which you assign to an ``index_schema`` 278 | static variable: 279 | 280 | .. code:: python 281 | 282 | class IndexDataFrame(TypedDataFrame): 283 | schema = { 284 | 'foo': bool 285 | } 286 | 287 | index_schema = ('bar', np.int32) 288 | 289 | Optional Schema 290 | ^^^^^^^^^^^^^^^ 291 | 292 | You can specify optional columns in a schema definition. Optional column 293 | types will be checked only if present in a DataFrame. If some 294 | optional columns (or even all of them) are missing, no validation error 295 | will be raised. Besides that, the ``convert`` method will add all optional 296 | columns that are missing in a dataframe, filled with NaN values. 297 | 298 | .. code:: python 299 | 300 | class DataFrameWithOptional(TypedDataFrame): 301 | schema = { 302 | 'required': bool 303 | } 304 | optional = { 305 | 'optional': bool 306 | } 307 | 308 | Convert Method 309 | ^^^^^^^^^^^^^^ 310 | 311 | ``TypedDataFrame`` provides a convenient ``convert`` classmethod that 312 | tries to convert a given DataFrame to be compliant with a schema. 313 | 314 | .. code:: python 315 | 316 | class IndexDataFrame(TypedDataFrame): 317 | schema = { 318 | 'foo': bool 319 | } 320 | 321 | index_schema = ('bar', DATE_TIME_DTYPE) 322 | 323 | df = pd.DataFrame({'foo': [True, False]}, 324 | index=pd.Series(['2021-06-03', '2021-05-31'])) 325 | data = IndexDataFrame.convert(df) 326 | 327 | Supported types 328 | ~~~~~~~~~~~~~~~ 329 | 330 | Integers 331 | ^^^^^^^^ 332 | 333 | ``np.int16``, ``np.int32``, ``np.int64``, etc. 334 | 335 | Floats 336 | ^^^^^^ 337 | 338 | ``np.float16``, ``np.float32``, ``np.float64``, etc.
339 | 340 | Boolean 341 | ^^^^^^^ 342 | 343 | ``bool`` 344 | 345 | Python objects 346 | ^^^^^^^^^^^^^^ 347 | 348 | ``str``, ``dict``, ``list``, ``object`` 349 | 350 | WARNING: no actual check is performed for Python objects. They are all 351 | considered to be of the same type ``object``. 352 | 353 | Categorical 354 | ^^^^^^^^^^^ 355 | 356 | Categorical dtype is specified as a tuple of categories. To avoid common 357 | categorical pitfalls, categorical types are required to have an exact 358 | schema with all categories enumerated in the exact order. 359 | 360 | .. code:: python 361 | 362 | class MyTable(TypedDataFrame): 363 | schema = { 364 | "col": ('foo', 'bar') 365 | } 366 | 367 | df = pd.DataFrame({"col": ['foo', 'foo', 'bar']}) 368 | df.col = pd.Categorical(df.col, categories=('foo', 'bar'), ordered=True) 369 | data = MyTable(df) 370 | 371 | DateTime 372 | ^^^^^^^^ 373 | 374 | ``np.dtype('datetime64[ns]')`` 375 | 376 | The ``typedframe`` library also provides an alias for it: 377 | ``DATE_TIME_DTYPE`` 378 | 379 | UTC DateTime 380 | '''''''''''' 381 | 382 | ``pd.DatetimeTZDtype('ns', pytz.UTC)`` 383 | 384 | The ``typedframe`` library also provides an alias for it: 385 | ``UTC_DATE_TIME_DTYPE`` 386 | 387 | Best practices to use Typed DataFrame 388 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 389 | 390 | What are the best places to use Typed DataFrame wrappers in your 391 | codebase? 392 | 393 | Our experience with the ``typedframe`` library in a number of projects has 394 | shown the following scenarios where its use was most justified: 395 | 396 | Team Borders 397 | ^^^^^^^^^^^^ 398 | 399 | Typed DataFrames help to establish data contracts between teams. They also 400 | help to spot errors caused by miscommunication or inconsistent 401 | system evolution early. Whenever some dataset is passed between 402 | teams it makes sense to define a Typed DataFrame class with its 403 | specification. 404 | 405 | Public Functions and Methods 406 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 407 | 408 | Typed DataFrames work especially well in combination with Python type 409 | hints. So a good place to use them is a public function or 410 | method that takes or returns a pandas DataFrame. 411 | 412 | Sources and Sinks of Data Pipelines 413 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 414 | 415 | It is a good practice to provide schema definitions and runtime 416 | validation at the beginning and at the end of data pipelines, i.e. right 417 | after you read from external storage and before you write to it. 418 | This is another place where Typed DataFrames can be used (see the sketch at the end of this page). 419 | 420 | Similar Projects 421 | ~~~~~~~~~~~~~~~~ 422 | 423 | - `Great Expectations <https://greatexpectations.io/>`__. It's a much 424 | more feature-rich library which allows data teams to do a lot of 425 | assertions about the data. ``typedframe`` is a more lightweight 426 | library which can be considered as a thin extension layer on top of 427 | pandas DataFrame. 428 | 429 | - `Marshmallow <https://marshmallow.readthedocs.io/>`__. A library for 430 | Python object serialization and deserialization with schema 431 | validation. It's not integrated with pandas or numpy and focuses only 432 | on Python classes and builtin objects.
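A minimal sketch of the source/sink guard described above (the file names
and the ``MyRawData`` / ``PreprocessedData`` schemas from the Quickstart are
illustrative, not part of the library):

.. code:: python

    import pandas as pd

    # validate right after reading from external storage ...
    raw = MyRawData.convert(pd.read_csv('input.csv', parse_dates=['date']))

    # ... run the pipeline on validated data ...
    result = preprocess(raw)

    # ... and the output is validated by its own schema before writing
    result.df.to_csv('output.csv', index=False)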
433 | 434 | 435 | 436 | Indices and tables 437 | ================== 438 | 439 | * :ref:`genindex` 440 | * :ref:`modindex` 441 | * :ref:`search` 442 | -------------------------------------------------------------------------------- /docs/source/logo-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/areshytko/typedframe/dcb5daed3f84296247633840b61d6cd217704b56/docs/source/logo-1.png -------------------------------------------------------------------------------- /docs/source/logo-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/source/logo-2-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/source/logo-2-gray.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/source/logo-2-white.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/source/logo-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/areshytko/typedframe/dcb5daed3f84296247633840b61d6cd217704b56/docs/source/logo-2.png -------------------------------------------------------------------------------- /docs/source/logo-2.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | polars 4 | pyarrow 5 | pytest==6.2.4 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | HERE = pathlib.Path(__file__).parent 5 | README = (HERE / "README.md").read_text() 6 | 7 | 8 | setup( 9 | name="typedframe", 10 | version='0.11.0', 11 | description="Typed Wrappers over Pandas and Polars DataFrames with schema validation", 12 | long_description=README, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/areshytko/typedframe", 15 | author="Alexander Reshytko", 16 | author_email="alexander@reshytko.com", 17 | license="MIT", 18 | classifiers=[ 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.8", 22 | ], 23 | packages=["typedframe"], 24 | install_requires=[ 25 | ], 26 | extras_require={ 27 | "pandas": ["pandas", "numpy"], 28 | "polars": ["polars", "pyarrow"], 29 | }, 30 | setup_requires=['pytest-runner'], 31 | tests_require=['pytest'] 32 | ) 
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/areshytko/typedframe/dcb5daed3f84296247633840b61d6cd217704b56/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_pandas_typedframe.py: -------------------------------------------------------------------------------- 1 | 2 | import abc 3 | import datetime 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import pytest 8 | 9 | from typedframe.pandas_ import UTC_DATE_TIME_DTYPE, DATE_TIME_DTYPE, PandasTypedFrame as TypedDataFrame 10 | 11 | 12 | class MyDataFrame(TypedDataFrame): 13 | schema = { 14 | 'int_field': np.int16, 15 | 'float_field': np.float64, 16 | 'bool_field': bool, 17 | 'str_field': str, 18 | 'obj_field': object 19 | } 20 | 21 | 22 | class InheritedDataFrame(MyDataFrame): 23 | schema = { 24 | 'new_field': np.int64 25 | } 26 | 27 | 28 | class DataFrameWithOptional(TypedDataFrame): 29 | schema = { 30 | 'required': bool 31 | } 32 | optional = { 33 | 'optional': bool 34 | } 35 | 36 | 37 | class IndexDataFrame(TypedDataFrame): 38 | schema = { 39 | 'foo': bool 40 | } 41 | 42 | index_schema = ('bar', DATE_TIME_DTYPE) 43 | 44 | 45 | class ChildIndexDataFrame(IndexDataFrame): 46 | pass 47 | 48 | 49 | class UTCDateTimeDataframe(TypedDataFrame): 50 | schema = { 51 | 'date_field': UTC_DATE_TIME_DTYPE 52 | } 53 | 54 | 55 | def test_utc_datetime_success_case(): 56 | df = pd.DataFrame({'date_field': [datetime.date.today(), datetime.date(2021, 5, 31)]}) 57 | df.date_field = pd.to_datetime(df.date_field, utc=True) 58 | _ = UTCDateTimeDataframe(df) 59 | 60 | 61 | def test_utc_datetime_error_case(): 62 | df = pd.DataFrame({'date_field': [datetime.date.today(), datetime.date(2021, 5, 31)]}) 63 | df.date_field = pd.to_datetime(df.date_field) 64 | with pytest.raises(AssertionError): 65 | _ = UTCDateTimeDataframe(df) 66 | 67 | 68 | def test_utc_datetime_convert_case(): 69 | df = pd.DataFrame({'date_field': [datetime.date.today(), datetime.date(2021, 5, 31)]}) 70 | _ = UTCDateTimeDataframe.convert(df) 71 | 72 | 73 | def test_index_success_case(): 74 | df = pd.DataFrame({'foo': [True, False]}) 75 | df.index = pd.to_datetime(pd.Series([datetime.date.today(), datetime.date(2021, 5, 31)], name='bar')) 76 | _ = IndexDataFrame(df) 77 | _ = ChildIndexDataFrame(df) 78 | 79 | 80 | def test_index_fail_case(): 81 | df = pd.DataFrame({'foo': [True, False]}) 82 | with pytest.raises(AssertionError): 83 | _ = IndexDataFrame(df) 84 | 85 | 86 | def test_index_convert_success_case(): 87 | df = pd.DataFrame({'foo': [True, False]}) 88 | df.index = pd.Series(['2021-06-03', '2021-05-31']) 89 | _ = IndexDataFrame.convert(df) 90 | 91 | 92 | def test_base_success_case(): 93 | df = pd.DataFrame({ 94 | 'int_field': np.int16([1, 2]), 95 | 'float_field': np.float64([0.1, 0.2]), 96 | 'bool_field': [True, False], 97 | 'str_field': ["one", "two"], 98 | 'obj_field': ["", ""], 99 | 'new_field': np.int64([10, 20]) 100 | }) 101 | _ = InheritedDataFrame(df) 102 | 103 | 104 | class CategoricalFrame(TypedDataFrame): 105 | schema = { 106 | 'col': ('foo', 'bar') 107 | } 108 | 109 | 110 | def test_categorical_success_1(): 111 | df = pd.DataFrame({'col': ['foo', 'foo', 'bar']}) 112 | df.col = pd.Categorical(df.col, categories=('foo', 'bar'), ordered=True) 113 | _ = CategoricalFrame(df) 114 | 115 | 116 | def test_categorical_success_2(): 117 | df = pd.DataFrame({'col': 
['foo', 'foo']}) 118 | df.col = pd.Categorical(df.col, categories=('foo', 'bar'), ordered=True) 119 | _ = CategoricalFrame(df) 120 | 121 | 122 | def test_categorical_failure_1(): 123 | df = pd.DataFrame({'col': ['foo', 'foo']}) 124 | df.col = pd.Categorical(df.col, categories=('foo', 'bar', 'buzz'), ordered=True) 125 | with pytest.raises(AssertionError): 126 | _ = CategoricalFrame(df) 127 | 128 | 129 | def test_categorical_failure_3(): 130 | df = pd.DataFrame({'col': ['foo', 'foo']}) 131 | with pytest.raises(AssertionError): 132 | _ = CategoricalFrame(df) 133 | 134 | 135 | def test_convert_categorical(): 136 | df = pd.DataFrame({'col': ['foo', 'foo']}) 137 | _ = CategoricalFrame.convert(df) 138 | 139 | 140 | def test_convert_optional(): 141 | df = pd.DataFrame({'required': [True]}) 142 | data = DataFrameWithOptional.convert(df, add_optional_cols=True) 143 | assert all(col in data.df.columns for col in DataFrameWithOptional.dtype().keys()) 144 | 145 | 146 | def test_convert_categorical_failure(): 147 | df = pd.DataFrame({'col': ['foo', 'buzz']}) 148 | with pytest.raises(AssertionError): 149 | _ = CategoricalFrame.convert(df) 150 | 151 | 152 | def test_categorical_with_nans_failure(): 153 | df = pd.DataFrame({'col': ['foo', 'buzz']}) 154 | df.col = pd.Categorical(df.col, categories=CategoricalFrame.schema['col'], ordered=True) 155 | with pytest.raises(AssertionError): 156 | _ = CategoricalFrame(df) 157 | 158 | 159 | class PingInterface(metaclass=abc.ABCMeta): 160 | 161 | @abc.abstractmethod 162 | def ping(self): 163 | pass 164 | 165 | 166 | class Parent(TypedDataFrame): 167 | schema = { 168 | 'foo': bool 169 | } 170 | 171 | 172 | class Child(Parent, PingInterface): 173 | 174 | schema = { 175 | 'bar': bool 176 | } 177 | 178 | def ping(self): 179 | print("ping") 180 | 181 | 182 | def test_multiple_inheritance_1_success(): 183 | _ = Child(pd.DataFrame({'foo': [True], 'bar': [False]})) 184 | 185 | 186 | def test_multiple_inheritance_1_failure(): 187 | with pytest.raises(AssertionError): 188 | _ = Child(pd.DataFrame({'bar': [False]})) 189 | 190 | 191 | class Root(TypedDataFrame): 192 | 193 | schema = { 194 | 'root': bool 195 | } 196 | 197 | 198 | class Left(Root): 199 | schema = { 200 | 'left': bool 201 | } 202 | 203 | 204 | class Right(Root): 205 | schema = { 206 | 'root': object, 207 | 'right': bool 208 | } 209 | 210 | 211 | class Down(Left, Right): 212 | pass 213 | 214 | 215 | def test_multiple_inheritance_2_success(): 216 | _ = Down(pd.DataFrame({'root': [True], 'left': [True], 'right': [True]})) 217 | 218 | 219 | def test_multiple_inheritance_2_failure(): 220 | with pytest.raises(AssertionError): 221 | _ = Down(pd.DataFrame({'root': [True], 'left': [True]})) 222 | 223 | 224 | def test_multiple_inheritance_2_failure_with_root_overwrite(): 225 | with pytest.raises(AssertionError): 226 | _ = Down(pd.DataFrame({'root': [True], 'left': [True], 'right': ['string']})) 227 | -------------------------------------------------------------------------------- /tests/test_polars_typedframe.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import polars as pl 4 | import pytest 5 | 6 | from typedframe.polars_ import PolarsTypedFrame as TypedDataFrame 7 | 8 | 9 | class ParentDataFrame(TypedDataFrame): 10 | schema = { 11 | 'int_field': pl.Int16, 12 | 'float_field': pl.Float64, 13 | 'bool_field': pl.Boolean, 14 | 'str_field': pl.Utf8, 15 | 'date_field': pl.Date, 16 | 'datetime_field': pl.Datetime 17 | } 18 | 19 | 20 | class 
MixinDataFrame(TypedDataFrame): 21 | schema = { 22 | 'mixin_field': pl.Int64 23 | } 24 | 25 | 26 | class ChildDataFrame(ParentDataFrame, MixinDataFrame): 27 | schema = { 28 | 'new_field': pl.Int64 29 | } 30 | 31 | 32 | class OptionalDataFrame(TypedDataFrame): 33 | schema = { 34 | 'required': pl.Boolean 35 | } 36 | optional = { 37 | 'optional': pl.Boolean 38 | } 39 | 40 | 41 | def test_base_success_case(): 42 | df = pl.DataFrame({'int_field': [1, 2, 3], 43 | 'float_field': [1.0, 2.0, 3.0], 44 | 'bool_field': [True, False, True], 45 | 'str_field': ['a', 'b', 'c'], 46 | 'date_field': [datetime.date(2021, 5, 31), datetime.date(2021, 6, 1), datetime.date(2021, 6, 2)], 47 | 'datetime_field': [datetime.datetime(2021, 5, 31, 12, 0, 0), datetime.datetime(2021, 6, 1, 12, 0, 0), datetime.datetime(2021, 6, 2, 12, 0, 0)], 48 | 'mixin_field': [1, 2, 3], 49 | 'new_field': [1, 2, 3]}) 50 | df = df.with_column(pl.col('int_field').cast(pl.Int16)) 51 | _ = ChildDataFrame(df) 52 | 53 | 54 | def test_base_error_case(): 55 | df = pl.DataFrame({'int_field': [1, 2, 3], 56 | 'float_field': [1.0, 2.0, 3.0], 57 | 'bool_field': [True, False, True], 58 | 'str_field': ['a', 'b', 'c'], 59 | 'new_field': [1, 2, 3]}) 60 | with pytest.raises(AssertionError): 61 | _ = ChildDataFrame(df) 62 | 63 | 64 | def test_convert_success_case(): 65 | df = pl.DataFrame({'int_field': [1, 2, 3], 66 | 'float_field': [1.0, 2.0, 3.0], 67 | 'bool_field': [True, False, True], 68 | 'str_field': ['a', 'b', 'c'], 69 | 'date_field': [datetime.date(2021, 5, 31), datetime.date(2021, 6, 1), datetime.date(2021, 6, 2)], 70 | 'datetime_field': [datetime.datetime(2021, 5, 31, 12, 0, 0), 71 | datetime.datetime(2021, 6, 1, 12, 0, 0), 72 | datetime.datetime(2021, 6, 2, 12, 0, 0)]}) 73 | _ = ParentDataFrame.convert(df) 74 | 75 | 76 | def test_convert_error_case(): 77 | df = pl.DataFrame({'int_field': [1, 2, 3], 78 | 'float_field': [1.0, 2.0, 3.0], 79 | 'bool_field': [True, False, True], 80 | 'str_field': ['a', 'b', 'c']}) 81 | with pytest.raises(AssertionError): 82 | _ = ParentDataFrame.convert(df) 83 | 84 | 85 | def test_optional_success_case(): 86 | df = pl.DataFrame({'required': [True, False, True]}) 87 | _ = OptionalDataFrame(df) 88 | 89 | 90 | def test_optional_success_case_2(): 91 | df = pl.DataFrame({'required': [True, False, True], 92 | 'optional': [True, False, True]}) 93 | _ = OptionalDataFrame(df) 94 | 95 | 96 | def test_optional_error_case(): 97 | df = pl.DataFrame({'required': [True, False, True], 98 | 'optional': [2, 3, 1]}) 99 | with pytest.raises(AssertionError): 100 | _ = OptionalDataFrame(df) 101 | 102 | 103 | def test_convert_optional(): 104 | df = pl.DataFrame({'required': [True]}) 105 | data = OptionalDataFrame.convert(df, add_optional_cols=True) 106 | assert all(col in data.df.columns for col in OptionalDataFrame.dtype().keys()) 107 | -------------------------------------------------------------------------------- /typedframe/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | import pandas 4 | except ImportError: 5 | pass 6 | else: 7 | from typedframe.pandas_ import PandasTypedFrame as TypedDataFrame, DATE_TIME_DTYPE, UTC_DATE_TIME_DTYPE 8 | 9 | try: 10 | import polars 11 | except ImportError: 12 | pass 13 | else: 14 | from typedframe.polars_ import PolarsTypedFrame 15 | 16 | __version__ = '0.11.0' 17 | -------------------------------------------------------------------------------- /typedframe/base.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Basic classes for typed wrappers over dataframes 3 | """ 4 | from abc import abstractmethod 5 | from itertools import chain 6 | from typing import Type, TypeVar, Any 7 | 8 | 9 | T = TypeVar("T", bound="TypedDataFrameBase") 10 | 11 | 12 | class TypedDataFrameBase: 13 | """ 14 | Wrapper class over DataFrame to provide explicit schema specs. 15 | 16 | Provide the expected dataframe schema in the schema static variable. 17 | Provide optional columns in the optional static variable. 18 | All columns from the optional schema that are missing in a dataframe will be added (filled with NaN values) by the convert method. 19 | 20 | Schemas can be inherited via Python class inheritance. The semantics is the following: 21 | all columns of the parent are also included in the child schema. 22 | """ 23 | 24 | schema = {} 25 | 26 | optional = {} 27 | 28 | @classmethod 29 | @abstractmethod 30 | def convert(cls: Type[T], df, add_optional_cols: bool = True) -> T: 31 | pass 32 | 33 | @classmethod 34 | def dtype(cls: Type[T], with_optional: bool = True) -> dict: 35 | """ 36 | Combines schema of a current class and all super classes 37 | """ 38 | return dict(chain(*(chain(cls.schema.items(), cls.optional.items()) 39 | if with_optional else cls.schema.items() 40 | for cls in cls.__mro__[:-1] if hasattr(cls, 'schema')))) 41 | 42 | @classmethod 43 | @abstractmethod 44 | def _extract_actual_dtypes(cls: Type[T], df) -> dict: 45 | pass 46 | 47 | @classmethod 48 | @abstractmethod 49 | def _normalize_actual_dtype(cls: Type[T], dtype: Any) -> Any: 50 | pass 51 | 52 | @classmethod 53 | @abstractmethod 54 | def _normalize_expected_dtype(cls: Type[T], dtype: Any) -> Any: 55 | pass 56 | 57 | @classmethod 58 | def _dtypes_mismatch(cls: Type[T], actual: Any, expected: Any) -> bool: 59 | actual = cls._normalize_actual_dtype(actual) 60 | expected = cls._normalize_expected_dtype(expected) 61 | return actual != expected 62 | 63 | def __init__(self, df): 64 | 65 | actual_dtypes = self._extract_actual_dtypes(df) 66 | expected = self.dtype(with_optional=False).items() 67 | 68 | diff = set() 69 | for col, dtype in expected: 70 | try: 71 | if col not in actual_dtypes or self._dtypes_mismatch(actual_dtypes[col], dtype): 72 | diff.add((col, dtype)) 73 | except TypeError: 74 | diff.add((col, dtype)) 75 | 76 | optional = self.dtype().items() 77 | for col, dtype in optional: 78 | try: 79 | if col in actual_dtypes and self._dtypes_mismatch(actual_dtypes[col], dtype): 80 | diff.add((col, dtype)) 81 | except TypeError: 82 | diff.add((col, dtype)) 83 | 84 | if diff: 85 | actual = {key: self._normalize_actual_dtype(value) for key, value in actual_dtypes.items()} 86 | expected = {key: self._normalize_expected_dtype(value) for key, value in self.dtype().items()} 87 | raise AssertionError( 88 | "Dataframe doesn't match schema\n" 89 | f"Actual: {actual}\nExpected: {expected}\nDifference: {diff}" 90 | ) 91 | -------------------------------------------------------------------------------- /typedframe/pandas_.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Type, TypeVar, Any 3 | import pytz 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from typedframe.base import TypedDataFrameBase 9 | 10 | try: 11 | from pandas.api.types import CategoricalDtype 12 | except ImportError: 13 | from pandas.types.dtypes import CategoricalDtype 14 | 15 | # 16 | # dtype for datetime column 17 | # 18 | DATE_TIME_DTYPE = 
np.dtype('datetime64[ns]') 19 | UTC_DATE_TIME_DTYPE = pd.DatetimeTZDtype('ns', pytz.UTC) 20 | 21 | T = TypeVar("T", bound="PandasTypedFrame") 22 | 23 | _OBJECT_TYPES = {list, str, dict} 24 | 25 | 26 | class PandasTypedFrame(TypedDataFrameBase): 27 | """ 28 | Wrapper class over pandas 29 | """ 30 | 31 | index_schema = (None, None) # (name, dtype) 32 | 33 | @classmethod 34 | def convert(cls: Type[T], df: pd.DataFrame, add_optional_cols: bool = True) -> T: 35 | """ 36 | Tries to convert a given dataframe and wrap in a typed dataframe. 37 | 38 | Examples 39 | -------- 40 | 41 | >>> from typedframe.pandas_ import PandasTypedFrame, DATE_TIME_DTYPE 42 | >>> class MyTable(PandasTypedFrame): 43 | ... schema = { 44 | ... "col1": str, 45 | ... "col2": np.int32, 46 | ... "col3": ('foo', 'bar') 47 | ... } 48 | ... optional = { 49 | ... "col4": bool, 50 | ... "col5": DATE_TIME_DTYPE 51 | ... } 52 | 53 | >>> df = pd.DataFrame({"col1": ['foo'], "col2": np.array([1], dtype=np.int32), "col3": ['bar']}) 54 | >>> df.col3 = pd.Categorical(df.col3, categories=('foo', 'bar'), ordered=True) 55 | >>> print(MyTable.convert(df).df) 56 | """ 57 | df = df.copy() 58 | 59 | if add_optional_cols: 60 | required = cls.dtype(with_optional=False) 61 | addon = {col: dtype for col, dtype in cls.dtype().items() if col not in df.columns and col not in required} 62 | df: pd.DataFrame = df if len(addon) == 0 else pd.concat( 63 | [df, pd.DataFrame(columns=addon.keys()).astype(addon)], axis=1) 64 | 65 | expected = cls.dtype() 66 | for col in df.columns: 67 | try: 68 | if col in expected: 69 | if isinstance(expected[col], tuple): 70 | actual_cats = set(df[col].unique()) 71 | categories_diff = actual_cats.difference(set(expected[col])) 72 | if categories_diff: 73 | raise AssertionError(f"For column: {col} there are unknown categories: {categories_diff}") 74 | df[col] = pd.Categorical(df[col], categories=expected[col], ordered=True) 75 | elif expected[col] == DATE_TIME_DTYPE: 76 | df[col] = pd.to_datetime(df[col]) 77 | elif expected[col] == UTC_DATE_TIME_DTYPE: 78 | df[col] = pd.to_datetime(df[col], utc=True) 79 | else: 80 | df[col] = df[col].astype(expected[col]) 81 | except Exception as e: 82 | raise AssertionError(f"Failed to convert column: {col}") from e 83 | 84 | if cls.index_schema[1]: 85 | df.index = df.index.astype(cls.index_schema[1]) 86 | df.index.name = cls.index_schema[0] 87 | 88 | return cls(df) 89 | 90 | @classmethod 91 | def _extract_actual_dtypes(cls: Type[T], df: pd.DataFrame) -> dict: 92 | return df.dtypes.to_dict() 93 | 94 | @classmethod 95 | def _normalize_actual_dtype(cls: Type[T], dtype: Any) -> Any: 96 | if isinstance(dtype, CategoricalDtype): 97 | return tuple(dtype.categories) 98 | else: 99 | return dtype 100 | 101 | @classmethod 102 | def _normalize_expected_dtype(cls: Type[T], dtype: Any) -> Any: 103 | try: 104 | if dtype in _OBJECT_TYPES: 105 | return object 106 | else: 107 | return dtype 108 | except TypeError: 109 | return dtype 110 | 111 | def __init__(self, df: pd.DataFrame): 112 | 113 | if not isinstance(df, pd.DataFrame): 114 | raise AssertionError(f"Input argument of type {type(df)} is not an instance of pandas DataFrame") 115 | 116 | super().__init__(df) 117 | 118 | if self.index_schema[1]: 119 | if df.index.name != self.index_schema[0]: 120 | raise AssertionError(f"expected index name {self.index_schema[0]}, actual index name {df.index.name}") 121 | try: 122 | if self._dtypes_mismatch(df.index.dtype, self.index_schema[1]): 123 | raise AssertionError(f"expected index dtype {self.index_schema[1]}, 
actual index dtype {df.index.dtype}") 124 | except TypeError: 125 | raise AssertionError(f"expected index dtype {self.index_schema[1]}, actual index dtype {df.index.dtype}") 126 | 127 | categoricals = [df[c] for c in df.columns if isinstance(df[c].dtype, CategoricalDtype)] 128 | for col in categoricals: 129 | if object != col.values.categories.dtype: 130 | raise AssertionError("Categoricals must have str categories") 131 | if np.nan in col.unique(): 132 | raise AssertionError("Categoricals must not have NaNs") 133 | 134 | self.df = df 135 | -------------------------------------------------------------------------------- /typedframe/polars_.py: -------------------------------------------------------------------------------- 1 | from typing import Type, TypeVar, Any 2 | 3 | import polars as pl 4 | 5 | from typedframe.base import TypedDataFrameBase 6 | 7 | T = TypeVar("T", bound="PolarsTypedFrame") 8 | 9 | 10 | class PolarsTypedFrame(TypedDataFrameBase): 11 | """Wrapper class over polars DataFrame to provide explicit schema specs.""" 12 | 13 | @classmethod 14 | def convert(cls: Type[T], df: pl.DataFrame, add_optional_cols: bool = True) -> T: 15 | 16 | addon = {} 17 | if add_optional_cols: 18 | required = cls.dtype(with_optional=False) 19 | addon = {col: dtype for col, dtype in cls.dtype().items() if col not in df.columns and col not in required} 20 | 21 | expected = cls.dtype() 22 | df = df.with_columns([ 23 | pl.col(col).cast(expected[col]) for col in df.columns if col in expected 24 | ] + [ 25 | pl.lit(None, dtype=dtype).alias(col) for col, dtype in addon.items() 26 | ]) 27 | return cls(df) 28 | 29 | @classmethod 30 | def _extract_actual_dtypes(cls, df: pl.DataFrame) -> dict: 31 | return dict(zip(df.columns, df.dtypes)) 32 | 33 | @classmethod 34 | def _normalize_actual_dtype(cls, dtype: Any) -> Any: 35 | return dtype 36 | 37 | @classmethod 38 | def _normalize_expected_dtype(cls, dtype: Any) -> Any: 39 | return dtype 40 | 41 | def __init__(self, df: pl.DataFrame): 42 | 43 | if not isinstance(df, pl.DataFrame): 44 | raise AssertionError(f"Input argument of type {type(df)} is not an instance of polars DataFrame") 45 | 46 | super().__init__(df) 47 | self.df = df 48 | --------------------------------------------------------------------------------