├── databackend ├── py.typed ├── tests │ ├── an_unimported_module.py │ ├── a_data_class.py │ └── test_databackend.py └── __init__.py ├── examples └── broom │ ├── requirements.txt │ ├── readme.qmd │ ├── tidy.py │ └── readme.md ├── .github └── workflows │ ├── code-checks.yml │ └── ci.yml ├── .pre-commit-config.yaml ├── LICENSE ├── pyproject.toml ├── .gitignore ├── README.md └── README.qmd /databackend/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /databackend/tests/an_unimported_module.py: -------------------------------------------------------------------------------- 1 | class UnimportedClass: 2 | pass 3 | -------------------------------------------------------------------------------- /examples/broom/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | pymc3 3 | scikit-learn 4 | statsmodels 5 | -------------------------------------------------------------------------------- /databackend/tests/a_data_class.py: -------------------------------------------------------------------------------- 1 | class ADataClass: 2 | pass 3 | 4 | 5 | class ADataClass2: 6 | pass 7 | -------------------------------------------------------------------------------- /.github/workflows/code-checks.yml: -------------------------------------------------------------------------------- 1 | name: Code Checks 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: ['main'] 7 | pull_request: 8 | release: 9 | types: [published] 10 | 11 | jobs: 12 | pre-commit: 13 | name: "Run pre-commit" 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - uses: actions/setup-python@v2 18 | - uses: pre-commit/action@v2.0.3 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: 
-------------------------------------------------------------------------------- 1 | exclude: "(.*\\.csv)|(.*\\.q?md)" 2 | repos: 3 | - repo: https://github.com/pycqa/flake8 4 | rev: 6.0.0 5 | hooks: 6 | - id: flake8 7 | types: 8 | - python 9 | additional_dependencies: 10 | - flake8-pyproject 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v4.5.0 13 | hooks: 14 | - id: trailing-whitespace 15 | - id: end-of-file-fixer 16 | - id: check-yaml 17 | args: ["--unsafe"] 18 | - id: check-added-large-files 19 | - repo: https://github.com/psf/black 20 | rev: 24.2.0 21 | hooks: 22 | - id: black 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 databackend contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "wheel", "setuptools_scm>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | 7 | [project] 8 | name = "databackend" 9 | description = "Abstract data dispatch" 10 | readme = "README.md" 11 | license.file = "LICENSE" 12 | authors = [ 13 | { name = "Michael Chow", email = "mc_al_github@fastmail.com" } 14 | ] 15 | dynamic = ["version"] 16 | keywords = ["data"] 17 | classifiers = [ 18 | "Programming Language :: Python :: 3.7", 19 | "Programming Language :: Python :: 3.8", 20 | "Programming Language :: Python :: 3.9", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12" 24 | ] 25 | requires-python = ">=3.7" 26 | 27 | [project.urls] 28 | homepage = "https://github.com/machow/databackend" 29 | 30 | [project.optional-dependencies] 31 | dev = [ 32 | "pip-tools", 33 | "pytest" 34 | ] 35 | 36 | [tool.pytest.ini_options] 37 | testpaths = ["databackend"] 38 | addopts = "--doctest-modules" 39 | doctest_optionflags = "NORMALIZE_WHITESPACE" 40 | 41 | [tool.flake8] 42 | max-line-length = 88 43 | 44 | ignore = [ 45 | "E702", # multiple statements on one line (semicolon) 46 | "E701", # multiple statements on one line (colon) 47 | "E704", # multiple statements on one line (def) 48 | ] 49 | -------------------------------------------------------------------------------- /examples/broom/readme.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | text_representation: 5 | extension: .qmd 6 | format_name: quarto 7 | format_version: '1.0' 8 | jupytext_version: 1.14.1 9 | kernelspec: 10 | display_name: Python 3 (ipykernel) 11 | language: python 12 | name: python3 13 | --- 
14 | 15 | ```{python} 16 | import numpy as np 17 | import pandas as pd 18 | 19 | from siuba.data import mtcars 20 | 21 | # imported from tidy.py in this folder 22 | from tidy import tidy 23 | ``` 24 | 25 | ## fit statsmodels ---- 26 | 27 | ```{python} 28 | import statsmodels.api as sm 29 | import statsmodels.formula.api as smf 30 | 31 | results = smf.ols('mpg ~ hp', data=mtcars).fit() 32 | 33 | tidy_sm = tidy(results) 34 | 35 | tidy_sm 36 | ``` 37 | 38 | ## fit scikit ---- 39 | 40 | ```{python} 41 | from sklearn.linear_model import LinearRegression 42 | 43 | X = mtcars[['hp']] 44 | y = mtcars['mpg'] 45 | 46 | # y = 1 * x_0 + 2 * x_1 + 3 47 | reg = LinearRegression().fit(X, y) 48 | 49 | tidy_sk = tidy(reg) 50 | 51 | tidy_sk 52 | ``` 53 | 54 | ## fit pymc3 ---- 55 | 56 | ```{python} 57 | from pymc3 import Model, HalfCauchy, Normal, sample 58 | 59 | x = mtcars['hp'].values 60 | y = mtcars['mpg'].values 61 | 62 | data = dict(x=x, y=y) 63 | 64 | np.random.seed(999999) 65 | with Model() as model: # model specifications in PyMC3 are wrapped in a with-statement 66 | # Define priors 67 | sigma = HalfCauchy('sigma', beta=10, testval=1.) 68 | intercept = Normal('intercept', 0, sigma=20) 69 | x_coeff = Normal('hp', 0, sigma=20) 70 | 71 | # Define likelihood 72 | likelihood = Normal('mpg', mu=intercept + x_coeff * x, 73 | sigma=sigma, observed= y) 74 | 75 | # Inference! 
76 | trace = sample(500, cores=2, progressbar = False) # draw 3000 posterior samples using NUTS sampling 77 | 78 | tidy_pymc3 = tidy(trace) 79 | 80 | tidy_pymc3 81 | ``` 82 | -------------------------------------------------------------------------------- /databackend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 databackend contributors (MIT License) 2 | # 3 | # See https://github.com/machow/databackend 4 | 5 | from __future__ import annotations 6 | 7 | import sys 8 | import importlib 9 | 10 | from typing import Type, List, Tuple 11 | 12 | from abc import ABCMeta 13 | 14 | 15 | def _load_class(mod_name: str, cls_name: str) -> Type[object]: 16 | mod = importlib.import_module(mod_name) 17 | return getattr(mod, cls_name) 18 | 19 | 20 | class _AbstractBackendMeta(ABCMeta): 21 | def register_backend(cls, mod_name: str, cls_name: str): 22 | """Register a backend class to use in issubclass checks. 23 | 24 | This method is similar to the ABCMeta.register method, except that 25 | it accepts strings, so that an import of the class is not required. 26 | 27 | Note that the arguments to this class match the two pieces in import statements. 28 | E.g. `from a.b.c import d` would become `mod_name="a.b.c"` and `cls_name="d"`. 29 | 30 | Parameters 31 | ---------- 32 | mod_name: str 33 | A module path the class is imported from. 34 | cls_name: str 35 | The name of the class in the imported module. 
36 | """ 37 | cls._backends.append((mod_name, cls_name)) 38 | cls._abc_caches_clear() 39 | 40 | 41 | class AbstractBackend(metaclass=_AbstractBackendMeta): 42 | """Represent a class, without needing to import it.""" 43 | 44 | _backends: List[Tuple[str, str]] 45 | 46 | @classmethod 47 | def __init_subclass__(cls): 48 | if not hasattr(cls, "_backends"): 49 | cls._backends = [] 50 | 51 | @classmethod 52 | def __subclasshook__(cls, subclass: Type[object]): 53 | for mod_name, cls_name in cls._backends: 54 | if mod_name not in sys.modules: 55 | # module isn't loaded, so it can't be the subclass 56 | # we don't want to import the module to explicitly run the check 57 | # so skip here. 58 | continue 59 | else: 60 | parent_candidate = _load_class(mod_name, cls_name) 61 | if issubclass(subclass, parent_candidate): 62 | return True 63 | 64 | return NotImplemented 65 | -------------------------------------------------------------------------------- /examples/broom/tidy.py: -------------------------------------------------------------------------------- 1 | from databackend import AbstractBackend 2 | from functools import singledispatch 3 | 4 | # Abstract backend classes ==================================================== 5 | 6 | 7 | class BaseSklearnModel(AbstractBackend): 8 | _backends = [("sklearn.linear_model", "LinearRegression")] 9 | 10 | 11 | class BaseSmRegressionResult(AbstractBackend): 12 | _backends = [("statsmodels.regression.linear_model", "RegressionResultsWrapper")] 13 | 14 | 15 | class BasePymcMultiTrace(AbstractBackend): 16 | _backends = [("pymc3.backends.base", "MultiTrace")] 17 | 18 | 19 | # Implement generic function: tidy ============================================ 20 | 21 | 22 | @singledispatch 23 | def tidy(fit, *args, **kwargs): 24 | raise NotImplementedError(f"No tidy method for class {fit.__class__}") 25 | 26 | 27 | # sklearn ---- 28 | 29 | 30 | @tidy.register 31 | def _tidy_sklearn(fit: BaseSklearnModel, col_names=None): 32 | from pandas import 
DataFrame, NA 33 | 34 | estimates = [fit.intercept_, *fit.coef_] 35 | 36 | if col_names is None: 37 | terms = list(range(len(estimates))) 38 | else: 39 | terms = ["intercept", *col_names] 40 | 41 | # pd.DataFrame() 42 | return DataFrame({"term": terms, "estimate": estimates, "std_error": NA}) 43 | 44 | 45 | # statsmodels ---- 46 | 47 | 48 | @tidy.register 49 | def _tidy_statsmodels(fit: BaseSmRegressionResult): 50 | from statsmodels.iolib.summary import summary_params_frame 51 | 52 | tidied = summary_params_frame(fit).reset_index() 53 | rename_cols = { 54 | "index": "term", 55 | "coef": "estimate", 56 | "std err": "std_err", 57 | "t": "statistic", 58 | "P>|t|": "p_value", 59 | "Conf. Int. Low": "conf_int_low", 60 | "Conf. Int. Upp.": "conf_int_high", 61 | } 62 | 63 | return tidied.rename(columns=rename_cols) 64 | 65 | 66 | # pymc3 ---- 67 | 68 | 69 | @tidy.register 70 | def _tidy_trace(fit: BasePymcMultiTrace, robust=False): 71 | from pymc3 import trace_to_dataframe 72 | 73 | trace_df = trace_to_dataframe(fit) 74 | 75 | agg_funcs = ["median", "mad"] if robust else ["mean", "std"] 76 | 77 | # data frame with columns like: median, mad. 
78 | tidied = trace_df.agg(agg_funcs).T.reset_index() 79 | tidied.columns = ["term", "estimate", "std_err"] 80 | 81 | return tidied 82 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | run-if: 12 | name: "Run If" 13 | runs-on: ubuntu-latest 14 | if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false 15 | steps: 16 | - run: | 17 | echo "Running CI" 18 | test-python: 19 | name: "Test Python Version" 20 | needs: ["run-if"] 21 | runs-on: ubuntu-latest 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | # Checks based on python versions --- 26 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 27 | 28 | steps: 29 | - uses: actions/checkout@v2 30 | - uses: actions/setup-python@v2 31 | with: 32 | python-version: "${{ matrix.python-version }}" 33 | - name: Install dependencies from requirements file 34 | run: | 35 | python -m pip install --upgrade pip 36 | python -m pip install .[dev] 37 | env: 38 | REQUIREMENTS: ${{ matrix.requirements }} 39 | - name: Run tests 40 | run: | 41 | pytest 42 | 43 | release-pypi: 44 | name: "Release to pypi" 45 | runs-on: ubuntu-latest 46 | if: github.event_name == 'release' 47 | needs: [test-python] 48 | steps: 49 | - uses: actions/checkout@v2 50 | - uses: actions/setup-python@v2 51 | with: 52 | python-version: "3.10" 53 | - name: "Build Package" 54 | run: | 55 | python -m pip install build wheel 56 | python -m build --sdist --wheel 57 | - name: "Deploy to PyPI" 58 | uses: pypa/gh-action-pypi-publish@release/v1 59 | with: 60 | user: __token__ 61 | password: ${{ secrets.PYPI_API_TOKEN }} 62 | # build-docs: 63 | # name: "Build Docs" 64 | # needs: ["run-if"] 65 | # runs-on: ubuntu-latest 66 | # steps: 67 | # - uses: 
actions/checkout@v2 68 | # - uses: actions/setup-python@v2 69 | # with: 70 | # python-version: "3.10" 71 | # - name: Install dependencies from requirements file 72 | # run: | 73 | # python -m pip install --upgrade pip 74 | # python -m pip install -r requirements/dev.txt 75 | # - name: Build docs 76 | # run: | 77 | # make docs-build 78 | -------------------------------------------------------------------------------- /databackend/tests/test_databackend.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import sys 3 | import importlib 4 | 5 | from databackend import AbstractBackend 6 | from databackend.tests.a_data_class import ADataClass, ADataClass2 7 | 8 | CLASS_MOD = "databackend.tests.a_data_class" 9 | CLASS_NAME = "ADataClass" 10 | 11 | 12 | @pytest.fixture 13 | def Base(): 14 | class Base(AbstractBackend): 15 | pass 16 | 17 | Base.register_backend(CLASS_MOD, CLASS_NAME) 18 | 19 | return Base 20 | 21 | 22 | def test_check_unimported_mod(Base): 23 | class ABase(AbstractBackend): 24 | pass 25 | 26 | mod_name = "databackend.tests.an_unimported_module" 27 | ABase.register_backend(mod_name, "UnimportedClass") 28 | 29 | # check pre-import and verify it's still not imported ---- 30 | assert not issubclass(int, ABase) 31 | assert mod_name not in sys.modules 32 | 33 | # do import and verify ABC is seen as parent class ---- 34 | mod = importlib.import_module(mod_name) 35 | 36 | assert issubclass(mod.UnimportedClass, ABase) 37 | 38 | 39 | def test_issubclass(Base): 40 | assert issubclass(ADataClass, Base) 41 | 42 | 43 | def test_isinstance(Base): 44 | assert isinstance(ADataClass(), Base) 45 | 46 | 47 | def test_check_is_cached(): 48 | checks = [0] 49 | 50 | class ABase(AbstractBackend): 51 | @classmethod 52 | def __subclasshook__(cls, subclass): 53 | # increment the number in checks, as a dumb way 54 | # of seeing how often this runs 55 | # could also use abc.ABCMeta._dump_registry 56 | checks[0] = checks[0] + 1 57 
| return super().__subclasshook__(subclass) 58 | 59 | # this check runs subclasshook ---- 60 | issubclass(ADataClass, ABase) 61 | assert checks[0] == 1 62 | 63 | # now that ADataClass is in the abc.ABCMeta cache, it 64 | # does *not* run subclasshook 65 | issubclass(ADataClass, ABase) 66 | assert checks[0] == 1 67 | 68 | 69 | def test_backends_spec_at_class_declaration(): 70 | class ABase(AbstractBackend): 71 | _backends = [(CLASS_MOD, CLASS_NAME)] 72 | 73 | assert issubclass(ADataClass, ABase) 74 | 75 | 76 | def test_backends_do_not_overlap(): 77 | class ABase1(AbstractBackend): ... 78 | 79 | class ABase2(AbstractBackend): ... 80 | 81 | ABase1.register_backend(CLASS_MOD, "ADataClass") 82 | ABase2.register_backend(CLASS_MOD, "ADataClass2") 83 | 84 | obj1 = ADataClass() 85 | obj2 = ADataClass2() 86 | 87 | assert isinstance(obj1, ABase1) 88 | assert not isinstance(obj1, ABase2) 89 | 90 | assert not isinstance(obj2, ABase1) 91 | assert isinstance(obj2, ABase2) 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Vim 2 | .*.sw[po] 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # databackend 3 | 4 | The `databackend` package allows you to register a subclass, without 5 | needing to import the subclass itself. This is useful for implementing 6 | actions over optional dependencies. 7 | 8 | ## Example 9 | 10 | For this example, we’ll implement a function, `fill_na()`, that fills in 11 | missing values in a DataFrame. 
It works with DataFrame objects from two 12 | popular libraries: `pandas` and `polars`. Importantly, neither library 13 | needs to be installed. 14 | 15 | ### Setup 16 | 17 | The code below defines “abstract” parent classes for each of the 18 | DataFrame classes in the two libraries. 19 | 20 | ``` python 21 | from databackend import AbstractBackend 22 | 23 | class AbstractPandasFrame(AbstractBackend): 24 | _backends = [("pandas", "DataFrame")] 25 | 26 | 27 | class AbstractPolarsFrame(AbstractBackend): 28 | _backends = [("polars", "DataFrame")] 29 | ``` 30 | 31 | Note that the abstract classes can be used as stand-ins for the real 32 | thing in `issubclass()` and `isinstance`. 33 | 34 | ``` python 35 | from pandas import DataFrame 36 | 37 | issubclass(DataFrame, AbstractPandasFrame) 38 | isinstance(DataFrame(), AbstractPandasFrame) 39 | ``` 40 | 41 | True 42 | 43 | > 📝 Note that you can use 44 | > `AbstractPandasFrame.register_backend("pandas", "DataFrame")`, as an 45 | > alternative way to register backends. 46 | 47 | ### Simple fill_na: isinstance to switch behavior 48 | 49 | The `fill_na()` function below uses custom handling for pandas and 50 | polars. 51 | 52 | ``` python 53 | def fill_na(data, x): 54 | if isinstance(data, AbstractPolarsFrame): 55 | return data.fill_nan(x) 56 | elif isinstance(data, AbstractPandasFrame): 57 | return data.fillna(x) 58 | else: 59 | raise NotImplementedError() 60 | ``` 61 | 62 | Notice that neither `pandas` nor `polars` need to be imported when 63 | defining `fill_na()`. 64 | 65 | Here is an example of calling `fill_na()` on both kinds of DataFrames. 
66 | 67 | ``` python 68 | # test polars ---- 69 | 70 | import polars as pl 71 | 72 | df = pl.DataFrame({"x": [1, 2, None]}) 73 | fill_na(df, 3) 74 | 75 | 76 | # test pandas ---- 77 | 78 | import pandas as pd 79 | 80 | df = pd.DataFrame({"x": [1, 2, None]}) 81 | fill_na(df, 3) 82 | ``` 83 | 84 | x 85 | 0 1.0 86 | 1 2.0 87 | 2 3.0 88 | 89 | The key here is that a user could have only pandas, or only polars, 90 | installed. Importantly, doing the isinstance checks do not import any 91 | libraries! 92 | 93 | ### Advanced fill_na: generic function dispatch 94 | 95 | `databackend` shines when combined with [generic function 96 | dispatch](https://mchow.com/posts/2020-02-24-single-dispatch-data-science/). 97 | This is a programming approach where you declare a function 98 | (e.g. `fill_na()`), and then register each backend specific 99 | implementation on the function. 100 | 101 | Python has a built-in function implementing this called 102 | [`functools.singledispatch`](https://docs.python.org/3/library/functools.html#functools.singledispatch). 103 | 104 | Here is an example of the previous `fill_na()` function written using 105 | it. 106 | 107 | ``` python 108 | from functools import singledispatch 109 | 110 | @singledispatch 111 | def fill_na2(data, x): 112 | raise NotImplementedError(f"No support for class: {type(data)}") 113 | 114 | 115 | # handle polars ---- 116 | 117 | @fill_na2.register 118 | def _(data: AbstractPolarsFrame, x): 119 | return data.fill_nan(x) 120 | 121 | 122 | # handle pandas ---- 123 | 124 | @fill_na2.register 125 | def _(data: AbstractPandasFrame, x): 126 | return data.fillna(x) 127 | ``` 128 | 129 | Note two important decorators: 130 | 131 | - `@singledispatch` defines a default function. This gets called if no 132 | specific implementations are found. 133 | - `@fill_na2.register` defines specific versions of the function. 134 | 135 | Here’s an example of it in action. 
136 | 137 | ``` python 138 | # example ---- 139 | 140 | import pandas as pd 141 | import polars as pl 142 | 143 | df = pl.DataFrame({"x": [1, 2, None]}) 144 | fill_na2(df, 3) 145 | 146 | df = pd.DataFrame({"x": [1, 2, None]}) 147 | fill_na2(df, 3) 148 | ``` 149 | 150 | x 151 | 0 1.0 152 | 1 2.0 153 | 2 3.0 154 | 155 | ### How it works 156 | 157 | Under the hood, `AbstractBackend` behaves similarly to python’s builtin 158 | [`abc.ABC` class](https://docs.python.org/3/library/abc.html#abc.ABC). 159 | 160 | ``` python 161 | from abc import ABC 162 | 163 | class MyABC(ABC): 164 | pass 165 | 166 | from io import StringIO 167 | 168 | MyABC.register(StringIO) 169 | 170 | 171 | # StringIO is a "virtual subclass" of MyABC 172 | isinstance(StringIO("abc"), MyABC) 173 | ``` 174 | 175 | True 176 | 177 | The key difference is that you can specify the virtual subclass using 178 | the tuple `("", "")`. 179 | 180 | When `issubclass(SomeClass, AbstractBackend)` runs, then… 181 | 182 | - The standard ABC caching mechanism is checked, and potentially 183 | returns the answer immediately. 184 | - Otherwise, a subclass hook cycles through registered backends. 185 | - The hook runs the subclass check for any backends that are imported 186 | (e.g. are in `sys.modules`). 187 | 188 | Technically, `AbstractBackend` inherits all the useful metaclass things 189 | from `abc.ABCMeta`, so these can be used also. 
190 | -------------------------------------------------------------------------------- /README.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: python3 3 | format: gfm 4 | --- 5 | 6 | # databackend 7 | 8 | ```{python} 9 | #| include: false 10 | # this keeps the pandas dataframe repr from spitting out scoped style tags 11 | # which don't render on github 12 | import pandas as pd 13 | pd.set_option("display.notebook_repr_html", False) 14 | 15 | ``` 16 | 17 | The `databackend` package allows you to register a subclass, without needing to import the subclass itself. 18 | This is useful for implementing actions over optional dependencies. 19 | 20 | 21 | ## Example 22 | 23 | For this example, we'll implement a function, `fill_na()`, that fills in missing values 24 | in a DataFrame. 25 | It works with DataFrame objects from two popular libraries: `pandas` and `polars`. 26 | Importantly, neither library needs to be installed. 27 | 28 | ### Setup 29 | 30 | The code below defines "abstract" parent classes for each of the DataFrame classes in the two libraries. 31 | 32 | ```{python} 33 | from databackend import AbstractBackend 34 | 35 | class AbstractPandasFrame(AbstractBackend): 36 | _backends = [("pandas", "DataFrame")] 37 | 38 | 39 | class AbstractPolarsFrame(AbstractBackend): 40 | _backends = [("polars", "DataFrame")] 41 | 42 | 43 | ``` 44 | 45 | Note that the abstract classes can be used as stand-ins for the real thing 46 | in `issubclass()` and `isinstance`. 47 | 48 | ```{python} 49 | from pandas import DataFrame 50 | 51 | issubclass(DataFrame, AbstractPandasFrame) 52 | isinstance(DataFrame(), AbstractPandasFrame) 53 | ``` 54 | 55 | > 📝 Note that you can use `AbstractPandasFrame.register_backend("pandas", "DataFrame")`, as an alternative way to register backends. 56 | 57 | ### Simple fill_na: isinstance to switch behavior 58 | 59 | The `fill_na()` function below uses custom handling for pandas and polars. 
60 | 61 | ```{python} 62 | def fill_na(data, x): 63 | if isinstance(data, AbstractPolarsFrame): 64 | return data.fill_nan(x) 65 | elif isinstance(data, AbstractPandasFrame): 66 | return data.fillna(x) 67 | else: 68 | raise NotImplementedError() 69 | 70 | ``` 71 | 72 | Notice that neither `pandas` nor `polars` need to be imported when defining `fill_na()`. 73 | 74 | Here is an example of calling `fill_na()` on both kinds of DataFrames. 75 | 76 | ```{python} 77 | # test polars ---- 78 | 79 | import polars as pl 80 | 81 | df = pl.DataFrame({"x": [1, 2, None]}) 82 | fill_na(df, 3) 83 | 84 | 85 | # test pandas ---- 86 | 87 | import pandas as pd 88 | 89 | df = pd.DataFrame({"x": [1, 2, None]}) 90 | fill_na(df, 3) 91 | ``` 92 | 93 | The key here is that a user could have only pandas, or only polars, installed. 94 | Importantly, doing the isinstance checks do not import any libraries! 95 | 96 | 97 | ### Advanced fill_na: generic function dispatch 98 | 99 | `databackend` shines when combined with [generic function dispatch](https://mchow.com/posts/2020-02-24-single-dispatch-data-science/). 100 | This is a programming approach where you declare a function (e.g. `fill_na()`), 101 | and then register each backend specific implementation on the function. 102 | 103 | Python has a built-in function implementing this called [`functools.singledispatch`](https://docs.python.org/3/library/functools.html#functools.singledispatch). 104 | 105 | Here is an example of the previous `fill_na()` function written using it. 
106 | 107 | ```{python} 108 | from functools import singledispatch 109 | 110 | @singledispatch 111 | def fill_na2(data, x): 112 | raise NotImplementedError(f"No support for class: {type(data)}") 113 | 114 | 115 | # handle polars ---- 116 | 117 | @fill_na2.register 118 | def _(data: AbstractPolarsFrame, x): 119 | return data.fill_nan(x) 120 | 121 | 122 | # handle pandas ---- 123 | 124 | @fill_na2.register 125 | def _(data: AbstractPandasFrame, x): 126 | return data.fillna(x) 127 | 128 | ``` 129 | 130 | Note two important decorators: 131 | 132 | * `@singledispatch` defines a default function. This gets called if no specific implementations are found. 133 | * `@fill_na2.register` defines specific versions of the function. 134 | 135 | Here's an example of it in action. 136 | 137 | ```{python} 138 | # example ---- 139 | 140 | import pandas as pd 141 | import polars as pl 142 | 143 | df = pl.DataFrame({"x": [1, 2, None]}) 144 | fill_na2(df, 3) 145 | 146 | df = pd.DataFrame({"x": [1, 2, None]}) 147 | fill_na2(df, 3) 148 | ``` 149 | 150 | ### How it works 151 | 152 | Under the hood, `AbstractBackend` behaves similarly to python's builtin [`abc.ABC` class](https://docs.python.org/3/library/abc.html#abc.ABC). 153 | 154 | ```{python} 155 | from abc import ABC 156 | 157 | class MyABC(ABC): 158 | pass 159 | 160 | from io import StringIO 161 | 162 | MyABC.register(StringIO) 163 | 164 | 165 | # StringIO is a "virtual subclass" of MyABC 166 | isinstance(StringIO("abc"), MyABC) 167 | ``` 168 | 169 | The key difference is that you can specify the virtual subclass using the tuple `("", "")`. 170 | 171 | When `issubclass(SomeClass, AbstractBackend)` runs, then... 172 | 173 | * The standard ABC caching mechanism is checked, and potentially returns the answer immediately. 174 | * Otherwise, a subclass hook cycles through registered backends. 175 | * The hook runs the subclass check for any backends that are imported (e.g. are in `sys.modules`). 
176 | 177 | Technically, `AbstractBackend` inherits all the useful metaclass things from `abc.ABCMeta`, so these can be used also. 178 | -------------------------------------------------------------------------------- /examples/broom/readme.md: -------------------------------------------------------------------------------- 1 | ```python 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from siuba.data import mtcars 6 | 7 | # imported from tidy.py in this folder 8 | from tidy import tidy 9 | ``` 10 | 11 | ## fit statsmodels ---- 12 | 13 | 14 | ```python 15 | import statsmodels.api as sm 16 | import statsmodels.formula.api as smf 17 | 18 | results = smf.ols('mpg ~ hp', data=mtcars).fit() 19 | 20 | tidy_sm = tidy(results) 21 | 22 | tidy_sm 23 | ``` 24 | 25 | 26 | 27 | 28 |
29 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 |
termestimatestd_errstatisticp_valueconf_int_lowconf_int_high
0Intercept30.0988611.63392118.4212466.642736e-1826.76194933.435772
1hp-0.0682280.010119-6.7423891.787835e-07-0.088895-0.047562
78 |
79 | 80 | 81 | 82 | ## fit scikit ---- 83 | 84 | 85 | ```python 86 | from sklearn.linear_model import LinearRegression 87 | 88 | X = mtcars[['hp']] 89 | y = mtcars['mpg'] 90 | 91 | # fit a linear regression of mpg on hp 92 | reg = LinearRegression().fit(X, y) 93 | 94 | tidy_sk = tidy(reg) 95 | 96 | tidy_sk 97 | ``` 98 | 99 | 100 | 101 | 102 | <div>
103 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 |
termestimatestd_error
0030.098861<NA>
11-0.068228<NA>
140 |
141 | 142 | 143 | 144 | ## fit pymc3 ---- 145 | 146 | 147 | ```python 148 | from pymc3 import Model, HalfCauchy, Normal, sample 149 | 150 | x = mtcars['hp'].values 151 | y = mtcars['mpg'].values 152 | 153 | data = dict(x=x, y=y) 154 | 155 | np.random.seed(999999) 156 | with Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement 157 | # Define priors 158 | sigma = HalfCauchy('sigma', beta=10, testval=1.) 159 | intercept = Normal('intercept', 0, sigma=20) 160 | x_coeff = Normal('hp', 0, sigma=20) 161 | 162 | # Define likelihood 163 | likelihood = Normal('mpg', mu=intercept + x_coeff * x, 164 | sigma=sigma, observed= y) 165 | 166 | # Inference! 167 | trace = sample(500, cores=2, progressbar = False) # draw 500 posterior samples using NUTS sampling 168 | 169 | tidy_pymc3 = tidy(trace) 170 | 171 | tidy_pymc3 172 | ``` 173 | 174 | /Users/machow/.virtualenvs/databackend/lib/python3.8/site-packages/deprecat/classic.py:215: FutureWarning: In v4.0, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning. 175 | return wrapped_(*args_, **kwargs_) 176 | Auto-assigning NUTS sampler... 177 | Initializing NUTS using jitter+adapt_diag... 178 | Multiprocess sampling (2 chains in 2 jobs) 179 | NUTS: [hp, intercept, sigma] 180 | /Users/machow/.virtualenvs/databackend/lib/python3.8/site-packages/scipy/stats/_continuous_distns.py:624: RuntimeWarning: overflow encountered in _beta_ppf 181 | return _boost._beta_ppf(q, a, b) 182 | /Users/machow/.virtualenvs/databackend/lib/python3.8/site-packages/scipy/stats/_continuous_distns.py:624: RuntimeWarning: overflow encountered in _beta_ppf 183 | return _boost._beta_ppf(q, a, b) 184 | Sampling 2 chains for 1_000 tune and 500 draw iterations (2_000 + 1_000 draws total) took 3 seconds. 185 | The acceptance probability does not match the target.
It is 0.8794624791437439, but should be close to 0.8. Try to increase the number of tuning steps. 186 | 187 | 188 | 189 | 190 | 191 |
192 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 |
termestimatestd_err
0intercept30.0517211.705625
1hp-0.0680230.010389
2sigma4.0305650.557375
235 |
236 | 237 | 238 | --------------------------------------------------------------------------------