├── binder
│   ├── runtime.txt
│   ├── postBuild
│   └── requirements.txt
├── dbcooper
│   ├── data
│   │   ├── __init__.py
│   │   └── lahman.py
│   ├── __init__.py
│   ├── base.py
│   ├── tests
│   │   ├── conftest.py
│   │   ├── test_example_schemas.py
│   │   └── helpers.py
│   ├── utils.py
│   ├── collect.py
│   ├── tables.py
│   ├── dbcooper.py
│   ├── finder.py
│   └── inspect.py
├── docs
│   ├── .gitignore
│   ├── api
│   │   ├── index.rst
│   │   └── example_class.rst
│   ├── index.rst
│   └── conf.py
├── MANIFEST.in
├── .env.dev
├── .pre-commit-config.yaml
├── docker-compose.yml
├── Makefile
├── LICENSE
├── examples
│   ├── backends.Rmd
│   └── lahman.Rmd
├── pyproject.toml
├── requirements
│   ├── 2022-01-01.txt
│   └── dev.txt
├── .gitignore
├── .github
│   └── workflows
│       └── ci.yml
├── README.Rmd
└── README.md

--------------------------------------------------------------------------------
/binder/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.9
2 | 
--------------------------------------------------------------------------------
/binder/postBuild:
--------------------------------------------------------------------------------
1 | set -e
2 | 
3 | pip install -e .
4 | 
--------------------------------------------------------------------------------
/dbcooper/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .lahman import lahman_sqlite
2 | 
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | # autosummary generated doc pages
2 | api/api_card
3 | 
--------------------------------------------------------------------------------
/docs/api/index.rst:
--------------------------------------------------------------------------------
1 | API
2 | =============
3 | 
4 | .. toctree::
5 |    :maxdepth: 2
6 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | exclude .*
2 | prune .*
3 | prune requirements
4 | prune binder
5 | prune docs
6 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to dbcooper's documentation!
2 | ====================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 2
6 |    :caption: Contents:
7 | 
8 |    api/index
9 | 
10 | 
11 | 
12 | Indices and tables
13 | ==================
14 | 
15 | * :ref:`genindex`
16 | * :ref:`modindex`
17 | * :ref:`search`
18 | 
--------------------------------------------------------------------------------
/docs/api/example_class.rst:
--------------------------------------------------------------------------------
1 | Example Class
2 | =============
3 | 
4 | .. currentmodule:: dbcooper
5 | 
6 | Constructor
7 | -----------
8 | 
9 | .. autosummary::
10 |    :toctree: api_card
11 | 
12 |    ExampleClass
13 | 
14 | Methods
15 | -------
16 | 
17 | .. autosummary::
18 |    :toctree: api_card
19 | 
20 |    ExampleClass.show
21 | 
--------------------------------------------------------------------------------
/.env.dev:
--------------------------------------------------------------------------------
1 | SB_TEST_PGDATABASE=postgres
2 | SB_TEST_PGPORT=5432
3 | SB_TEST_PGUSER=postgres
4 | SB_TEST_PGPASSWORD=""
5 | SB_TEST_PGHOST=localhost
6 | 
7 | SB_TEST_BQ_DATABASE=ci
8 | SB_TEST_BQPROJECT=siuba-tests
9 | 
10 | SB_TEST_SNOWFLAKEDATABASE=DATASETS
11 | SB_TEST_SNOWFLAKEUSER="FILL_ME_IN"
12 | SB_TEST_SNOWFLAKEPASSWORD="FILL_ME_IN"
13 | SB_TEST_SNOWFLAKEHOST="FILL_ME_IN"
14 | SB_TEST_SNOWFLAKEOPTIONS="warehouse=COMPUTE_WH&role=CI_USER"
15 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: ".*\\.csv"
2 | repos:
3 |   - repo: https://github.com/pre-commit/pre-commit-hooks
4 |     rev: v2.4.0
5 |     hooks:
6 |       - id: flake8
7 |         # line too long and line before binary operator (black is ok with these)
8 |         types:
9 |           - python
10 |       - id: trailing-whitespace
11 |       - id: end-of-file-fixer
12 |       - id: check-yaml
13 |         args: ["--unsafe"]
14 |       - id: check-added-large-files
15 |   - repo: https://github.com/psf/black
16 |     rev: 19.10b0
17 |     hooks:
18 |       - id: black
19 | 
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.1'
2 | 
3 | services:
4 | 
5 |   db_mysql:
6 |     image: mysql
7 |     restart: always
8 |     environment:
9 |       MYSQL_ROOT_PASSWORD: ""
10 |       MYSQL_ALLOW_EMPTY_PASSWORD: 1
11 |       MYSQL_DATABASE: "public"
12 |     ports:
13 |       - 3307:3306
14 |     # by default, mysql rounds to 4 decimals, but tests require more precision
15 |     command: --div-precision-increment=30
16 | 
17 |   db:
18 |     image: postgres
19 |     restart: always
20 |     environment:
21 |       POSTGRES_PASSWORD: ""
22 |       POSTGRES_HOST_AUTH_METHOD: "trust"
23 |     ports:
24 |       - 5433:5432
25 | 
--------------------------------------------------------------------------------
/dbcooper/__init__.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import version as _v
2 | 
3 | # Set version -----------------------------------------------------------------
4 | 
5 | __version__ = _v("dbcooper")
6 | 
7 | del _v
8 | 
9 | # Main imports ----------------------------------------------------------------
10 | 
11 | from .dbcooper import DbCooper # noqa
12 | from .finder import TableFinder, AccessorBuilder, AccessorHierarchyBuilder
13 | from .tables import DbcDocumentedTable, DbcSimpleTable
14 | 
15 | __all__ = (
16 |     "DbCooper",
17 |     "TableFinder",
18 |     "AccessorBuilder",
19 |     "AccessorHierarchyBuilder",
20 |     "DbcDocumentedTable",
21 |     "DbcSimpleTable",
22 | )
--------------------------------------------------------------------------------
/dbcooper/base.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, astuple
2 | 
3 | @dataclass(frozen=True)
4 | class TableName:
5 |     database: "str | None"
6 |     schema: "str | None"
7 |     table: "str"
8 | 
9 |     def to_tuple(self, exists=False):
10 |         tup = astuple(self)
11 | 
12 |         if exists:
13 |             return tuple(x for x in tup if x is not None)
14 | 
15 |         return tup
16 | 
17 |     def field_index_from_end(self, part):
18 |         # could derive from dataclasses.fields, but probably not worth it.
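        # (indexing from the end keeps the schema/table positions stable even
        # when a leading part, like the database, is dropped from the tuple,
        # as in to_tuple(exists=True))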
19 |         if part == "database":
20 |             return -3
21 |         elif part == "schema":
22 |             return -2
23 |         elif part == "table":
24 |             return -1
25 | 
26 |     def apply_maybe(self, f):
27 |         tup = self.to_tuple()
28 |         return self.__class__(*[f(x) if x is not None else x for x in tup])
29 | 
30 | @dataclass(frozen=True)
31 | class TableIdentity:
32 |     schema: "str | quoted_name | None"
33 |     table: "str | quoted_name"
34 | 
35 | 
--------------------------------------------------------------------------------
/dbcooper/data/lahman.py:
--------------------------------------------------------------------------------
1 | descriptions = {
2 |     "Salaries": {
3 |         "schema": "Player salaries, going back to 1985.",
4 |         "columns": {
5 |             "yearID": "Year.",
6 |             "teamID": "Team ID.",
7 |             "lgID": "League ID.",
8 |             "playerID": 'Player ID. See e.g. the "People" table for player info.',
9 |             "Salary": "Salary (in dollars).",
10 |         }
11 |     }
12 | }
13 | 
14 | def lahman_sqlite(engine=None, schema="lahman"):
15 |     from sqlalchemy import create_engine
16 |     if engine is None:
17 |         raise NotImplementedError()
18 |         #engine = create_engine("sqlite://")
19 | 
20 |     engine.execute("ATTACH ':memory:' AS %s" % schema)
21 |     load_tables_for_engine(engine, schema=schema)
22 |     #return engine
23 | 
24 | def load_tables_for_engine(engine, exclude=[], **kwargs):
25 |     import lahman
26 |     for name in lahman._accessors:
27 |         if name in exclude: continue
28 |         df = getattr(lahman, name)()
29 |         df.to_sql(name, engine, **kwargs)
30 | 
31 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SPHINX_BUILDARGS=
2 | 
3 | .PHONY: requirements test
4 | 
5 | dev-start:
6 | 	docker-compose up -d
7 | 
8 | dev-stop:
9 | 	docker-compose down
10 | 
11 | test:
12 | 	pytest
13 | 
14 | requirements/dev.txt: setup.cfg
15 | 	@# allows you to do this...
16 | 	@# make requirements | tee > requirements/some_file.txt
17 | 	@pip-compile setup.cfg --rebuild --extra dev --output-file=- > $@
18 | 
19 | binder/requirements.txt: setup.cfg
20 | 	@pip-compile setup.cfg --rebuild --extra binder --output-file=- > $@
21 | 
22 | docs-build:
23 | 	cd docs && sphinx-build . ./_build/html $(SPHINX_BUILDARGS)
24 | 
25 | docs-watch:
26 | 	cd docs && sphinx-autobuild . ./_build/html $(SPHINX_BUILDARGS)
27 | 
28 | README.md: README.Rmd
29 | 	jupytext --from Rmd --to ipynb --output - $^ \
30 | 	| jupyter nbconvert \
31 | 	--stdin --to markdown \
32 | 	--execute \
33 | 	--ExecutePreprocessor.kernel_name='venv-dbcooper-py' \
34 | 	--TagRemovePreprocessor.remove_all_outputs_tags='hide-cell' \
35 | 	--TagRemovePreprocessor.remove_input_tags='hide-cell' \
36 | 	--output $@
37 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Michael Chow
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/dbcooper/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from siuba.tests.helpers import SqlBackend, BigqueryBackend, CloudBackend
4 | from dbcooper.tests.helpers import create_examples
5 | 
6 | params_backend = [
7 |     pytest.param(lambda: SqlBackend("postgresql"), id = "postgresql", marks=pytest.mark.postgresql),
8 |     pytest.param(lambda: SqlBackend("mysql"), id = "mysql", marks=pytest.mark.mysql),
9 |     pytest.param(lambda: SqlBackend("sqlite"), id = "sqlite", marks=pytest.mark.sqlite),
10 |     pytest.param(lambda: SqlBackend("duckdb"), id = "duckdb", marks=pytest.mark.duckdb),
11 |     pytest.param(lambda: BigqueryBackend("bigquery"), id = "bigquery", marks=pytest.mark.bigquery),
12 |     pytest.param(lambda: CloudBackend("snowflake"), id = "snowflake", marks=pytest.mark.snowflake),
13 | ]
14 | 
15 | @pytest.fixture(params=params_backend, scope = "session")
16 | def backend(request):
17 |     backend = request.param()
18 |     if backend.name in ["snowflake", "bigquery"]:
19 |         # We can't easily set up and teardown new databases for cloud providers
20 |         # so rely on creating the data outside of tests
21 |         pass
22 |     else:
23 |         create_examples(backend.engine)
24 | 
25 |     return backend
26 | 
--------------------------------------------------------------------------------
/examples/backends.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | jupyter:
3 |   jupytext:
4 |     formats: ipynb,Rmd
5 |     text_representation:
6 |       extension: .Rmd
7 |       format_name: rmarkdown
8 |       format_version: '1.2'
9 |       jupytext_version: 1.13.7
10 |   kernelspec:
11 |     display_name: venv-dbcooper-py
12 |     language: python
13 |     name: venv-dbcooper-py
14 | ---
15 | 
16 | ```{python}
17 | from dotenv import load_dotenv
18 | from dbcooper.utils import SingleGeneric
19 | from sqlalchemy.schema import CreateSchema
20 | 
21 | load_dotenv()
22 | 
23 | from dbcooper.tests.helpers import create_examples
24 | ```
25 | 
26 | ```{python}
27 | from siuba.tests.helpers import CloudBackend, SqlBackend, BigqueryBackend
28 | 
29 | be_snow = CloudBackend("snowflake")
30 | be_sqlite = SqlBackend("sqlite")
31 | be_pg = SqlBackend("postgresql")
32 | be_mysql = SqlBackend("mysql")
33 | be_bq = BigqueryBackend("bigquery")
34 | ```
35 | 
36 | ```{python}
37 | #create_examples(be_snow.engine)
38 | ```
39 | 
40 | ```{python}
41 | from dbcooper.autotables import AutoTable
42 | from dbcooper.builder import TableFinder
43 | 
44 | #find_from_schema = TableFinder(exclude=(format_from_part="table")
45 | 
46 | tbl = AutoTable(be_snow.engine)#, find_from_schema)
47 | tbl._init()
48 | ```
49 | 
50 | ```{python}
51 | tbl.list()
52 | ```
53 | 
--------------------------------------------------------------------------------
/dbcooper/utils.py:
--------------------------------------------------------------------------------
1 | class SingleGeneric:
2 |     def __init__(self, name, dispatch_on_attr = "name"):
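        # (a tiny single-dispatch helper: implementations are registered under
        # a string key and chosen by an attribute of the first argument, its
        # .name by default, e.g. a sqlalchemy dialect; a registered default
        # acts as the fallback)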
3 | self.name = name 4 | self.registry = {} 5 | self.dispatch_on_attr = dispatch_on_attr 6 | self.default = None 7 | 8 | def __call__(self, dialect, *args, **kwargs): 9 | f_concrete = self.dispatch(dialect) 10 | 11 | return f_concrete(dialect, *args, **kwargs) 12 | 13 | def trait(self, obj): 14 | return getattr(obj, self.dispatch_on_attr) 15 | 16 | def dispatch(self, obj): 17 | type_str = self.trait(obj) 18 | 19 | try: 20 | f_concrete = self.registry[type_str] 21 | except KeyError: 22 | if self.default is not None: 23 | f_concrete = self.default 24 | else: 25 | raise NotImplementedError(f"Cannot dispatch on {type_str} and no default implementation.") 26 | 27 | return f_concrete 28 | 29 | def register(self, type_str, func=None): 30 | # allow it to function as a decorator 31 | if func is None: 32 | return lambda f: self.register(type_str, f) 33 | 34 | self.registry[type_str] = func 35 | 36 | return func 37 | 38 | def register_default(self, func): 39 | self.default = func 40 | 41 | def __repr__(self): 42 | return f"{type(self)}({self.name})" 43 | 44 | 45 | -------------------------------------------------------------------------------- /binder/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.8 3 | # To update, run: 4 | # 5 | # pip-compile --extra=binder --output-file=- setup.cfg 6 | # 7 | attrs==21.4.0 8 | # via jsonschema 9 | fastjsonschema==2.15.3 10 | # via nbformat 11 | importlib-resources==5.7.1 12 | # via 13 | # jsonschema 14 | # lahman 15 | jsonschema==4.6.0 16 | # via nbformat 17 | jupyter-core==4.10.0 18 | # via nbformat 19 | jupytext==1.13.8 20 | # via dbcooper (setup.cfg) 21 | lahman==0.0.1 22 | # via dbcooper (setup.cfg) 23 | markdown-it-py==2.1.0 24 | # via 25 | # jupytext 26 | # mdit-py-plugins 27 | mdit-py-plugins==0.3.0 28 | # via jupytext 29 | mdurl==0.1.1 30 | # via markdown-it-py 31 | nbformat==5.4.0 32 | # via jupytext 33 | numpy==1.22.4 34 | # via 35 | # pandas 36 | # siuba 37 | pandas==1.4.2 38 | # via 39 | # lahman 40 | # siuba 41 | pyrsistent==0.18.1 42 | # via jsonschema 43 | python-dateutil==2.8.2 44 | # via pandas 45 | pytz==2022.1 46 | # via pandas 47 | pyyaml==6.0 48 | # via 49 | # jupytext 50 | # siuba 51 | siuba==0.3.0 52 | # via dbcooper (setup.cfg) 53 | six==1.16.0 54 | # via python-dateutil 55 | sqlalchemy==1.4.37 56 | # via 57 | # dbcooper (setup.cfg) 58 | # siuba 59 | tabulate==0.8.9 60 | # via dbcooper (setup.cfg) 61 | toml==0.10.2 62 | # via jupytext 63 | traitlets==5.2.2.post1 64 | # via 65 | # jupyter-core 66 | # nbformat 67 | zipp==3.8.0 68 | # via importlib-resources 69 | -------------------------------------------------------------------------------- /examples/lahman.Rmd: -------------------------------------------------------------------------------- 1 | ```{python} 2 | # %load_ext autoreload 3 | ``` 4 | 5 | ```{python} 6 | from dbcooper import DbCooper 7 | from sqlalchemy import create_engine 8 | import lahman 9 | 10 | def load_tables_for_engine(engine, exclude=[], **kwargs): 11 | for name in lahman._accessors: 12 | if name in exclude: continue 13 | df = getattr(lahman, name)() 14 | df.to_sql(name, engine, **kwargs) 15 | ``` 16 | 17 | The example below shows 3 modes: 18 | 19 | * simple: table names are `_`. 20 | * formatted: table names are `
<table>`.
21 | * grouped: each schema is its own dictionary, accessed using `.<schema>.<table>
`. 22 | 23 | 24 | ## Sqlite (simple) 25 | 26 | ```{python} 27 | engine = create_engine("sqlite://") 28 | engine.execute("ATTACH ':memory:' AS lahman") 29 | load_tables_for_engine(engine, schema="lahman") 30 | ``` 31 | 32 | ```{python} 33 | tbl_flat = DbCooper(engine) 34 | ``` 35 | 36 | ```{python} 37 | tbl_flat.lahman_allstar_full() 38 | ``` 39 | 40 | ## Formatting table names 41 | 42 | ```{python} 43 | from dbcooper import AccessorBuilder 44 | 45 | # omits schema, and keeps only table name 46 | builder = AccessorBuilder(format_from_part="table") 47 | 48 | tbl_flat2 = DbCooper(engine, accessor_builder=builder) 49 | tbl_flat2.allstar_full() 50 | ``` 51 | 52 | ## Grouping tables by schema 53 | 54 | ```{python} 55 | from dbcooper import TableFinder 56 | from dbcooper.finder import AccessorHierarchyBuilder 57 | 58 | tbl_nested = DbCooper(engine, accessor_builder=AccessorHierarchyBuilder()) 59 | tbl_nested.lahman.allstar_full() 60 | ``` 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "wheel", "setuptools_scm>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | 7 | [tool.setuptools.packages.find] 8 | include = ["dbcooper"] 9 | 10 | [project] 11 | name = "dbcooper" 12 | description = "The dbcooper package turns a database connection into a collection of functions, handling logic for keeping track of connections and letting you take advantage of autocompletion when exploring a database." 13 | readme = "README.md" 14 | keywords = ["template", "packaging"] 15 | license.text = "MIT" 16 | authors = [ 17 | { name = "Michael Chow", email = "mc_al_github@fastmail.com" } 18 | ] 19 | dynamic = ["version"] 20 | classifiers = [ 21 | "Programming Language :: Python :: 3.8", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | ] 25 | dependencies = [ 26 | "sqlalchemy", 27 | "tabulate", 28 | ] 29 | requires-python = ">=3.10" 30 | 31 | [project.optional-dependencies] 32 | siuba = [ 33 | "siuba>=0.4.4", 34 | ] 35 | 36 | [dependency-groups] 37 | dev = [ 38 | "pip-tools", 39 | "importlib-resources", 40 | "ipykernel", 41 | "pydata-sphinx-theme", 42 | "pytest", 43 | "pytest-dotenv", 44 | "sqlalchemy-bigquery", 45 | "sphinx~=4.4.0", 46 | "snowflake-sqlalchemy", 47 | "psycopg2-binary", 48 | "pymysql", 49 | "jupytext", 50 | "numpy<2.0", 51 | "polars>=1.33.1", 52 | "duckdb<1.4.0", 53 | "siuba==0.4.5.dev1", 54 | "duckdb-engine>=0.17.0", 55 | "pyarrow>=21.0.0", 56 | ] 57 | 58 | binder = [ 59 | "jupytext", 60 | "lahman" 61 | ] 62 | 63 | [tool.pytest.ini_options] 64 | markers = [ 65 | "ex: a test runs against ExampleClass", 66 | "ex2: a test runs against ExampleClass2", 67 | ] 68 | testpaths = [ 69 | "dbcooper", 70 | ] 71 | 72 | [tool.ruff.lint] 73 | max-line-length = 90 74 | ignore = [ 75 | "E501", # line too long 76 | "W503", # line before binary operator 77 | ] 78 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "dbcooper" 21 | copyright = "2022, Michael Chow" 22 | author = "Michael Chow" 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.autodoc", 32 | "sphinx.ext.autosummary", 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ["_templates"] 37 | 38 | # List of patterns, relative to source directory, that match files and 39 | # directories to ignore when looking for source files. 40 | # This pattern also affects html_static_path and html_extra_path. 41 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 42 | 43 | 44 | # -- Options for HTML output ------------------------------------------------- 45 | 46 | # The theme to use for HTML and HTML Help pages. See the documentation for 47 | # a list of builtin themes. 48 | # 49 | html_theme = "pydata_sphinx_theme" 50 | 51 | # Add any paths that contain custom static files (such as style sheets) here, 52 | # relative to this directory. They are copied after the builtin static files, 53 | # so a file named "default.css" will overwrite the builtin "default.css". 
54 | html_static_path = ["_static"] 55 | -------------------------------------------------------------------------------- /requirements/2022-01-01.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile --extra=dev --output-file=- setup.cfg 6 | # 7 | alabaster==0.7.12 8 | # via sphinx 9 | attrs==21.4.0 10 | # via pytest 11 | babel==2.9.1 12 | # via sphinx 13 | beautifulsoup4==4.10.0 14 | # via pydata-sphinx-theme 15 | certifi==2021.10.8 16 | # via requests 17 | charset-normalizer==2.0.12 18 | # via requests 19 | docutils==0.17.1 20 | # via 21 | # pydata-sphinx-theme 22 | # sphinx 23 | idna==3.3 24 | # via requests 25 | imagesize==1.3.0 26 | # via sphinx 27 | importlib-metadata==4.11.2 28 | # via sphinx 29 | importlib-resources==5.4.0 30 | # via template-python-pkg (setup.cfg) 31 | iniconfig==1.1.1 32 | # via pytest 33 | jinja2==3.0.3 34 | # via sphinx 35 | markupsafe==2.1.0 36 | # via jinja2 37 | packaging==21.3 38 | # via 39 | # pytest 40 | # sphinx 41 | pluggy==1.0.0 42 | # via pytest 43 | py==1.11.0 44 | # via pytest 45 | pydata-sphinx-theme==0.8.0 46 | # via template-python-pkg (setup.cfg) 47 | pygments==2.11.2 48 | # via sphinx 49 | pyparsing==3.0.7 50 | # via packaging 51 | pytest==7.0.1 52 | # via 53 | # pytest-dotenv 54 | # template-python-pkg (setup.cfg) 55 | pytest-dotenv==0.5.2 56 | # via template-python-pkg (setup.cfg) 57 | python-dotenv==0.19.2 58 | # via pytest-dotenv 59 | pytz==2021.3 60 | # via babel 61 | pyyaml==6.0 62 | # via template-python-pkg (setup.cfg) 63 | requests==2.27.1 64 | # via sphinx 65 | snowballstemmer==2.2.0 66 | # via sphinx 67 | soupsieve==2.3.1 68 | # via beautifulsoup4 69 | sphinx==4.4.0 70 | # via pydata-sphinx-theme 71 | sphinxcontrib-applehelp==1.0.2 72 | # via sphinx 73 | sphinxcontrib-devhelp==1.0.2 74 | # via sphinx 75 | sphinxcontrib-htmlhelp==2.0.0 76 | # via sphinx 77 | sphinxcontrib-jsmath==1.0.1 78 | # via sphinx 79 | sphinxcontrib-qthelp==1.0.3 80 | # via sphinx 81 | sphinxcontrib-serializinghtml==1.1.5 82 | # via sphinx 83 | tomli==2.0.1 84 | # via pytest 85 | urllib3==1.26.8 86 | # via requests 87 | zipp==3.7.0 88 | # via 89 | # importlib-metadata 90 | # importlib-resources 91 | -------------------------------------------------------------------------------- /dbcooper/tests/test_example_schemas.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from polars import DataFrame as PlDataFrame 4 | from duckdb import DuckDBPyConnection 5 | 6 | from dbcooper import DbCooper 7 | from dbcooper.tests.helpers import EXAMPLE_SCHEMAS, EXAMPLE_DATA, assert_frame_sort_equal 8 | from dbcooper.tables import DbcSimpleTable 9 | from dbcooper.finder import TableFinder, AccessorBuilder 10 | from dbcooper.collect import to_polars, to_duckdb, name_to_tbl 11 | 12 | from siuba import collect 13 | 14 | @pytest.fixture 15 | def tbl(backend): 16 | if backend.name == "snowflake": 17 | # snowflake can't do reflection on schemas that aren't uppercase, see 18 | # see https://github.com/snowflakedb/snowflake-sqlalchemy/issues/276 19 | tbl = DbCooper(backend.engine, table_factory=DbcSimpleTable) 20 | elif backend.name == "duckdb": 21 | # tests currently assume database name isn't used in accessor 22 | tbl = DbCooper(backend.engine, accessor_builder=AccessorBuilder(format_from_part="schema")) 23 | else: 24 | tbl = DbCooper(backend.engine) 25 | 26 | tbl._init() 27 | return 
tbl 28 | 29 | 30 | 31 | def test_example_number_of_accessors(tbl): 32 | assert len(tbl._accessors) == len(EXAMPLE_SCHEMAS) 33 | 34 | 35 | def test_example_repr_exists(tbl): 36 | if tbl._engine.name == "snowflake": 37 | # see https://github.com/snowflakedb/snowflake-sqlalchemy/issues/276 38 | pytest.xfail() 39 | 40 | for (schema, table_name), attr_name in EXAMPLE_SCHEMAS.items(): 41 | table = getattr(tbl, attr_name) 42 | assert table_name in repr(table) 43 | 44 | 45 | def test_example_data_roundtrip_siuba(tbl): 46 | for (schema, table_name), attr_name in EXAMPLE_SCHEMAS.items(): 47 | table = getattr(tbl, attr_name) 48 | assert_frame_sort_equal(collect(table()), EXAMPLE_DATA) 49 | 50 | def test_to_polars(tbl): 51 | res = to_polars(tbl._engine, name_to_tbl(tbl._engine, "lower", "mai")) 52 | assert isinstance(res, PlDataFrame) 53 | 54 | def test_to_duckdb(tbl): 55 | if tbl._engine.name != "duckdb": 56 | pytest.skip("to_duckdb only works with duckdb engines") 57 | 58 | res = to_duckdb(tbl._engine, name_to_tbl(tbl._engine, "lower", "mai")) 59 | assert isinstance(res, DuckDBPyConnection) 60 | 61 | -------------------------------------------------------------------------------- /dbcooper/collect.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from sqlalchemy import sql 4 | from sqlalchemy.engine import Engine 5 | from sqlalchemy.sql.expression import TextClause 6 | from typing import TYPE_CHECKING 7 | 8 | 9 | if TYPE_CHECKING: 10 | from siuba.sql import LazyTbl 11 | from duckdb import DuckDBPyConnection 12 | from polars import DataFrame as PlDataFrame 13 | 14 | 15 | def query_to_tbl(engine: Engine, query: str) -> TextClause: 16 | 17 | full_query = f""" 18 | SELECT * FROM (\n{query}\n) WHERE 1 = 0 19 | """ 20 | 21 | with engine.connect() as con: 22 | q = con.execute(sql.text(full_query)) 23 | 24 | columns = [sql.column(k) for k in q.keys()] 25 | text_as_from = sql.text(query).columns(*columns).alias() 26 | 27 | return text_as_from 28 | 29 | 30 | def name_to_tbl(engine: Engine, table_name: str, schema: str | None=None) -> sql.TableClause: 31 | # sql dialects like snowflake do not have great reflection capabilities, 32 | # so we execute a trivial query to discover the column names 33 | explore_table = sql.table(table_name, schema=schema) 34 | trivial = explore_table.select(sql.text("0 = 1")).add_columns(sql.text("*")) 35 | 36 | with engine.connect() as con: 37 | q = con.execute(trivial) 38 | 39 | columns = [sql.column(k) for k in q.keys()] 40 | return sql.table(table_name, *columns, schema=schema) 41 | 42 | 43 | def to_siuba(engine: Engine, expr: str | TextClause | sql.TableClause) -> LazyTbl: 44 | from siuba.sql import LazyTbl 45 | 46 | expr = query_to_tbl(engine, expr) if isinstance(expr, str) else expr 47 | 48 | return LazyTbl(engine, expr) 49 | 50 | 51 | def to_polars(engine: Engine, expr: str | TextClause | sql.TableClause) -> PlDataFrame: 52 | from polars import read_database 53 | 54 | expr = query_to_tbl(engine, expr) if isinstance(expr, str) else expr 55 | 56 | if isinstance(expr, sql.TableClause): 57 | expr = expr.select().add_columns() 58 | 59 | with engine.connect() as con: 60 | return read_database(expr, con) 61 | 62 | 63 | def to_duckdb(engine: Engine, expr: str | TextClause | sql.TableClause) -> DuckDBPyConnection: 64 | import duckdb 65 | 66 | if engine.name != "duckdb": 67 | raise ValueError("This function only works with duckdb engines") 68 | 69 | expr = query_to_tbl(engine, expr) if isinstance(expr, 
str) else expr 70 | 71 | if isinstance(expr, sql.TableClause): 72 | expr = expr.select().add_columns() 73 | 74 | with engine.connect() as con: 75 | # assumes we are using duckdb_engine 76 | # TODO: expr should be compiled? 77 | return con.connection.execute(str(expr)) 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac OSX ===================================================================== 2 | .DS_Store 3 | 4 | # Vim ========================================================================= 5 | .*.sw[po] 6 | 7 | # Python ====================================================================== 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /dbcooper/tables.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from tabulate import tabulate 4 | from typing import TYPE_CHECKING 5 | from sqlalchemy import Table, MetaData 6 | 7 | from .collect import name_to_tbl, to_siuba 8 | 9 | if TYPE_CHECKING: 10 | import sqlalchemy as sqla 11 | from sqlalchemy.engine import Engine 12 | 13 | class DbcSimpleTable: 14 | """Represent a database table.""" 15 | def __init__(self, engine: Engine, table_name: str, schema: str | None = None, to_frame=to_siuba): 16 | self.engine = engine 17 | self.table_name = table_name 18 | self.schema = schema 19 | self.to_frame = to_frame 20 | 21 | def __repr__(self): 22 | repr_args = map(repr, [self.table_name, self.schema]) 23 | joined_repr = ", ".join(repr_args) 24 | return f"{self.__class__.__name__}(..., {joined_repr})" 25 | 26 | def _repr_html_(self): 27 | raise NotImplementedError() 28 | 29 | def __call__(self): 30 | sqla_tbl = self._create_table() 31 | return self.to_frame(self.engine, sqla_tbl) 32 | 33 | def _create_table(self) -> sqla.sql.TableClause: 34 | return name_to_tbl(self.engine, self.table_name, self.schema) 35 | 36 | 37 | class DbcDocumentedTable(DbcSimpleTable): 38 | """Represent a database table with a nice column summary (including comments). 39 | 40 | Note that this class's objects return a siuba LazyTbl when called, and print 41 | out the table and column descriptions otherwise. 42 | """ 43 | 44 | table_comment_fields = {"name": "name", "type": "type", "description": "comment"} 45 | 46 | def _create_table(self) -> sqla.Table: 47 | table = Table(self.table_name, MetaData(), schema=self.schema, autoload_with = self.engine) 48 | return table 49 | 50 | # methods for representation ---------------------------------------------- 51 | 52 | def _col_to_row(self, col): 53 | return {k: getattr(col, v) for k,v in self.table_comment_fields.items()} 54 | 55 | def _repr_body(self, table, tablefmt): 56 | rows = [self._col_to_row(col) for col in table.columns] 57 | return tabulate(rows, headers="keys", tablefmt=tablefmt) 58 | 59 | @staticmethod 60 | def _get_table_comment(table): 61 | if table.comment is None: 62 | return "(No table description.)" 63 | else: 64 | return table.comment 65 | 66 | def _repr_html_(self): 67 | table = self._create_table() 68 | 69 | table_comment = self._get_table_comment(table) 70 | 71 | return f"""\ 72 |
<h3> {table.name} </h3>
73 | <p> {table_comment} </p>
74 | {self._repr_body(table, "html")}\ 75 | """ 76 | 77 | def __repr__(self): 78 | table = self._create_table() 79 | 80 | table_comment = self._get_table_comment(table) 81 | 82 | return f"""\ 83 | {table.name} 84 | {table_comment} 85 | 86 | {self._repr_body(table, "simple")}\ 87 | """ 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /dbcooper/dbcooper.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from sqlalchemy import create_engine 4 | 5 | from .finder import TableFinder, AccessorBuilder 6 | from .tables import DbcDocumentedTable 7 | from .collect import query_to_tbl, name_to_tbl, to_siuba 8 | 9 | import typing 10 | 11 | if typing.TYPE_CHECKING: 12 | from sqlalchemy.engine import Engine 13 | 14 | 15 | class DbCooper: 16 | def __init__( 17 | self, 18 | engine: "str | Engine", 19 | table_finder=TableFinder(), 20 | table_factory=DbcDocumentedTable, 21 | accessor_builder=AccessorBuilder(), 22 | to_frame=to_siuba, 23 | initialize=True, 24 | ): 25 | 26 | if isinstance(engine, str): 27 | engine = create_engine(engine) 28 | 29 | self._engine: Engine = engine 30 | self._accessors = {} 31 | self._table_finder = table_finder 32 | self._table_factory = table_factory 33 | self._accessor_builder = accessor_builder 34 | self._to_frame = to_frame 35 | 36 | if initialize: 37 | self._init() 38 | 39 | def __getattr__(self, k): 40 | if k in self._accessors: 41 | return self._accessors[k] 42 | 43 | raise AttributeError("No such attribute %s" % k) 44 | 45 | def __getitem__(self, k): 46 | if k in self._accessors: 47 | return self._accessors[k] 48 | 49 | raise AttributeError("No such attribute %s" % k) 50 | 51 | 52 | def __dir__(self): 53 | dbc_methods = ["reset", "query", "list", "tbl"] 54 | return dbc_methods + list(self._accessors.keys()) 55 | 56 | def _ipython_key_completions_(self): 57 | return list(self._accessors) 58 | 59 | def _init(self): 60 | with self._engine.connect() as conn: 61 | table_map = self._table_finder.map_tables(self._engine.dialect, conn) 62 | 63 | accessors = self._accessor_builder.create_accessors( 64 | self._engine, 65 | self._table_factory, 66 | table_map, 67 | self._to_frame, 68 | ) 69 | self._accessors = accessors 70 | 71 | def reset(self): 72 | self._init() 73 | 74 | def list(self, raw=False): 75 | dialect = self._engine.dialect 76 | with self._engine.connect() as conn: 77 | tables = self._table_finder.list_tables(dialect, conn) 78 | 79 | if raw: 80 | return tables 81 | else: 82 | results = [] 83 | for table in tables: 84 | ident = self._table_finder.identify_table(dialect, table) 85 | results.append(self._table_finder.join_identifiers(ident)) 86 | 87 | return results 88 | 89 | def query(self, query): 90 | expr = query_to_tbl(self._engine, query) 91 | return self._to_frame(self._engine, expr) 92 | 93 | def tbl(self, name, schema=None): 94 | expr = name_to_tbl(self._engine, name, schema) 95 | return self._to_frame(self._engine, expr) 96 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: ["main", "dev-*"] 7 | pull_request: 8 | release: 9 | types: [published] 10 | 11 | jobs: 12 | run-if: 13 | name: "Run If" 14 | runs-on: ubuntu-latest 15 | if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false 
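    # this job gates the rest of the workflow: it runs for pushes and for PRs
    # from the same repo, but skips PRs from forks (fork PRs can't read the
    # repository secrets that the test jobs below rely on)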
16 | steps: 17 | - run: | 18 | echo "Running CI" 19 | test-python: 20 | name: "Test Python Version" 21 | needs: ["run-if"] 22 | runs-on: ubuntu-latest 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | # Checks based on python versions --- 27 | python-version: ["3.10", "3.11", "3.12"] 28 | requirements: [""] 29 | 30 | steps: 31 | - uses: actions/checkout@v2 32 | 33 | # setup docker and gcp authentication ---- 34 | # 35 | - name: Run docker-compose 36 | run: | 37 | docker compose up --build -d 38 | 39 | - name: Set up Cloud SDK 40 | uses: google-github-actions/setup-gcloud@v0 41 | with: 42 | project_id: siuba-tests 43 | service_account_key: ${{ secrets.GCP_SA_KEY }} 44 | export_default_credentials: true 45 | 46 | # install python w/ dependencies, and run tests ---- 47 | # 48 | - uses: actions/setup-python@v2 49 | with: 50 | python-version: "${{ matrix.python-version }}" 51 | - name: Install uv 52 | uses: astral-sh/setup-uv@v6 53 | - name: Install dependencies 54 | run: | 55 | uv sync --dev 56 | - name: Run tests 57 | run: | 58 | uv run pytest -m "postgresql" 59 | env: 60 | SB_TEST_MYSQLPORT: 3307 61 | SB_TEST_PGPORT: 5433 62 | 63 | SB_TEST_BQDATABASE: "" 64 | SB_TEST_BQPROJECT: dbcooper-tests 65 | 66 | SB_TEST_SNOWFLAKEDATABASE: "DBCOOPER_DB1" 67 | SB_TEST_SNOWFLAKEUSER: "DBCOOPER_CI" 68 | SB_TEST_SNOWFLAKEPASSWORD: ${{ secrets.SB_TEST_SNOWFLAKEPASSWORD }} 69 | SB_TEST_SNOWFLAKEHOST: "qf04441.us-east-2.aws" 70 | SB_TEST_SNOWFLAKEOPTIONS: "warehouse=COMPUTE_WH&role=USER_DBCOOPER_CI" 71 | 72 | build-docs: 73 | name: "Build Docs" 74 | needs: ["run-if"] 75 | runs-on: ubuntu-latest 76 | steps: 77 | - uses: actions/checkout@v2 78 | - uses: actions/setup-python@v2 79 | with: 80 | python-version: "3.10" 81 | - name: Install dependencies from requirements file 82 | run: | 83 | python -m pip install --upgrade pip 84 | python -m pip install -r requirements/dev.txt 85 | - name: Build docs 86 | run: | 87 | make docs-build 88 | 89 | release-pypi: 90 | name: "Release to pypi" 91 | runs-on: ubuntu-latest 92 | if: github.event_name == 'release' 93 | needs: [build-docs] 94 | steps: 95 | - uses: actions/checkout@v2 96 | - uses: actions/setup-python@v2 97 | with: 98 | python-version: "3.10" 99 | - name: "Build Package" 100 | run: | 101 | python -m pip install build wheel 102 | python -m build --sdist --wheel 103 | - name: "Deploy to Test PyPI" 104 | uses: pypa/gh-action-pypi-publish@release/v1 105 | with: 106 | user: __token__ 107 | password: ${{ secrets.PYPI_API_TOKEN }} 108 | -------------------------------------------------------------------------------- /dbcooper/tests/helpers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from dbcooper.utils import SingleGeneric 4 | from sqlalchemy.sql.elements import quoted_name 5 | 6 | from siuba.tests.helpers import SqlBackend, BigqueryBackend, assert_frame_sort_equal 7 | 8 | EXAMPLE_SCHEMAS = { 9 | ("mai", "lower"): "mai_lower", 10 | ("mai", "UPPER"): "mai_UPPER", 11 | ("mai", "MiXeD"): "mai_MiXeD", 12 | 13 | ("MAIN_UPPER", "some_table"): "MAIN_UPPER_some_table", 14 | } 15 | 16 | 17 | EXAMPLE_DATA = pd.DataFrame({"x": [1,2,3], "y": ['a', 'b', 'b']}) 18 | 19 | # utilities ------------------------------------------------------------------- 20 | 21 | write_table = SingleGeneric("write_table") 22 | 23 | @write_table.register_default 24 | def _wt_default(engine, df, table_name, schema): 25 | return df.to_sql(quoted_name(table_name, True), engine, schema=quoted_name(schema, True), 
if_exists="replace",index=False) 26 | 27 | @write_table.register("sqlite") 28 | def _wt_sqlite(engine, df, table_name, schema): 29 | return df.to_sql(quoted_name(table_name, True), engine, schema=quoted_name(schema, True), if_exists="replace",index=False) 30 | 31 | 32 | @write_table.register("snowflake") 33 | def _wt_snowflake(engine, df, table_name, schema): 34 | # Note that I have literally spent more time trying to support writing 35 | # case sensitive schema + table names to snowflake, than in the development 36 | # of the rest of this library. The sqlalchemy dialect is not made for it, 37 | # the python connector methods fail silently, and pandas to_sql fails on 38 | # reflection (due to dialect issues). 39 | from sqlalchemy.sql.elements import quoted_name 40 | from snowflake.connector.pandas_tools import write_pandas, pd_writer 41 | 42 | ip = engine.dialect.identifier_preparer 43 | quoted_schema = ip.quote_identifier(schema) 44 | quoted_table_name = ip.quote_identifier(table_name) 45 | with engine.connect() as conn: 46 | conn.execute(f""" 47 | CREATE OR REPLACE TABLE {quoted_schema}.{quoted_table_name} ( 48 | x integer, 49 | y varchar(100) 50 | ) 51 | """) 52 | 53 | conn.execute(f""" 54 | INSERT INTO {quoted_schema}.{quoted_table_name} 55 | VALUES (1, 'a'), 56 | (2, 'b'), 57 | (3, 'b') 58 | """) 59 | #sf_conn = conn.connection.connection 60 | #conn.execute(f"CREATE SCHEMA IF NOT EXISTS {schema_name}") 61 | #write_pandas( 62 | # sf_conn, 63 | # df, table_name, schema= schema, auto_create_table=True, 64 | #) 65 | 66 | 67 | create_examples = SingleGeneric("create_examples") 68 | 69 | @create_examples.register_default 70 | def _create_examples_default(engine): 71 | ip = engine.dialect.identifier_preparer 72 | 73 | for schema, table in EXAMPLE_SCHEMAS.keys(): 74 | with engine.connect() as conn: 75 | conn.execute(f"CREATE SCHEMA IF NOT EXISTS {ip.quote_identifier(schema)}") 76 | write_table(engine, EXAMPLE_DATA, table, schema) 77 | 78 | @create_examples.register("sqlite") 79 | def _create_examples_sqlite(engine): 80 | ip = engine.dialect.identifier_preparer 81 | prev_schemas = set() 82 | 83 | for schema, table in EXAMPLE_SCHEMAS.keys(): 84 | if schema not in prev_schemas: 85 | with engine.connect() as conn: 86 | conn.execute(f"ATTACH DATABASE ':memory:' AS {ip.quote_identifier(schema)}") 87 | 88 | prev_schemas.add(schema) 89 | write_table(engine, EXAMPLE_DATA, table, schema) 90 | 91 | -------------------------------------------------------------------------------- /dbcooper/finder.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Mapping 4 | 5 | from .inspect import TableName, TableIdentity, list_tables, format_table, identify_table 6 | 7 | from typing import TYPE_CHECKING, Callable 8 | 9 | 10 | if TYPE_CHECKING: 11 | from .tables import DbcSimpleTable 12 | 13 | 14 | def _set_default(d, k, default): 15 | """Same behavior as dict.setdefault""" 16 | if k in d: 17 | return d[k] 18 | else: 19 | d[k] = default 20 | return default 21 | 22 | class AttributeDict(Mapping): 23 | """Similar to a dictionary, except items may also be accessed as attributes.""" 24 | 25 | def __init__(self, d=None): 26 | if d is None: 27 | self._d = {} 28 | else: 29 | # make a copy, just to be safe 30 | self._d = {**d} 31 | 32 | def __getitem__(self, k): 33 | return self._d[k] 34 | 35 | def __iter__(self): 36 | return iter(self._d) 37 | 38 | def __len__(self): 39 | return len(self._d) 40 | 41 | def 
__getattr__(self, k): 42 | if k in self._d: 43 | return self._d[k] 44 | 45 | raise AttributeError("No attribute %s" % k) 46 | 47 | def __setitem__(self, k, v): 48 | self._d[k] = v 49 | 50 | def __dir__(self): 51 | return list(self._d.keys()) 52 | 53 | def __repr__(self): 54 | repr_d = repr(self._d) 55 | return f"{self.__class__.__name__}({repr_d})" 56 | 57 | 58 | 59 | class TableFinder: 60 | # TODO: rename exclude_schemas 61 | # TODO: format="lowercase", exclude_schemas, exclude_tables 62 | def __init__(self, 63 | exclude_schemas=None, 64 | identify_from_part=None, 65 | ): 66 | # TODO: filter method 67 | self.exclude_schemas = exclude_schemas 68 | self.identify_from_part = identify_from_part 69 | 70 | def list_tables(self, dialect, conn): 71 | # first use generic method that dispatches on dialect name 72 | return list_tables(dialect, conn, self.exclude_schemas) 73 | 74 | 75 | def identify_table(self, dialect, table: TableName): 76 | return identify_table(dialect, table, self.identify_from_part) 77 | 78 | def join_identifiers(self, ident: TableIdentity): 79 | return f"{ident.schema}.{ident.table}" 80 | 81 | def map_tables(self, dialect, conn) -> Mapping[TableName, TableIdentity]: 82 | table_map = {} 83 | table_names = self.list_tables(dialect, conn) 84 | 85 | for name in table_names: 86 | ident_table = self.identify_table(dialect, name) 87 | table_map[name] = ident_table 88 | 89 | return table_map 90 | 91 | 92 | class AccessorBuilder: 93 | def __init__( 94 | self, 95 | format_from_part=None, 96 | name_format: "str | Callable[TableName, str]" = "identity", 97 | ): 98 | self.format_from_part=format_from_part 99 | self.name_format=name_format 100 | 101 | def format_table(self, dialect, table: TableName): 102 | if callable(self.name_format): 103 | return self.name_format(table) 104 | 105 | # first use generic method that dispatches on dialect name 106 | table = format_table(dialect, table, self.format_from_part) 107 | 108 | if self.name_format == "lower": 109 | return table.lower() 110 | elif self.name_format == "identity": 111 | return table 112 | else: 113 | raise ValueError( 114 | "Unknown name_format argument type: {type(self.name_format)}" 115 | ) 116 | 117 | def create_accessors(self, engine, table_factory: DbcSimpleTable, table_map: Mapping[TableName, TableIdentity], to_frame): 118 | accessors = AttributeDict() 119 | 120 | for table, ident in table_map.items(): 121 | fmt_name = self.format_table(engine.dialect, table) 122 | if fmt_name in accessors: 123 | raise Exception("multiple tables w/ formatted name: %s" % fmt_name) 124 | 125 | accessors[fmt_name] = table_factory(engine, ident.table, ident.schema, to_frame) 126 | 127 | return accessors 128 | 129 | 130 | class AccessorHierarchyBuilder(AccessorBuilder): 131 | def __init__( 132 | self, *args, omit_database=True, **kwargs 133 | ): 134 | super().__init__(*args, **kwargs) 135 | self.format_from_part="table" 136 | self.omit_database = omit_database 137 | 138 | def _group_by_level(self, table_map): 139 | from itertools import groupby 140 | 141 | sorted_items = sorted( 142 | sorted(table_map.items(), key=lambda x: x[0].database or ""), 143 | key=lambda x: x[0].schema or "" 144 | ) 145 | 146 | grouped = groupby(sorted_items, lambda x: (x[0].database, x[0].schema)) 147 | return {group_key: dict(iter_) for group_key, iter_ in grouped} 148 | 149 | def create_accessors(self, engine, table_factory, table_map, to_frame): 150 | 151 | grouped = self._group_by_level(table_map) 152 | 153 | res = AttributeDict() 154 | for (db, schema), sub_map in 
grouped.items():
155 |             sub_accessors = super().create_accessors(engine, table_factory, sub_map, to_frame)
156 |             acc_db = _set_default(res, db, AttributeDict())
157 |             if schema in acc_db:
158 |                 raise ValueError(
159 |                     "Already set accessors for this schema.\n"
160 |                     f"Database name: {db}\n"
161 |                     f"Schema name: {schema}\n"
162 |                 )
163 |             acc_db[schema] = sub_accessors
164 | 
165 |         if self.omit_database:
166 |             if len(res) != 1:
167 |                 raise ValueError(
168 |                     "Omitting database requires exactly 1 database entry, but found "
169 |                     f"the following: {list(res)}"
170 |                 )
171 | 
172 |             # return the only entry in the accessors dictionary
173 |             return list(res.values())[0]
174 | 
175 |         return res
176 | 
177 | 
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ```{python tags=c("hide-cell")}
2 | # TODO: is there a way to get it so dbc.list() does not show 1 item per line?
3 | 
4 | # this keeps the pandas dataframe repr from spitting out scoped style tags
5 | # which don't render on github
6 | import pandas as pd
7 | pd.set_option("display.notebook_repr_html", False)
8 | ```
9 | 
10 | # dbcooper-py
11 | 
12 | [![CI](https://github.com/machow/dbcooper-py/actions/workflows/ci.yml/badge.svg)](https://github.com/machow/dbcooper-py/actions/workflows/ci.yml)
13 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/machow/dbcooper-py/HEAD)
14 | 
15 | The dbcooper package turns a database connection into a collection of functions,
16 | handling logic for keeping track of connections and letting you take advantage of
17 | autocompletion when exploring a database.
18 | 
19 | It's especially helpful to use when authoring database-specific Python packages,
20 | for instance in an internal company package or one wrapping a public data source.
21 | 
22 | For the R version see [dgrtwo/dbcooper](https://github.com/dgrtwo/dbcooper).
23 | 
24 | ## Installation
25 | 
26 | ```
27 | pip install dbcooper
28 | ```
29 | 
30 | ## Example
31 | 
32 | ### Initializing the functions
33 | 
34 | The dbcooper package asks you to create the connection first.
35 | As an example, we'll use the Lahman baseball database package (`lahman`).
36 | 
37 | ```{python}
38 | from sqlalchemy import create_engine
39 | from dbcooper.data import lahman_sqlite
40 | 
41 | # connect to sqlite
42 | engine = create_engine("sqlite://")
43 | 
44 | # load the lahman data into the "lahman" schema
45 | lahman_sqlite(engine)
46 | ```
47 | 
48 | Next we'll set up dbcooper.
49 | 
50 | ```{python}
51 | from dbcooper import DbCooper
52 | 
53 | dbc = DbCooper(engine)
54 | ```
55 | 
56 | The `DbCooper` object contains two important things:
57 | 
58 | * Accessors to fetch specific tables.
59 | * Functions for interacting with the underlying database.
60 | 
61 | ### Using table accessors
62 | 
63 | In the example below, we'll use the `"Lahman"."Salaries"` table as an example.
64 | By default, dbcooper makes this accessible as `.lahman_salaries`.
65 | 
66 | **Plain** `.lahman_salaries` prints out table and column info, including types and descriptions.
67 | 
68 | ```{python}
69 | # show table and column descriptions
70 | dbc.lahman_salaries
71 | ```
72 | 
73 | Note that sqlite doesn't support table and column descriptions, so these sections
74 | are empty.
75 | 
76 | **Calling** `.lahman_salaries()` fetches a lazy version of the data.
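The accessor call only builds a query, so no rows are pulled until you ask for
them; if you want the result as a local pandas DataFrame, one option is siuba's
`collect` (a minimal sketch):

```{python}
from siuba import collect

# execute the query and pull the rows into a local pandas DataFrame
collect(dbc.lahman_salaries())
```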
77 | 78 | 79 | ```{python} 80 | dbc.lahman_salaries() 81 | ``` 82 | 83 | Note that this data is a siuba `LazyTbl` object, which you can use to analyze the data. 84 | 85 | ```{python} 86 | from siuba import _, count 87 | 88 | dbc.lahman_salaries() >> count(over_100k = _.salary > 100_000) 89 | ``` 90 | 91 | ### Using database functions 92 | 93 | * `.list()`: Get a list of tables 94 | * `.tbl()`: Access a table that can be worked with using `siuba`. 95 | * `.query()`: Perform a SQL query and work with the result. 96 | * `._engine`: Get the underlying sqlalchemy engine. 97 | 98 | For instance, we could start by finding the names of the tables in the Lahman database. 99 | 100 | ```{python} 101 | dbc.list() 102 | ``` 103 | 104 | We can access one of these tables with `dbc.tbl()`, then put it through any kind 105 | of siuba operation. 106 | 107 | ```{python} 108 | dbc.tbl("Salaries") 109 | ``` 110 | 111 | ```{python} 112 | from siuba import _, count 113 | dbc.tbl("Salaries") >> count(_.yearID, sort=True) 114 | ``` 115 | 116 | If you'd rather start from a SQL query, use the `.query()` method. 117 | 118 | ```{python} 119 | dbc.query(""" 120 | SELECT 121 | playerID, 122 | sum(AB) as AB 123 | FROM Batting 124 | GROUP BY playerID 125 | """) 126 | ``` 127 | 128 | For anything else you might want to do, the sqlalchemy Engine object is available. 129 | For example, the code below shows how you can set its `.echo` attribute, which 130 | tells sqlalchemy to provide useful logs. 131 | 132 | ```{python} 133 | dbc._engine.echo = True 134 | table_names = dbc.list() 135 | ``` 136 | 137 | Note that the log messages above show that the `.list()` method executed two queries: 138 | One to list tables in the "main" schema (which is empty), and one to list tables 139 | in the "lahman" schema. 140 | 141 | 142 | ## Advanced Configuration 143 | 144 | > ⚠️: These behaviors are well tested, but dbcooper's internals and API may change. 145 | 146 | dbcooper can be configured in three ways, each corresponding to a class interface: 147 | 148 | * **TableFinder**: Which tables will be used by `dbcooper`. 149 | * **AccessorBuilder**: How table names are turned into accessors. 150 | * **DbcDocumentedTable**: The class that defines what an accessor will return. 151 | 152 | ```{python} 153 | from sqlalchemy import create_engine 154 | from dbcooper.data import lahman_sqlite 155 | from dbcooper import DbCooper, AccessorBuilder 156 | 157 | engine = create_engine("sqlite://") 158 | lahman_sqlite(engine) 159 | ``` 160 | 161 | ### Excluding a schema 162 | 163 | ```{python} 164 | from dbcooper import TableFinder 165 | 166 | finder = TableFinder(exclude_schemas=["lahman"]) 167 | dbc_no_lahman = DbCooper(engine, table_finder=finder) 168 | dbc_no_lahman.list() 169 | ``` 170 | 171 | 172 | ### Formatting table names 173 | 174 | ```{python} 175 | from dbcooper import AccessorBuilder 176 | 177 | # omits schema, and keeps only table name 178 | # e.g. `salaries`, rather than `lahman_salaries` 179 | builder = AccessorBuilder(format_from_part="table") 180 | 181 | tbl_flat = DbCooper(engine, accessor_builder=builder) 182 | tbl_flat.salaries() 183 | ``` 184 | 185 | ### Grouping tables by schema 186 | 187 | ```{python} 188 | from dbcooper import AccessorHierarchyBuilder 189 | 190 | tbl_nested = DbCooper(engine, accessor_builder=AccessorHierarchyBuilder()) 191 | 192 | # note the form: .
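<schema>.<table>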
193 | tbl_nested.lahman.salaries() 194 | ``` 195 | 196 | ### Don't show table documentation 197 | 198 | ```{python} 199 | from dbcooper import DbcSimpleTable 200 | 201 | dbc_no_doc = DbCooper(engine, table_factory=DbcSimpleTable) 202 | dbc_no_doc.lahman_salaries 203 | ``` 204 | 205 | Note that sqlalchemy dialects like `snowflake-sqlalchemy` cannot look up things 206 | like table and column descriptions as well as other dialects, so `DbcSimpleTable` 207 | may be needed to connect to snowflake (see [this issue](https://github.com/snowflakedb/snowflake-sqlalchemy/issues/276)). 208 | 209 | 210 | ## Developing 211 | 212 | ```shell 213 | # install with development dependencies 214 | pip install -e .[dev] 215 | 216 | # or install from requirements file 217 | pip install -r requirements/dev.txt 218 | ``` 219 | 220 | ### Test 221 | 222 | ```shell 223 | # run all tests, see pytest section of pyproject.toml 224 | pytest 225 | 226 | # run specific backends 227 | pytest -m 'not snowflake and not bigquery' 228 | 229 | # stop on first failure, drop into debugger 230 | pytest -x --pdb 231 | ``` 232 | 233 | 234 | ### Release 235 | 236 | ```shell 237 | # set version number 238 | git tag v0.0.1 239 | 240 | # (optional) push to github 241 | git push origin --tags 242 | 243 | # check version 244 | python -m setuptools_scm 245 | ``` 246 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.8 3 | # To update, run: 4 | # 5 | # pip-compile --extra=dev --output-file=- setup.cfg 6 | # 7 | alabaster==0.7.12 8 | # via sphinx 9 | appnope==0.1.3 10 | # via 11 | # ipykernel 12 | # ipython 13 | asn1crypto==1.5.1 14 | # via 15 | # oscrypto 16 | # snowflake-connector-python 17 | asttokens==2.0.5 18 | # via stack-data 19 | attrs==21.4.0 20 | # via 21 | # jsonschema 22 | # pytest 23 | babel==2.10.2 24 | # via sphinx 25 | backcall==0.2.0 26 | # via ipython 27 | beautifulsoup4==4.11.1 28 | # via pydata-sphinx-theme 29 | cachetools==5.2.0 30 | # via google-auth 31 | certifi==2022.5.18.1 32 | # via 33 | # requests 34 | # snowflake-connector-python 35 | cffi==1.15.0 36 | # via 37 | # cryptography 38 | # snowflake-connector-python 39 | charset-normalizer==2.0.12 40 | # via 41 | # requests 42 | # snowflake-connector-python 43 | click==8.1.3 44 | # via pip-tools 45 | cryptography==36.0.2 46 | # via 47 | # pyopenssl 48 | # snowflake-connector-python 49 | debugpy==1.6.0 50 | # via ipykernel 51 | decorator==5.1.1 52 | # via ipython 53 | docutils==0.17.1 54 | # via 55 | # pydata-sphinx-theme 56 | # sphinx 57 | entrypoints==0.4 58 | # via jupyter-client 59 | executing==0.8.3 60 | # via stack-data 61 | fastjsonschema==2.15.3 62 | # via nbformat 63 | future==0.18.2 64 | # via sqlalchemy-bigquery 65 | google-api-core[grpc]==2.8.1 66 | # via 67 | # google-cloud-bigquery 68 | # google-cloud-bigquery-storage 69 | # google-cloud-core 70 | # sqlalchemy-bigquery 71 | google-auth==2.7.0 72 | # via 73 | # google-api-core 74 | # google-cloud-core 75 | # sqlalchemy-bigquery 76 | google-cloud-bigquery==3.2.0 77 | # via sqlalchemy-bigquery 78 | google-cloud-bigquery-storage==2.13.2 79 | # via 80 | # google-cloud-bigquery 81 | # sqlalchemy-bigquery 82 | google-cloud-core==2.3.1 83 | # via google-cloud-bigquery 84 | google-crc32c==1.3.0 85 | # via google-resumable-media 86 | google-resumable-media==2.3.3 87 | # via google-cloud-bigquery 88 | 
googleapis-common-protos==1.56.2 89 | # via 90 | # google-api-core 91 | # grpcio-status 92 | grpcio==1.46.3 93 | # via 94 | # google-api-core 95 | # google-cloud-bigquery 96 | # grpcio-status 97 | grpcio-status==1.46.3 98 | # via google-api-core 99 | idna==3.3 100 | # via 101 | # requests 102 | # snowflake-connector-python 103 | imagesize==1.3.0 104 | # via sphinx 105 | importlib-metadata==4.11.4 106 | # via sphinx 107 | importlib-resources==5.7.1 108 | # via 109 | # dbcooper (setup.cfg) 110 | # jsonschema 111 | iniconfig==1.1.1 112 | # via pytest 113 | ipykernel==6.14.0 114 | # via dbcooper (setup.cfg) 115 | ipython==8.4.0 116 | # via ipykernel 117 | jedi==0.18.1 118 | # via ipython 119 | jinja2==3.1.2 120 | # via sphinx 121 | jsonschema==4.6.0 122 | # via nbformat 123 | jupyter-client==7.3.4 124 | # via ipykernel 125 | jupyter-core==4.10.0 126 | # via 127 | # jupyter-client 128 | # nbformat 129 | jupytext==1.13.8 130 | # via dbcooper (setup.cfg) 131 | markdown-it-py==2.1.0 132 | # via 133 | # jupytext 134 | # mdit-py-plugins 135 | markupsafe==2.1.1 136 | # via jinja2 137 | matplotlib-inline==0.1.3 138 | # via 139 | # ipykernel 140 | # ipython 141 | mdit-py-plugins==0.3.0 142 | # via jupytext 143 | mdurl==0.1.1 144 | # via markdown-it-py 145 | nbformat==5.4.0 146 | # via jupytext 147 | nest-asyncio==1.5.5 148 | # via 149 | # ipykernel 150 | # jupyter-client 151 | numpy==1.22.4 152 | # via 153 | # pandas 154 | # pyarrow 155 | # siuba 156 | oscrypto==1.3.0 157 | # via snowflake-connector-python 158 | packaging==21.3 159 | # via 160 | # google-cloud-bigquery 161 | # ipykernel 162 | # pydata-sphinx-theme 163 | # pytest 164 | # sphinx 165 | pandas==1.4.2 166 | # via siuba 167 | parso==0.8.3 168 | # via jedi 169 | pep517==0.12.0 170 | # via pip-tools 171 | pexpect==4.8.0 172 | # via ipython 173 | pickleshare==0.7.5 174 | # via ipython 175 | pip-tools==6.6.2 176 | # via dbcooper (setup.cfg) 177 | pluggy==1.0.0 178 | # via pytest 179 | prompt-toolkit==3.0.29 180 | # via ipython 181 | proto-plus==1.20.6 182 | # via 183 | # google-cloud-bigquery 184 | # google-cloud-bigquery-storage 185 | protobuf==3.20.1 186 | # via 187 | # google-api-core 188 | # google-cloud-bigquery 189 | # google-cloud-bigquery-storage 190 | # googleapis-common-protos 191 | # grpcio-status 192 | # proto-plus 193 | psutil==5.9.1 194 | # via ipykernel 195 | psycopg2-binary==2.9.3 196 | # via dbcooper (setup.cfg) 197 | ptyprocess==0.7.0 198 | # via pexpect 199 | pure-eval==0.2.2 200 | # via stack-data 201 | py==1.11.0 202 | # via pytest 203 | pyarrow==6.0.1 204 | # via 205 | # google-cloud-bigquery 206 | # sqlalchemy-bigquery 207 | pyasn1==0.4.8 208 | # via 209 | # pyasn1-modules 210 | # rsa 211 | pyasn1-modules==0.2.8 212 | # via google-auth 213 | pycparser==2.21 214 | # via cffi 215 | pycryptodomex==3.14.1 216 | # via snowflake-connector-python 217 | pydata-sphinx-theme==0.9.0 218 | # via dbcooper (setup.cfg) 219 | pygments==2.12.0 220 | # via 221 | # ipython 222 | # sphinx 223 | pyjwt==2.4.0 224 | # via snowflake-connector-python 225 | pymysql==1.0.2 226 | # via dbcooper (setup.cfg) 227 | pyopenssl==22.0.0 228 | # via snowflake-connector-python 229 | pyparsing==3.0.9 230 | # via packaging 231 | pyrsistent==0.18.1 232 | # via jsonschema 233 | pytest==7.1.2 234 | # via 235 | # dbcooper (setup.cfg) 236 | # pytest-dotenv 237 | pytest-dotenv==0.5.2 238 | # via dbcooper (setup.cfg) 239 | python-dateutil==2.8.2 240 | # via 241 | # google-cloud-bigquery 242 | # jupyter-client 243 | # pandas 244 | python-dotenv==0.20.0 245 | # via 
pytest-dotenv 246 | pytz==2022.1 247 | # via 248 | # babel 249 | # pandas 250 | # snowflake-connector-python 251 | pyyaml==6.0 252 | # via 253 | # jupytext 254 | # siuba 255 | pyzmq==23.1.0 256 | # via jupyter-client 257 | requests==2.28.0 258 | # via 259 | # google-api-core 260 | # google-cloud-bigquery 261 | # snowflake-connector-python 262 | # sphinx 263 | rsa==4.8 264 | # via google-auth 265 | siuba==0.3.0 266 | # via dbcooper (setup.cfg) 267 | six==1.16.0 268 | # via 269 | # asttokens 270 | # google-auth 271 | # grpcio 272 | # python-dateutil 273 | snowballstemmer==2.2.0 274 | # via sphinx 275 | snowflake-connector-python==2.7.8 276 | # via snowflake-sqlalchemy 277 | snowflake-sqlalchemy==1.3.4 278 | # via dbcooper (setup.cfg) 279 | soupsieve==2.3.2.post1 280 | # via beautifulsoup4 281 | sphinx==4.4.0 282 | # via 283 | # dbcooper (setup.cfg) 284 | # pydata-sphinx-theme 285 | sphinxcontrib-applehelp==1.0.2 286 | # via sphinx 287 | sphinxcontrib-devhelp==1.0.2 288 | # via sphinx 289 | sphinxcontrib-htmlhelp==2.0.0 290 | # via sphinx 291 | sphinxcontrib-jsmath==1.0.1 292 | # via sphinx 293 | sphinxcontrib-qthelp==1.0.3 294 | # via sphinx 295 | sphinxcontrib-serializinghtml==1.1.5 296 | # via sphinx 297 | sqlalchemy==1.4.27 298 | # via 299 | # dbcooper (setup.cfg) 300 | # siuba 301 | # snowflake-sqlalchemy 302 | # sqlalchemy-bigquery 303 | sqlalchemy-bigquery==1.4.4 304 | # via dbcooper (setup.cfg) 305 | stack-data==0.2.0 306 | # via ipython 307 | tabulate==0.8.9 308 | # via dbcooper (setup.cfg) 309 | toml==0.10.2 310 | # via jupytext 311 | tomli==2.0.1 312 | # via 313 | # pep517 314 | # pytest 315 | tornado==6.1 316 | # via 317 | # ipykernel 318 | # jupyter-client 319 | traitlets==5.2.2.post1 320 | # via 321 | # ipykernel 322 | # ipython 323 | # jupyter-client 324 | # jupyter-core 325 | # matplotlib-inline 326 | # nbformat 327 | urllib3==1.26.9 328 | # via requests 329 | wcwidth==0.2.5 330 | # via prompt-toolkit 331 | wheel==0.37.1 332 | # via pip-tools 333 | zipp==3.8.0 334 | # via 335 | # importlib-metadata 336 | # importlib-resources 337 | 338 | # The following packages are considered to be unsafe in a requirements file: 339 | # pip 340 | # setuptools 341 | -------------------------------------------------------------------------------- /dbcooper/inspect.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | from sqlalchemy import sql 4 | from sqlalchemy.sql.elements import quoted_name 5 | from sqlalchemy.engine import Dialect 6 | 7 | from .utils import SingleGeneric 8 | from .base import TableName, TableIdentity 9 | 10 | from typing import Sequence 11 | 12 | 13 | 14 | # list_tables generic ========================================================= 15 | 16 | list_tables = SingleGeneric("list_tables") 17 | 18 | @list_tables.register("sqlite") 19 | def _list_tables_sqlite(self: Dialect, conn, exclude=None) -> Sequence[TableName]: 20 | if exclude is None: 21 | exclude = ("INFORMATION_SCHEMA",) 22 | 23 | schemas = self.get_schema_names(conn) 24 | query_str = """SELECT name FROM {0} WHERE type='table' ORDER BY name""" 25 | 26 | results = [] 27 | for schema in schemas: 28 | if schema in exclude: 29 | continue 30 | 31 | qschema = self.identifier_preparer.quote_identifier(schema) 32 | qmaster = f"{qschema}.sqlite_master" 33 | q = conn.exec_driver_sql(query_str.format(qmaster)) 34 | 35 | for row in q: 36 | results.append(TableName(None, schema, row[0])) 37 | 38 | return results 39 | 40 | 41 | @list_tables.register("mysql") 42 | def 
_list_tables_mysql(self: Dialect, conn, exclude=None) -> Sequence[TableName]:
43 |     if exclude is None:
44 |         exclude = tuple()
45 | 
46 |     q = conn.execute(sql.text("""
47 |         SELECT table_schema AS "schema", table_name AS "name"
48 |         FROM INFORMATION_SCHEMA.TABLES
49 |         WHERE
50 |             TABLE_TYPE='BASE TABLE'
51 |             AND TABLE_SCHEMA NOT IN ('mysql', 'performance_schema', 'sys')
52 |     """))
53 | 
54 |     results = [TableName(None, row[0], row[1]) for row in q]
55 |     return _filter_result(results, exclude)
56 | 
57 | 
58 | 
59 | # note: duckdb has its own implementation below
60 | @list_tables.register("postgresql")
61 | def _list_tables_pg(self: Dialect, conn, exclude=None) -> Sequence[TableName]:
62 |     if exclude is None:
63 |         exclude = ("information_schema", "pg_catalog")
64 | 
65 |     q = conn.execute(sql.text("""
66 |         SELECT db.db_name, nspname, relname FROM pg_class c
67 |         JOIN pg_namespace n ON n.oid = c.relnamespace
68 |         CROSS JOIN (SELECT current_database() AS db_name) db
69 |         WHERE
70 |             c.relkind in ('r', 'p', 'v')
71 |     """))
72 | 
73 |     result = [TableName(*row) for row in q]
74 | 
75 |     return _filter_result(result, exclude)
76 | 
77 | 
78 | @list_tables.register("duckdb")
79 | def _list_tables_duckdb(self: Dialect, conn, exclude=None) -> Sequence[TableName]:
80 |     if exclude is None:
81 |         exclude = ("information_schema", "pg_catalog")
82 | 
83 |     q = conn.execute(sql.text("""
84 |         SELECT db.db_name, table_schema, table_name FROM information_schema.tables c
85 |         CROSS JOIN (SELECT current_database() AS db_name) db
86 |     """))
87 | 
88 |     result = [TableName(*row) for row in q]
89 | 
90 |     return _filter_result(result, exclude)
91 | 
92 | 
93 | @list_tables.register("snowflake")
94 | def _list_tables_sf(self: Dialect, conn, exclude=None) -> Sequence[TableName]:
95 | 
96 |     if exclude is None:
97 |         exclude = ("INFORMATION_SCHEMA",)
98 | 
99 |     # snowflake sqlalchemy supports urls with .../<database>/<schema>,
100 |     # so we need to parse them out.
101 |     # note that alternatively, you could get conn.connection.database, etc..
102 |     engine = conn.engine
103 |     _, opts = engine.dialect.create_connect_args(engine.url)
104 |     db_name, schema_name = opts.get("database"), opts.get("schema")
105 | 
106 |     if schema_name:
107 |         full_name = ".".join([db_name, schema_name])
108 |         in_clause = f"IN SCHEMA {full_name}"
109 |     elif db_name:
110 |         in_clause = f"IN DATABASE {db_name}"
111 |     else:
112 |         in_clause = "IN ACCOUNT"
113 | 
114 |     tables = conn.execute(sql.text(
115 |         "SHOW TERSE TABLES " + in_clause
116 |     ))
117 | 
118 |     views = conn.execute(sql.text(
119 |         "SHOW TERSE VIEWS " + in_clause
120 |     ))
121 | 
122 |     result = []
123 |     for row in itertools.chain(tables, views):
124 |         if db_name:
125 |             # a default database is set. snowflake's dialect automatically prepends
126 |             # the default database name everywhere, so we need to set database
127 |             # to None in our results
128 |             result.append(TableName(None, row[4], row[1]))
129 |         else:
130 |             # no default database, so return the database in results. this allows
131 |             # us to specify sqlalchemy.Table(..., schema="<database>.<schema>")
132 |             result.append(TableName(row[3], row[4], row[1]))
133 | 
134 |     return _filter_result(result, exclude)
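135 | 
136 | # An illustrative sketch (not part of this module): support for a new backend
137 | # plugs into the same generic by registering a function with this signature
138 | # under the dialect's sqlalchemy name -- the "mssql" name here is hypothetical:
139 | #
140 | #     @list_tables.register("mssql")
141 | #     def _list_tables_mssql(self: Dialect, conn, exclude=None) -> Sequence[TableName]:
142 | #         ...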
143 | 
144 | 
145 | @list_tables.register("bigquery")
146 | def _list_tables_bq(self: Dialect, conn, exclude=None) -> Sequence[TableName]:
147 |     if exclude is None:
148 |         exclude = ("information_schema",)
149 | 
150 |     from google.api_core import exceptions
151 | 
152 |     client = conn.connection._client
153 |     datasets = client.list_datasets()
154 | 
155 |     result = []
156 |     for dataset in datasets:
157 |         try:
158 |             # pass as page_size by keyword (positionally it would be max_results)
159 |             tables = client.list_tables(dataset.reference, page_size=self.list_tables_page_size)
160 | 
161 |             for table in tables:
162 |                 result.append(TableName(table.project, table.reference.dataset_id, table.table_id))
163 |         except exceptions.NotFound:
164 |             pass
165 | 
166 |     return _filter_result(result, exclude)
167 | 
168 | def _filter_result(result: Sequence[TableName], exclude: "Sequence | set") -> Sequence[TableName]:
169 |     exclude_set = set(exclude)
170 |     return [entry for entry in result if entry.schema not in exclude_set]
171 | 
172 | 
173 | # Table formatter =============================================================
174 | 
175 | format_table = SingleGeneric("format_table")
176 | 
177 | def _join_parts(dialect, parts):
178 |     return "_".join(parts)
179 | 
180 | def _table_from_part(dialect, table, from_part):
181 |     tup = table.to_tuple(exists=True)
182 |     ii = table.field_index_from_end(from_part)
183 |     return _join_parts(dialect, tup[ii:])
184 | 
185 | 
186 | @format_table.register_default
187 | def _format_table_default(self: Dialect, table: TableName, from_part=None) -> str:
188 |     if from_part is not None:
189 |         return _table_from_part(self, table, from_part)
190 | 
191 |     # just use the fully qualified name parts to generate a user-friendly name,
192 |     # e.g. databasename_schemaname_tablename
193 |     tup = table.to_tuple(exists=True)
194 |     return _join_parts(self, tup)
195 | 
196 | #@format_table.register("snowflake")
197 | #def _format_table_sf(self: Dialect, table: TableName, from_part=None) -> str:
198 | #    # names in snowflake are by default case insensitive (like many databases),
199 | #    # however, they are also UPPERCASE. Make lowercase for ease of use.
200 | #    lower = TableName(*[x.lower() if x is not None else x for x in table.to_tuple()])
201 | #    return format_table.default(self, lower, from_part)
202 | 
203 | 
204 | @format_table.register("sqlite")
205 | @format_table.register("postgresql")
206 | @format_table.register("mysql")
207 | @format_table.register("bigquery")
208 | def _format_table_no_db(self: Dialect, table: TableName, from_part=None) -> str:
209 |     """Format a table name using only its schema and table parts.
210 | 
211 |     Note that this function is meant for database implementations that can't
212 |     use the same sqlalchemy engine to query across databases (or that call
213 |     a schema a "database").
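214 |     For example (illustrative values), TableName("my_db", "lahman", "salaries")
215 |     formats to "lahman_salaries"; with from_part="table", to just "salaries".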
216 |     """
217 | 
218 |     if from_part is not None:
219 |         return _table_from_part(self, table, from_part)
220 | 
221 |     # return {schema_name}.{table_name}
222 |     tup = table.to_tuple(exists=True)
223 |     return _join_parts(self, tup[-2:])
224 | 
225 | 
226 | # Table Identifier ============================================================
227 | 
228 | identify_table = SingleGeneric("identify_table")
229 | 
230 | def _identify_default_parts(dialect, parts):
231 |     if len(parts) == 3:
232 |         schema = ".".join(parts[:2])
233 |     elif len(parts) == 2:
234 |         schema = parts[0]
235 |     else:
236 |         schema = None
237 | 
238 |     return TableIdentity(schema, parts[-1])
239 | 
240 | def quote_if_not_upper(x):
241 |     if x != x.upper():
242 |         return quoted_name(x, True)
243 | 
244 |     return x
245 | 
246 | def _identify_snowflake_parts(dialect, parts):
247 |     # Handle snowflake, whose dialect is a bit funky ---
248 |     # basically, snowflake assumes you are being case insensitive,
249 |     # e.g. that some_table means SOME_TABLE. You can escape this with the quoting
250 |     # functions below. However, the snowflake dialect also tries to be clever, and
251 |     # knows that sOmE_tAbLe needs to be escaped.
252 |     #
253 |     # Unfortunately, its code is wrong in a way that makes quoting an all-uppercase
254 |     # string fail, so we have to detect uppercase names ourselves.
255 |     quoted = [dialect.identifier_preparer.quote_identifier(x) for x in parts]
256 |     if len(parts) == 3:
257 |         schema = quoted_name(".".join(quoted[0:2]), False)
258 |     elif len(parts) == 2:
259 |         schema = quote_if_not_upper(parts[0])
260 |     else:
261 |         schema = None
262 | 
263 |     table_name = quote_if_not_upper(parts[-1])
264 |     return TableIdentity(schema, table_name)
265 | 
266 | 
267 | @identify_table.register_default
268 | def _identify_table_default(self: Dialect, table: TableName, from_part=None):
269 |     """Identify a table using only its schema and table name.
270 | 
271 |     Note that this function is meant for database implementations that can't
272 |     use the same sqlalchemy engine to query across databases (or that call
273 |     a schema a "database").
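274 |     For example (illustrative values), TableName("my_db", "lahman", "salaries")
275 |     becomes TableIdentity("lahman", "salaries"), dropping the database part.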
276 |     """
277 | 
278 |     if from_part is not None:
279 |         tup = table.to_tuple(exists=True)
280 |         ii = table.field_index_from_end(from_part)
281 |         return _identify_default_parts(self, tup[ii:])
282 | 
283 |     # Note that the database is omitted
284 |     return _identify_default_parts(self, table.to_tuple(exists=True)[-2:])
285 | 
286 | 
287 | @identify_table.register("snowflake")
288 | def _identify_table_snowflake(self: Dialect, table: TableName, from_part=None):
289 |     if from_part is not None:
290 |         tup = table.to_tuple(exists=True)
291 |         ii = table.field_index_from_end(from_part)
292 |         return _identify_snowflake_parts(self, tup[ii:])
293 | 
294 |     return _identify_snowflake_parts(self, table.to_tuple(exists=True))
295 | 
296 | @identify_table.register("bigquery")
297 | def _identify_table_bigquery(self: Dialect, table: TableName, from_part=None):
298 |     # uses the default implementation (no need for explicit quoting), but
299 |     # includes the database name
300 |     if from_part is None:
301 |         from_part = "database"
302 | 
303 |     return identify_table.default(self, table, from_part=from_part)
304 | --------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dbcooper-py
2 | 
3 | [![CI](https://github.com/machow/dbcooper-py/actions/workflows/ci.yml/badge.svg)](https://github.com/machow/dbcooper-py/actions/workflows/ci.yml)
4 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/machow/dbcooper-py/HEAD)
5 | 
6 | The dbcooper package turns a database connection into a collection of functions,
7 | handling logic for keeping track of connections and letting you take advantage of
8 | autocompletion when exploring a database.
9 | 
10 | It's especially helpful to use when authoring database-specific Python packages,
11 | for instance in an internal company package or one wrapping a public data source.
12 | 
13 | For the R version see [dgrtwo/dbcooper](https://github.com/dgrtwo/dbcooper).
14 | 
15 | ## Installation
16 | 
17 | ```
18 | pip install dbcooper
19 | ```
20 | 
21 | ## Example
22 | 
23 | ### Initializing the functions
24 | 
25 | The dbcooper package asks you to create the connection first.
26 | As an example, we'll use the Lahman baseball database package (`lahman`).
27 | 
28 | 
29 | ```python
30 | from sqlalchemy import create_engine
31 | from dbcooper.data import lahman_sqlite
32 | 
33 | # connect to sqlite
34 | engine = create_engine("sqlite://")
35 | 
36 | # load the lahman data into the "lahman" schema
37 | lahman_sqlite(engine)
38 | ```
39 | 
40 | Next, we'll set up dbcooper.
41 | 
42 | 
43 | ```python
44 | from dbcooper import DbCooper
45 | 
46 | dbc = DbCooper(engine)
47 | ```
48 | 
49 | The `DbCooper` object contains two important things:
50 | 
51 | * Accessors to fetch specific tables.
52 | * Functions for interacting with the underlying database.
53 | 
54 | ### Using table accessors
55 | 
56 | Below, we'll use the `"lahman"."Salaries"` table as an example.
57 | By default, dbcooper makes this accessible as `.lahman_salaries`.
58 | 
59 | **Plain** `.lahman_salaries` prints out table and column info, including types and descriptions.
60 | 
61 | 
62 | ```python
63 | # show table and column descriptions
64 | dbc.lahman_salaries
65 | ```
66 | 
67 | 
68 | 
69 | 
70 | 

71 | **salaries**
72 | 
73 | (No table description.)
74 | 
75 | | name     | type   | description |
76 | |----------|--------|-------------|
77 | | index    | BIGINT |             |
78 | | yearID   | BIGINT |             |
79 | | teamID   | TEXT   |             |
80 | | lgID     | TEXT   |             |
81 | | playerID | TEXT   |             |
82 | | salary   | BIGINT |             |
83 | 
84 | 
85 | 
86 | 
87 | 
88 | Note that sqlite doesn't support table and column descriptions, so these sections
89 | are empty.
90 | 
91 | **Calling** `.lahman_salaries()` fetches a lazy version of the data.
92 | 
93 | 
94 | 
95 | ```python
96 | dbc.lahman_salaries()
97 | ```
98 | 
99 | 
100 | 
101 | 
102 |     # Source: lazy query
103 |     # DB Conn: Engine(sqlite://)
104 |     # Preview:
105 |        index  yearID teamID lgID   playerID  salary
106 |     0      0    1985    ATL   NL  barkele01  870000
107 |     1      1    1985    ATL   NL  bedrost01  550000
108 |     2      2    1985    ATL   NL  benedbr01  545000
109 |     3      3    1985    ATL   NL   campri01  633333
110 |     4      4    1985    ATL   NL  ceronri01  625000
111 |     # .. may have more rows
112 | 
113 | 
114 | 
115 | Note that this data is a siuba `LazyTbl` object, which you can use to analyze the data.
116 | 
117 | 
118 | ```python
119 | from siuba import _, count
120 | 
121 | dbc.lahman_salaries() >> count(over_100k = _.salary > 100_000)
122 | ```
123 | 
124 | 
125 | 
126 | 
127 |     # Source: lazy query
128 |     # DB Conn: Engine(sqlite://)
129 |     # Preview:
130 |        over_100k      n
131 |     0       True  25374
132 |     1      False   1054
133 |     # .. may have more rows
134 | 
135 | 
136 | 
137 | ### Using database functions
138 | 
139 | * `.list()`: Get a list of tables.
140 | * `.tbl()`: Access a table that can be worked with using `siuba`.
141 | * `.query()`: Perform a SQL query and work with the result.
142 | * `._engine`: Get the underlying sqlalchemy engine.
143 | 
144 | For instance, we could start by finding the names of the tables in the Lahman database.
145 | 
146 | 
147 | ```python
148 | dbc.list()
149 | ```
150 | 
151 | 
152 | 
153 | 
154 |     ['lahman.allstar_full',
155 |      'lahman.appearances',
156 |      'lahman.awards_managers',
157 |      'lahman.awards_players',
158 |      'lahman.awards_share_managers',
159 |      'lahman.awards_share_players',
160 |      'lahman.batting',
161 |      'lahman.batting_post',
162 |      'lahman.college_playing',
163 |      'lahman.fielding',
164 |      'lahman.fielding_of',
165 |      'lahman.fielding_ofsplit',
166 |      'lahman.fielding_post',
167 |      'lahman.hall_of_fame',
168 |      'lahman.home_games',
169 |      'lahman.managers',
170 |      'lahman.managers_half',
171 |      'lahman.parks',
172 |      'lahman.people',
173 |      'lahman.pitching',
174 |      'lahman.pitching_post',
175 |      'lahman.salaries',
176 |      'lahman.schools',
177 |      'lahman.series_post',
178 |      'lahman.teams',
179 |      'lahman.teams_franchises',
180 |      'lahman.teams_half']
181 | 
182 | 
183 | 
184 | We can access one of these tables with `dbc.tbl()`, then put it through any kind
185 | of siuba operation.
186 | 
187 | 
188 | ```python
189 | dbc.tbl("Salaries")
190 | ```
191 | 
192 | 
193 | 
194 | 
195 |     # Source: lazy query
196 |     # DB Conn: Engine(sqlite://)
197 |     # Preview:
198 |        index  yearID teamID lgID   playerID  salary
199 |     0      0    1985    ATL   NL  barkele01  870000
200 |     1      1    1985    ATL   NL  bedrost01  550000
201 |     2      2    1985    ATL   NL  benedbr01  545000
202 |     3      3    1985    ATL   NL   campri01  633333
203 |     4      4    1985    ATL   NL  ceronri01  625000
204 |     # .. may have more rows
205 | 
206 | 
207 | 
208 | 
209 | ```python
210 | from siuba import _, count
211 | dbc.tbl("Salaries") >> count(_.yearID, sort=True)
212 | ```
213 | 
214 | 
215 | 
216 | 
217 |     # Source: lazy query
218 |     # DB Conn: Engine(sqlite://)
219 |     # Preview:
220 |        yearID     n
221 |     0    1999  1006
222 |     1    1998   998
223 |     2    1995   986
224 |     3    1996   931
225 |     4    1997   925
226 |     # .. may have more rows
227 | 
228 | 
229 | 
230 | If you'd rather start from a SQL query, use the `.query()` method.
231 | 
232 | 
233 | ```python
234 | dbc.query("""
235 |     SELECT
236 |         playerID,
237 |         sum(AB) as AB
238 |     FROM Batting
239 |     GROUP BY playerID
240 | """)
241 | ```
242 | 
243 | 
244 | 
245 | 
246 |     # Source: lazy query
247 |     # DB Conn: Engine(sqlite://)
248 |     # Preview:
249 |         playerID     AB
250 |     0  aardsda01      4
251 |     1  aaronha01  12364
252 |     2  aaronto01    944
253 |     3   aasedo01      5
254 |     4   abadan01     21
255 |     # .. may have more rows
256 | 
257 | 
258 | 
259 | For anything else you might want to do, the sqlalchemy Engine object is available.
260 | For example, the code below shows how you can set its `.echo` attribute, which
261 | tells sqlalchemy to provide useful logs.
262 | 
263 | 
264 | ```python
265 | dbc._engine.echo = True
266 | table_names = dbc.list()
267 | ```
268 | 
269 |     2022-03-20 22:49:37,553 INFO sqlalchemy.engine.Engine PRAGMA database_list
270 |     2022-03-20 22:49:37,554 INFO sqlalchemy.engine.Engine [raw sql] ()
271 |     2022-03-20 22:49:37,555 INFO sqlalchemy.engine.Engine SELECT name FROM "main".sqlite_master WHERE type='table' ORDER BY name
272 |     2022-03-20 22:49:37,555 INFO sqlalchemy.engine.Engine [raw sql] ()
273 |     2022-03-20 22:49:37,556 INFO sqlalchemy.engine.Engine SELECT name FROM "lahman".sqlite_master WHERE type='table' ORDER BY name
274 |     2022-03-20 22:49:37,557 INFO sqlalchemy.engine.Engine [raw sql] ()
275 | 
276 | 
277 | Note that the log messages above show that the `.list()` method executed two queries:
278 | one to list tables in the "main" schema (which is empty), and one to list tables
279 | in the "lahman" schema.
280 | 
281 | ## Advanced Configuration
282 | 
283 | > ⚠️: These behaviors are well tested, but dbcooper's internals and API may change.
284 | 
285 | dbcooper can be configured in three ways, each corresponding to a class interface:
286 | 
287 | * **TableFinder**: Which tables will be used by `dbcooper`.
288 | * **AccessorBuilder**: How table names are turned into accessors.
289 | * **DbcDocumentedTable**: The class that defines what an accessor will return.
290 | 
291 | 
292 | ```python
293 | from sqlalchemy import create_engine
294 | from dbcooper.data import lahman_sqlite
295 | from dbcooper import DbCooper, AccessorBuilder
296 | 
297 | engine = create_engine("sqlite://")
298 | lahman_sqlite(engine)
299 | ```
300 | 
301 | ### Excluding a schema
302 | 
303 | 
304 | ```python
305 | from dbcooper import TableFinder
306 | 
307 | finder = TableFinder(exclude_schemas=["lahman"])
308 | dbc_no_lahman = DbCooper(engine, table_finder=finder)
309 | dbc_no_lahman.list()
310 | ```
311 | 
312 | 
313 | 
314 | 
315 |     []
316 | 
317 | 
318 | 
319 | ### Formatting table names
320 | 
321 | 
322 | ```python
323 | from dbcooper import AccessorBuilder
324 | 
325 | # omits the schema, keeping only the table name
326 | # e.g. `salaries`, rather than `lahman_salaries`
327 | builder = AccessorBuilder(format_from_part="table")
328 | 
329 | tbl_flat = DbCooper(engine, accessor_builder=builder)
330 | tbl_flat.salaries()
331 | ```
332 | 
333 | 
334 | 
335 | 
336 |     # Source: lazy query
337 |     # DB Conn: Engine(sqlite://)
338 |     # Preview:
339 |        index  yearID teamID lgID   playerID  salary
340 |     0      0    1985    ATL   NL  barkele01  870000
341 |     1      1    1985    ATL   NL  bedrost01  550000
342 |     2      2    1985    ATL   NL  benedbr01  545000
343 |     3      3    1985    ATL   NL   campri01  633333
344 |     4      4    1985    ATL   NL  ceronri01  625000
345 |     # .. may have more rows
346 | 
347 | 
348 | 
349 | ### Grouping tables by schema
350 | 
351 | 
352 | ```python
353 | from dbcooper import AccessorHierarchyBuilder
354 | 
355 | tbl_nested = DbCooper(engine, accessor_builder=AccessorHierarchyBuilder())
356 | 
357 | # note the form: <schema>.<table>
358 | tbl_nested.lahman.salaries()
359 | ```
360 | 
361 | 
362 | 
363 | 
364 |     # Source: lazy query
365 |     # DB Conn: Engine(sqlite://)
366 |     # Preview:
367 |        index  yearID teamID lgID   playerID  salary
368 |     0      0    1985    ATL   NL  barkele01  870000
369 |     1      1    1985    ATL   NL  bedrost01  550000
370 |     2      2    1985    ATL   NL  benedbr01  545000
371 |     3      3    1985    ATL   NL   campri01  633333
372 |     4      4    1985    ATL   NL  ceronri01  625000
373 |     # .. may have more rows
374 | 
375 | 
376 | 
377 | ### Don't show table documentation
378 | 
379 | 
380 | ```python
381 | from dbcooper import DbcSimpleTable
382 | 
383 | dbc_no_doc = DbCooper(engine, table_factory=DbcSimpleTable)
384 | dbc_no_doc.lahman_salaries
385 | ```
386 | 
387 | 
388 | 
389 | 
390 |     DbcSimpleTable(..., 'salaries', 'lahman')
391 | 
392 | 
393 | 
394 | Note that some sqlalchemy dialects, like `snowflake-sqlalchemy`, cannot look up
395 | metadata such as table and column descriptions as well as other dialects can, so `DbcSimpleTable`
396 | may be needed to connect to snowflake (see [this issue](https://github.com/snowflakedb/snowflake-sqlalchemy/issues/276)).
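397 | 
398 | For example, here is a minimal sketch of connecting this way (the URL values are
399 | placeholders to fill in, and the `snowflake-sqlalchemy` package must be installed):
400 | 
401 | ```python
402 | from sqlalchemy import create_engine
403 | from dbcooper import DbCooper, DbcSimpleTable
404 | 
405 | # hypothetical connection details -- substitute your own
406 | sf_engine = create_engine("snowflake://<user>:<password>@<account>/<database>")
407 | dbc_sf = DbCooper(sf_engine, table_factory=DbcSimpleTable)
408 | ```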
409 | 
410 | ## Developing
411 | 
412 | ```shell
413 | # install with development dependencies
414 | pip install -e .[dev]
415 | 
416 | # or install from requirements file
417 | pip install -r requirements/dev.txt
418 | ```
419 | 
420 | ### Test
421 | 
422 | ```shell
423 | # run all tests, see pytest section of pyproject.toml
424 | pytest
425 | 
426 | # run specific backends
427 | pytest -m 'not snowflake and not bigquery'
428 | 
429 | # stop on first failure, drop into debugger
430 | pytest -x --pdb
431 | ```
432 | 
433 | ### Release
434 | 
435 | ```shell
436 | # set the version number
437 | git tag v0.0.1
438 | 
439 | # (optional) push to github
440 | git push origin --tags
441 | 
442 | # check the version
443 | python -m setuptools_scm
444 | ```
445 | --------------------------------------------------------------------------------