├── dask_deltatable
│   ├── py.typed
│   ├── types.py
│   ├── __init__.py
│   ├── utils.py
│   ├── _schema.py
│   ├── write.py
│   └── core.py
├── MANIFEST.in
├── requirements.txt
├── tests
│   ├── data
│   │   ├── empty1.zip
│   │   ├── empty2.zip
│   │   ├── simple.zip
│   │   ├── simple2.zip
│   │   ├── partition.zip
│   │   └── checkpoint.zip
│   ├── test_distributed.py
│   ├── test_write.py
│   ├── test_utils.py
│   ├── test_acceptance.py
│   └── test_core.py
├── dev_requirements.txt
├── .flake8
├── continous_integeration
│   ├── environment-3.10.yaml
│   ├── environment-3.11.yaml
│   ├── environment-3.12.yaml
│   └── environment-3.9.yaml
├── .github
│   └── workflows
│       ├── pre-commit.yaml
│       ├── deploy.yaml
│       └── tests.yaml
├── pyproject.toml
├── setup.cfg
├── .pre-commit-config.yaml
├── setup.py
├── LICENSE
├── conftest.py
├── .gitignore
└── README.md

/dask_deltatable/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include dask_deltatable/py.typed
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dask[dataframe]
2 | deltalake>=1.1.0
3 | fsspec
4 | pyarrow
--------------------------------------------------------------------------------
/tests/data/empty1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/empty1.zip
--------------------------------------------------------------------------------
/tests/data/empty2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/empty2.zip
--------------------------------------------------------------------------------
/tests/data/simple.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/simple.zip
--------------------------------------------------------------------------------
/tests/data/simple2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/simple2.zip
--------------------------------------------------------------------------------
/tests/data/partition.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/partition.zip
--------------------------------------------------------------------------------
/tests/data/checkpoint.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/checkpoint.zip
--------------------------------------------------------------------------------
/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | mypy==0.991
2 | mypy-extensions==0.4.3
3 | pytest==7.2.0
4 | pytest-cov==4.0.0
5 | black==22.3.0
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | # flake8 doesn't support pyproject.toml yet
https://github.com/PyCQA/flake8/issues/234 2 | [flake8] 3 | max-line-length = 120 4 | -------------------------------------------------------------------------------- /dask_deltatable/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Union 4 | 5 | Filter = tuple[str, str, Any] 6 | Filters = Union[list[Filter], list[list[Filter]], None] 7 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.10.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.11.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.12.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.12 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.9.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.9 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /dask_deltatable/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = [ 4 | "read_deltalake", 5 | "read_unity_catalog", 6 | "to_deltalake", 7 | ] 8 | 9 | from .core import read_deltalake as read_deltalake 10 | from .core import read_unity_catalog as read_unity_catalog 11 | from .write import to_deltalake as to_deltalake 12 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Python Style Check 3 | on: [push,pull_request] 4 | 5 | jobs: 6 | checks: 7 | name: pre-commit hooks 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3.5.2 11 | - uses: actions/setup-python@v4 12 | with: 13 | python-version: '3.9' 14 | - uses: pre-commit/action@v3.0.0 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.mypy] 2 | strict = true 3 | no_implicit_reexport = false 4 | allow_incomplete_defs = true 5 | allow_untyped_defs = true 6 | warn_return_any = false 7 | disallow_untyped_calls = false 8 | ignore_missing_imports = true 9 | 10 | [[tool.mypy.overrides]] 11 | module = "pyarrow.*" 12 | ignore_missing_imports = true 13 | 14 | [tool.isort] 15 | profile = "black" 16 | add_imports = ["from __future__ import annotations"] 17 | 18 | [tool.black] 19 | 
target-version = ['py310'] 20 | include = '\.pyi?$' 21 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # flake8 doesn't support pyproject.toml yet https://github.com/PyCQA/flake8/issues/234 2 | [flake8] 3 | exclude = __init__.py 4 | max-line-length = 120 5 | ignore = 6 | # Extra space in brackets 7 | E20 8 | # Multiple spaces around "," 9 | E231,E241 10 | # Comments 11 | E26 12 | # Import formatting 13 | E4 14 | # Comparing types instead of isinstance 15 | E721 16 | # Assigning lambda expression 17 | E731 18 | # Ambiguous variable names 19 | E741 20 | # Line break before binary operator 21 | W503 22 | # Line break after binary operator 23 | W504 24 | # Redefinition of unused 'loop' from line 10 25 | F811 26 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created, updated] 6 | 7 | jobs: 8 | deploy: 9 | name: Deploy 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: conda-incubator/setup-miniconda@v2 15 | with: 16 | miniforge-variant: Mambaforge 17 | use-mamba: true 18 | python-version: 3.9 19 | - name: Install dependencies 20 | shell: bash -l {0} 21 | run: | 22 | pip install setuptools wheel twine 23 | which python 24 | pip list 25 | conda list 26 | - name: Build and publish 27 | shell: bash -l {0} 28 | env: 29 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 30 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 31 | run: | 32 | python setup.py sdist bdist_wheel 33 | twine upload dist/* 34 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 25.1.0 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | - repo: https://github.com/pycqa/isort 8 | rev: 6.0.1 9 | hooks: 10 | - id: isort 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v5.0.0 13 | hooks: 14 | - id: trailing-whitespace 15 | - id: end-of-file-fixer 16 | - id: check-yaml 17 | - id: check-added-large-files 18 | - repo: https://github.com/asottile/pyupgrade 19 | rev: v3.20.0 20 | hooks: 21 | - id: pyupgrade 22 | args: 23 | - --py39-plus 24 | - repo: https://github.com/pre-commit/mirrors-mypy 25 | rev: v1.16.1 26 | hooks: 27 | - id: mypy 28 | # Override default --ignore-missing-imports 29 | # Use pyproject.toml if possible instead of adding command line parameters here 30 | args: [--warn-unused-configs] 31 | additional_dependencies: 32 | # Type stubs 33 | - boto3-stubs 34 | - dask 35 | - deltalake>=0.16 36 | - pandas-stubs 37 | - pytest 38 | - types-setuptools 39 | - repo: https://github.com/pycqa/flake8 40 | rev: 7.3.0 41 | hooks: 42 | - id: flake8 43 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import annotations 4 | 5 | from setuptools import setup 6 | 7 | with open("README.md", encoding="utf-8") as f: 8 | long_description = f.read() 9 | 10 | setup( 11 | name="dask-deltatable", 12 | version="0.4.0", 13 | description="Dask + Delta Table ", 
14 | url="https://github.com/dask-contrib/dask-deltatable/", 15 | maintainer="rajagurunath", 16 | maintainer_email="gurunathrajagopal@gmail.com", 17 | license="BSD-3-Clause", 18 | packages=["dask_deltatable"], 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | python_requires=">=3.9", 22 | install_requires=open("requirements.txt").read().strip().split("\n"), 23 | extras_require={ 24 | "dev": ["pytest", "requests", "pytest-cov>=2.10.1"], 25 | "s3": ["s3fs", "boto3"], 26 | "uc": ["adlfs", "databricks-sdk"], 27 | }, 28 | classifiers=[ 29 | "Development Status :: 5 - Production/Stable", 30 | "Intended Audience :: Developers", 31 | "Intended Audience :: Science/Research", 32 | "Topic :: Database", 33 | "Topic :: Scientific/Engineering", 34 | "License :: OSI Approved :: BSD License", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3 :: Only", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | ], 41 | include_package_data=True, 42 | zip_safe=False, 43 | ) 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Dask contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import zipfile 5 | 6 | import pytest 7 | 8 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 9 | DATA_DIR = os.path.join(ROOT_DIR, "tests", "data") 10 | 11 | 12 | @pytest.fixture() 13 | def simple_table(tmpdir): 14 | output_dir = tmpdir 15 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/simple.zip") 16 | deltaf.extractall(output_dir) 17 | return str(output_dir) + "/test1/" 18 | 19 | 20 | @pytest.fixture() 21 | def simple_table2(tmpdir): 22 | output_dir = tmpdir 23 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/simple2.zip") 24 | deltaf.extractall(output_dir) 25 | return str(output_dir) + "/simple_table/" 26 | 27 | 28 | @pytest.fixture() 29 | def partition_table(tmpdir): 30 | output_dir = tmpdir 31 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/partition.zip") 32 | deltaf.extractall(output_dir) 33 | return str(output_dir) + "/test2/" 34 | 35 | 36 | @pytest.fixture() 37 | def empty_table1(tmpdir): 38 | output_dir = tmpdir 39 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/empty1.zip") 40 | deltaf.extractall(output_dir) 41 | return str(output_dir) + "/empty/" 42 | 43 | 44 | @pytest.fixture() 45 | def empty_table2(tmpdir): 46 | output_dir = tmpdir 47 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/empty2.zip") 48 | deltaf.extractall(output_dir) 49 | return str(output_dir) + "/empty2/" 50 | 51 | 52 | @pytest.fixture() 53 | def checkpoint_table(tmpdir): 54 | output_dir = tmpdir 55 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/checkpoint.zip") 56 | deltaf.extractall(output_dir) 57 | return str(output_dir) + "/checkpoint/" 58 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | # When this workflow is queued, automatically cancel any previous running 6 | # or pending jobs from the same branch 7 | concurrency: 8 | group: tests-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | test: 13 | runs-on: ${{ matrix.os }} 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | strategy: 18 | matrix: 19 | os: ["windows-latest", "ubuntu-latest", "macos-latest"] 20 | python-version: ["3.9", "3.10", "3.11", "3.12"] 21 | 22 | steps: 23 | - name: Checkout source 24 | uses: actions/checkout@v3.5.3 25 | with: 26 | fetch-depth: 0 # Needed by codecov.io 27 | 28 | - name: Setup Conda Environment 29 | uses: conda-incubator/setup-miniconda@v3.2.0 30 | with: 31 | miniforge-version: latest 32 | channel-priority: strict 33 | python-version: ${{ matrix.python-version }} 34 | environment-file: continous_integeration/environment-${{ matrix.python-version }}.yaml 35 | activate-environment: test-environment 36 | auto-activate-base: false 37 | 38 | - name: Install dask-deltatable 39 | run: python -m pip install -e ".[dev]" 40 | 41 | - name: conda list 42 | run: conda list 43 | 44 | - name: Run tests 45 | id: run_tests 46 | run: | 47 | set -o pipefail 48 | mkdir reports 49 | 50 | python -m pytest tests \ 51 | --junitxml=reports/test-results.xml \ 52 | --cov-report=xml \ 53 | --cov dask_deltatable \ 54 | | tee reports/stdout 55 | 56 | - name: Upload test results 57 | # ensure this runs even if pytest fails 58 | if: > 59 | always() && 60 | (steps.run_tests.outcome == 'success' || steps.run_tests.outcome == 'failure') 61 | 
uses: actions/upload-artifact@v4 62 | with: 63 | name: my-artifacts-${{ strategy.job-index }} 64 | path: reports 65 | 66 | - name: Upload coverage to Codecov 67 | uses: codecov/codecov-action@v3 68 | with: 69 | token: ${{ secrets.CODECOV_TOKEN }} 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dask-worker-space/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # PyCharm project settings 123 | .idea 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | .DS_Store 136 | junit/ 137 | 138 | # downloaded DAT files: https://github.com/delta-incubator/dat 139 | tests/out/ 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Dask-DeltaTable 2 | 3 | Reading and writing to Delta Lake using Dask engine. 
4 |
5 | ### Installation
6 |
7 | `dask-deltatable` is available on PyPI:
8 |
9 | ```
10 | pip install dask-deltatable
11 | ```
12 |
13 | And from conda-forge:
14 |
15 | ```
16 | conda install -c conda-forge dask-deltatable
17 | ```
18 |
19 | ### Features
20 |
21 | 1. Read the parquet files from Delta Lake and parallelize with Dask
22 | 2. Write Dask dataframes to Delta Lake (limited support)
23 | 3. Supports multiple filesystems (s3, azurefs, gcsfs)
24 | 4. Subset of Delta Lake features:
25 |    - Time Travel
26 |    - Schema evolution
27 |    - Parquet filters (see the sketch appended at the end of this README)
28 |      - row filter
29 |      - partition filter
30 |
31 | ### Not supported
32 |
33 | 1. Writing to Delta Lake is still in development.
34 | 2. `optimize` API to run a bin-packing operation on a Delta Table.
35 |
36 | ### Reading from Delta Lake
37 |
38 | ```python
39 | import dask_deltatable as ddt
40 |
41 | # read delta table
42 | df = ddt.read_deltalake("delta_path")
43 |
44 | # with specific version
45 | df = ddt.read_deltalake("delta_path", version=3)
46 |
47 | # with specific datetime
48 | df = ddt.read_deltalake("delta_path", datetime="2018-12-19T16:39:57-08:00")
49 | ```
50 |
51 | `df` is a Dask DataFrame that you can work with in the same way you normally would. See
52 | [the Dask DataFrame documentation](https://docs.dask.org/en/stable/dataframe.html) for
53 | available operations.
54 |
55 | ### Accessing remote file systems
56 |
57 | To be able to read from S3, Azure, GCS, and other remote filesystems, make sure
58 | the credentials are properly configured in environment variables
59 | or config files. For AWS, you may need `~/.aws/credentials`; for gcsfs,
60 | `GOOGLE_APPLICATION_CREDENTIALS`. Refer to your cloud provider's documentation
61 | to configure these.
62 |
63 | ```python
64 | ddt.read_deltalake("s3://bucket_name/delta_path", version=3)
65 | ```
66 |
67 | ### Accessing AWS Glue catalog
68 |
69 | `dask-deltatable` can connect to the AWS Glue catalog to read a Delta table.
70 | The method will look for the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`
71 | environment variables and, if those are not available, fall back to
72 | `~/.aws/credentials`.
73 |
74 | Example:
75 |
76 | ```python
77 | ddt.read_deltalake(catalog="glue", database_name="science", table_name="physics")
78 | ```
79 |
80 | ### Accessing Unity catalog
81 |
82 | `dask-deltatable` can connect to Unity Catalog to read a Delta table.
83 | The method will look for the `DATABRICKS_HOST` and `DATABRICKS_TOKEN` environment
84 | variables, or for `kwargs` with the same names in lowercase.
85 |
86 | Example:
87 |
88 | ```python
89 | ddt.read_unity_catalog(
90 |     catalog_name="projects",
91 |     schema_name="science",
92 |     table_name="physics"
93 | )
94 | ```
95 |
96 | ### Writing to Delta Lake
97 |
98 | To write a Dask dataframe to Delta Lake, use the `to_deltalake` method.
99 |
100 | ```python
101 | import dask.dataframe as dd
102 | import dask_deltatable as ddt
103 |
104 | df = dd.read_csv("s3://bucket_name/data.csv")
105 | # do some processing on the dataframe...
106 | ddt.to_deltalake("s3://bucket_name/delta_path", df)
107 | ```
108 |
109 | Writing to Delta Lake is still in development, so be aware that some features
110 | may not work.
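The feature list above mentions row and partition filters but the README does not show them. Below is a minimal sketch based on the `filter=` and `columns=` keywords exercised in `tests/test_core.py`; the table path and column names (`count`, `col1`, `temperature`) are placeholders borrowed from the test tables, not a prescribed schema.

```python
import dask_deltatable as ddt

# Row filter: only rows matching the predicate are read. The filter syntax
# uses pyarrow-style (column, operator, value) tuples, as in the test suite.
df = ddt.read_deltalake("delta_path", filter=[("count", ">", 30)])

# Partition filter: predicates on partition columns prune whole partitions
# before any parquet file is opened. An OR of predicates is expressed as a
# nested list of filter lists.
df = ddt.read_deltalake(
    "delta_path", filter=[[("col1", "==", 1)], [("col1", "==", 2)]]
)

# Column projection, analogous to dask.dataframe.read_parquet(columns=...).
df = ddt.read_deltalake("delta_path", columns=["count", "temperature"])
```

Filters that mix partition and non-partition columns still work: the partition predicates are used for pruning, and the remaining predicates are applied row-wise when the parquet files are read.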
111 | -------------------------------------------------------------------------------- /tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | distributed = pytest.importorskip("distributed") 6 | 7 | import os # noqa: E402 8 | import sys # noqa: E402 9 | 10 | import pyarrow as pa # noqa: E402 11 | import pyarrow.dataset as pa_ds # noqa: E402 12 | import pyarrow.parquet as pq # noqa: E402 13 | from dask.datasets import timeseries # noqa: E402 14 | from distributed.utils_test import cleanup # noqa F401 15 | from distributed.utils_test import ( # noqa F401 16 | client, 17 | cluster, 18 | cluster_fixture, 19 | gen_cluster, 20 | loop, 21 | loop_in_thread, 22 | popen, 23 | varying, 24 | ) 25 | 26 | import dask_deltatable as ddt # noqa: E402 27 | 28 | pytestmark = pytest.mark.skipif( 29 | sys.platform == "win32", 30 | reason=( 31 | "The teardown of distributed.utils_test.cluster_fixture " 32 | "fails on windows CI currently" 33 | ), 34 | ) 35 | 36 | 37 | def test_write(client, tmpdir): 38 | ddf = timeseries( 39 | start="2023-01-01", 40 | end="2023-01-03", 41 | freq="1H", 42 | partition_freq="1D", 43 | dtypes={"str": object, "float": float, "int": int}, 44 | ).reset_index() 45 | ddt.to_deltalake(f"{tmpdir}", ddf) 46 | 47 | 48 | def test_append(client, tmpdir): 49 | """Ensure that a DeltaTable can be pickled and sent over to a worker for appending.""" 50 | ddf = timeseries( 51 | start="2023-01-01", 52 | end="2023-01-03", 53 | freq="1H", 54 | partition_freq="1D", 55 | dtypes={"str": object, "float": float, "int": int}, 56 | ).reset_index() 57 | ddt.to_deltalake(f"{tmpdir}", ddf) 58 | ddt.to_deltalake(f"{tmpdir}", ddf, mode="append") 59 | 60 | 61 | def test_write_with_options(client, tmpdir): 62 | file_options = dict(compression="gzip") 63 | ddf = timeseries( 64 | start="2023-01-01", 65 | end="2023-01-03", 66 | freq="1H", 67 | partition_freq="1D", 68 | dtypes={"str": object, "float": float, "int": int}, 69 | ).reset_index() 70 | ddt.to_deltalake(f"{tmpdir}", ddf, file_options=file_options) 71 | parquet_filename = [f for f in os.listdir(tmpdir) if f.endswith(".parquet")][0] 72 | parquet_file = pq.ParquetFile(f"{tmpdir}/{parquet_filename}") 73 | assert parquet_file.metadata.row_group(0).column(0).compression == "GZIP" 74 | 75 | 76 | def test_write_with_schema(client, tmpdir): 77 | ddf = timeseries( 78 | start="2023-01-01", 79 | end="2023-01-03", 80 | freq="1H", 81 | partition_freq="1D", 82 | dtypes={"str": object, "float": float, "int": int}, 83 | ).reset_index() 84 | schema = pa.schema( 85 | [ 86 | pa.field("timestamp", pa.timestamp("us")), 87 | pa.field("str", pa.string()), 88 | pa.field("float", pa.float32()), 89 | pa.field("int", pa.int32()), 90 | ] 91 | ) 92 | ddt.to_deltalake(f"{tmpdir}", ddf, schema=schema) 93 | ds = pa_ds.dataset(str(tmpdir)) 94 | assert ds.schema == schema 95 | 96 | 97 | def test_read(client, simple_table): 98 | df = ddt.read_deltalake(simple_table) 99 | assert df.columns.tolist() == ["id", "count", "temperature", "newColumn"] 100 | assert df.compute().shape == (200, 4) 101 | -------------------------------------------------------------------------------- /tests/test_write.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import unittest.mock as mock 5 | 6 | import dask.dataframe as dd 7 | import pandas as pd 8 | import pyarrow as pa 9 | import pytest 10 | from 
dask.dataframe.utils import assert_eq 11 | from dask.datasets import timeseries 12 | from deltalake import DeltaTable 13 | 14 | from dask_deltatable import read_deltalake 15 | from dask_deltatable.write import to_deltalake 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "with_index", 20 | [ 21 | pytest.param( 22 | True, 23 | marks=[ 24 | pytest.mark.xfail( 25 | reason="TS index is always ns resolution but delta can only handle us" 26 | ) 27 | ], 28 | ), 29 | False, 30 | ], 31 | ) 32 | @pytest.mark.parametrize("freq,partition_freq", [("1H", "1D"), ("1H", "1w")]) 33 | def test_roundtrip(tmpdir, with_index, freq, partition_freq): 34 | dtypes = { 35 | "str": object, 36 | # FIXME: Categorical data does not work 37 | # "category": "category", 38 | "float": float, 39 | "int": int, 40 | } 41 | tmpdir = str(tmpdir) 42 | ddf = timeseries( 43 | start="2023-01-01", 44 | end="2023-01-15", 45 | freq=freq, 46 | partition_freq=partition_freq, 47 | dtypes=dtypes, 48 | ) 49 | 50 | ddf = ddf.reset_index() 51 | if with_index: 52 | ddf = ddf.set_index("timestamp") 53 | 54 | out = to_deltalake(tmpdir, ddf, compute=False) 55 | assert not os.listdir(tmpdir) 56 | out.compute() 57 | assert len(os.listdir(tmpdir)) > 0 58 | 59 | ddf_read = read_deltalake(tmpdir) 60 | ddf_dask = dd.read_parquet(tmpdir) 61 | 62 | assert ddf.npartitions == ddf_read.npartitions 63 | # By default, arrow reads with ns resolution 64 | assert_eq(ddf_read, ddf_dask) 65 | 66 | 67 | @mock.patch("dask_deltatable.utils.maybe_set_aws_credentials") 68 | def test_writer_check_aws_credentials(maybe_set_aws_credentials, tmpdir): 69 | # The full functionality of maybe_set_aws_credentials tests in test_utils 70 | # we only need to ensure it's called here when writing with a str path 71 | maybe_set_aws_credentials.return_value = dict() 72 | 73 | df = pd.DataFrame({"col1": range(10)}) 74 | ddf = dd.from_pandas(df, npartitions=2) 75 | to_deltalake(str(tmpdir), ddf) 76 | maybe_set_aws_credentials.assert_called() 77 | 78 | 79 | @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) 80 | def test_datetime(tmpdir, unit): 81 | """Ensure we can write datetime with different resolutions, 82 | at least one-way only""" 83 | tmpdir = str(tmpdir) 84 | ts = pd.date_range("2023-01-01", periods=10, freq="1D", unit=unit) 85 | df = pd.DataFrame({"ts": pd.Series(ts)}) 86 | ddf = dd.from_pandas(df, npartitions=2) 87 | to_deltalake(tmpdir, ddf) 88 | ddf_read = read_deltalake(tmpdir) 89 | ddf_dask = dd.read_parquet(tmpdir) 90 | assert_eq(ddf_read, ddf_dask, check_index=False) 91 | 92 | 93 | def test_custom_metadata(tmpdir): 94 | tmpdir = str(tmpdir) 95 | df = pd.DataFrame({"a": [1, 2, 3, 4]}) 96 | ddf = dd.from_pandas(df, npartitions=2) 97 | to_deltalake(tmpdir, ddf, custom_metadata={"foo": "bar"}) 98 | dt = DeltaTable(tmpdir) 99 | assert "foo" in dt.history()[-1] 100 | assert dt.history()[-1]["foo"] == "bar" 101 | 102 | 103 | def test_append_with_schema(tmpdir): 104 | """Ensure we can append to a table with a schema""" 105 | tmpdir = str(tmpdir) 106 | df = pd.DataFrame({"a": [1, 2, 3, 4]}) 107 | ddf = dd.from_pandas(df, npartitions=2) 108 | schema = pa.Schema.from_pandas(df) 109 | to_deltalake(tmpdir, ddf, schema=schema) 110 | to_deltalake(tmpdir, ddf, schema=schema, mode="append") 111 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | import unittest.mock as mock 5 | 6 | 
import pytest 7 | 8 | from dask_deltatable.utils import ( 9 | get_bucket_region, 10 | get_partition_filters, 11 | maybe_set_aws_credentials, 12 | ) 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "cols,filters,expected", 17 | [ 18 | [[], None, None], 19 | [[], [("part", ">", "a")], None], 20 | [["part"], [("part", ">", "a"), ("x", "==", 1)], [[("part", ">", "a")]]], 21 | [["part"], [[("part", ">", "a")], [("x", "==", 1)]], None], 22 | [ 23 | ["m", "d"], 24 | [("m", ">", 5), ("d", "=", 1), ("x", "==", "a")], 25 | [[("m", ">", 5), ("d", "=", 1)]], 26 | ], 27 | [ 28 | ["m", "d"], 29 | [[("m", ">", 5)], [("d", "=", 1)], [("x", "==", "a")]], 30 | None, 31 | ], 32 | ], 33 | ) 34 | def test_partition_filters(cols, filters, expected): 35 | res = get_partition_filters(cols, filters) 36 | assert res == expected 37 | if isinstance(filters, list): 38 | # make sure it works with additional level of wrapping 39 | res = get_partition_filters(cols, filters) 40 | assert res == expected 41 | 42 | 43 | @mock.patch("dask_deltatable.utils.get_bucket_region") 44 | @pytest.mark.parametrize( 45 | "options", 46 | ( 47 | None, 48 | dict(), 49 | dict(AWS_ACCESS_KEY_ID="foo", AWS_SECRET_ACCESS_KEY="bar"), 50 | dict(access_key="foo", secret_key="bar"), 51 | ), 52 | ) 53 | @pytest.mark.parametrize("path", ("s3://path", "/another/path", pathlib.Path("."))) 54 | def test_maybe_set_aws_credentials( 55 | mocked_get_bucket_region, 56 | options, 57 | path, 58 | ): 59 | pytest.importorskip("boto3") 60 | 61 | mocked_get_bucket_region.return_value = "foo-region" 62 | 63 | mock_creds = mock.MagicMock() 64 | type(mock_creds).token = mock.PropertyMock(return_value="token") 65 | type(mock_creds).access_key = mock.PropertyMock(return_value="access-key") 66 | type(mock_creds).secret_key = mock.PropertyMock(return_value="secret-key") 67 | 68 | def mock_get_credentials(): 69 | return mock_creds 70 | 71 | with mock.patch("boto3.session.Session") as mocked_session: 72 | session = mocked_session.return_value 73 | session.get_credentials.side_effect = mock_get_credentials 74 | 75 | opts = maybe_set_aws_credentials(path, options) 76 | 77 | if options and not any(k in options for k in ("AWS_ACCESS_KEY_ID", "access_key")): 78 | assert opts["AWS_ACCESS_KEY_ID"] == "access-key" 79 | assert opts["AWS_SECRET_ACCESS_KEY"] == "secret-key" 80 | assert opts["AWS_SESSION_TOKEN"] == "token" 81 | assert opts["AWS_REGION"] == "foo-region" 82 | 83 | assert opts["access_key"] == "access-key" 84 | assert opts["secret_key"] == "secret-key" 85 | assert opts["token"] == "token" 86 | assert opts["region"] == "foo-region" 87 | 88 | # Did not alter input options if credentials were supplied by user 89 | elif options: 90 | assert options == opts 91 | 92 | 93 | @pytest.mark.parametrize("location", (None, "region-foo")) 94 | @pytest.mark.parametrize( 95 | "path,bucket", 96 | (("s3://foo/bar", "foo"), ("s3://fizzbuzz", "fizzbuzz"), ("/not/s3", None)), 97 | ) 98 | def test_get_bucket_region(location, path, bucket): 99 | pytest.importorskip("boto3") 100 | 101 | with mock.patch("boto3.client") as mock_client: 102 | mock_client = mock_client.return_value 103 | mock_client.get_bucket_location.return_value = {"LocationConstraint": location} 104 | 105 | if not path.startswith("s3://"): 106 | with pytest.raises(ValueError, match="is not an S3 path"): 107 | get_bucket_region(path) 108 | return 109 | 110 | region = get_bucket_region(path) 111 | 112 | # AWS returns None if bucket located in us-east-1... 
113 | location = location if location else "us-east-1" 114 | assert region == location 115 | 116 | mock_client.get_bucket_location.assert_has_calls([mock.call(Bucket=bucket)]) 117 | -------------------------------------------------------------------------------- /tests/test_acceptance.py: -------------------------------------------------------------------------------- 1 | """Delta Acceptance Testing (DAT) 2 | 3 | https://github.com/delta-incubator/dat 4 | 5 | The DAT project provides test cases to verify different implementations of Delta Lake all behave 6 | consistently. The expected behavior is described in the Delta Lake Protocol. 7 | 8 | The tests cases are packaged into releases, which can be downloaded into CI jobs for automatic 9 | testing. The test cases in this repo are represented using a standard file structure, so they 10 | don't require any particular dependency or programming language. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import os 16 | import shutil 17 | import unittest.mock as mock 18 | from urllib.request import urlretrieve 19 | 20 | import dask.dataframe as dd 21 | import pytest 22 | from dask.dataframe.utils import assert_eq 23 | 24 | import dask_deltatable as ddt 25 | 26 | DATA_VERSION = "0.0.2" 27 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 28 | DATA_DIR = os.path.join(ROOT_DIR, "out", "reader_tests", "generated") 29 | 30 | 31 | @pytest.fixture(autouse=True, scope="session") 32 | def download_data(): 33 | """Download the data for the tests.""" 34 | if not os.path.exists(DATA_DIR): 35 | filename = f"deltalake-dat-v{DATA_VERSION}.tar.gz" 36 | dest_filename = os.path.join(ROOT_DIR, filename) 37 | urlretrieve( 38 | f"https://github.com/delta-incubator/dat/releases/download/v{DATA_VERSION}/{filename}", 39 | dest_filename, 40 | ) 41 | shutil.unpack_archive(dest_filename, ROOT_DIR) 42 | os.remove(dest_filename) 43 | assert os.path.exists(DATA_DIR) 44 | 45 | 46 | @mock.patch("dask_deltatable.utils.maybe_set_aws_credentials") 47 | def test_reader_check_aws_credentials(maybe_set_aws_credentials): 48 | # The full functionality of maybe_set_aws_credentials tests in test_utils 49 | # we only need to ensure it's called here when reading with a str path 50 | maybe_set_aws_credentials.return_value = dict() 51 | ddt.read_deltalake(f"{DATA_DIR}/all_primitive_types/delta") 52 | maybe_set_aws_credentials.assert_called() 53 | 54 | 55 | def test_reader_all_primitive_types(): 56 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/all_primitive_types/delta") 57 | expected_ddf = dd.read_parquet( 58 | f"{DATA_DIR}/all_primitive_types/expected/latest/table_content/*parquet" 59 | ) 60 | # Dask and delta go through different parquet parsers which read the 61 | # timestamp differently. This is likely a bug in arrow but the delta result 62 | # is "more correct". 
63 | expected_ddf["timestamp"] = expected_ddf["timestamp"].astype("datetime64[us]") 64 | expected_ddf["timestamp"] = expected_ddf["timestamp"].dt.tz_localize("UTC") 65 | assert_eq(actual_ddf, expected_ddf) 66 | 67 | 68 | @pytest.mark.parametrize("version,subdir", [(None, "latest"), (0, "v0"), (1, "v1")]) 69 | def test_reader_basic_append(version, subdir): 70 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/basic_append/delta", version=version) 71 | expected_ddf = dd.read_parquet( 72 | f"{DATA_DIR}/basic_append/expected/{subdir}/table_content/*parquet" 73 | ) 74 | assert_eq(actual_ddf, expected_ddf, check_index=False) 75 | 76 | 77 | @pytest.mark.parametrize("version,subdir", [(None, "latest"), (0, "v0"), (1, "v1")]) 78 | def test_reader_basic_partitioned(version, subdir): 79 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/basic_partitioned/delta") 80 | expected_ddf = dd.read_parquet( 81 | f"{DATA_DIR}/basic_partitioned/expected/latest/table_content/*parquet" 82 | ) 83 | assert_eq(actual_ddf, expected_ddf, check_index=False) 84 | 85 | 86 | @pytest.mark.xfail(reason="https://github.com/delta-io/delta-rs/issues/1533") 87 | @pytest.mark.parametrize( 88 | "version,subdir", [(None, "latest"), (0, "v0"), (1, "v1"), (2, "v2")] 89 | ) 90 | def test_reader_multi_partitioned(version, subdir): 91 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/multi_partitioned/delta") 92 | expected_ddf = dd.read_parquet( 93 | f"{DATA_DIR}/multi_partitioned/expected/{subdir}/table_content/*parquet" 94 | ) 95 | assert_eq(actual_ddf, expected_ddf, check_index=False) 96 | 97 | 98 | @pytest.mark.xfail(reason="https://github.com/delta-io/delta-rs/issues/1533") 99 | def test_reader_multi_partitioned_2(): 100 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/multi_partitioned_2/delta") 101 | expected_ddf = dd.read_parquet( 102 | f"{DATA_DIR}/multi_partitioned_2/expected/latest/table_content/*parquet" 103 | ) 104 | assert_eq(actual_ddf, expected_ddf) 105 | 106 | 107 | def test_reader_nested_types(): 108 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/nested_types/delta") 109 | expected_ddf = dd.read_parquet( 110 | f"{DATA_DIR}/nested_types/expected/latest/table_content/*parquet" 111 | ) 112 | assert_eq(actual_ddf, expected_ddf) 113 | 114 | 115 | def test_reader_no_replay(): 116 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/no_replay/delta") 117 | expected_ddf = dd.read_parquet( 118 | f"{DATA_DIR}/no_replay/expected/latest/table_content/*parquet" 119 | ) 120 | assert_eq(actual_ddf, expected_ddf) 121 | 122 | 123 | def test_reader_no_stats(): 124 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/no_stats/delta") 125 | expected_ddf = dd.read_parquet( 126 | f"{DATA_DIR}/no_stats/expected/latest/table_content/*parquet" 127 | ) 128 | assert_eq(actual_ddf, expected_ddf) 129 | 130 | 131 | def test_reader_stats_as_structs(): 132 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/stats_as_struct/delta") 133 | expected_ddf = dd.read_parquet( 134 | f"{DATA_DIR}/stats_as_struct/expected/latest/table_content/*parquet" 135 | ) 136 | assert_eq(actual_ddf, expected_ddf) 137 | 138 | 139 | def test_reader_with_checkpoint(): 140 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/with_checkpoint/delta") 141 | expected_ddf = dd.read_parquet( 142 | f"{DATA_DIR}/with_checkpoint/expected/latest/table_content/*parquet" 143 | ) 144 | assert_eq(actual_ddf, expected_ddf) 145 | 146 | 147 | @pytest.mark.parametrize("version,subdir", [(None, "latest"), (1, "v1")]) 148 | def test_reader_with_schema_change(version, subdir): 149 | actual_ddf = 
ddt.read_deltalake(f"{DATA_DIR}/with_schema_change/delta") 150 | expected_ddf = dd.read_parquet( 151 | f"{DATA_DIR}/with_schema_change/expected/{subdir}/table_content/*parquet" 152 | ) 153 | assert_eq(actual_ddf, expected_ddf) 154 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import glob 4 | import os 5 | from unittest.mock import MagicMock, patch 6 | 7 | import pandas as pd 8 | import pyarrow as pa 9 | import pyarrow.parquet as pq 10 | import pytest 11 | from deltalake import DeltaTable 12 | 13 | import dask_deltatable as ddt 14 | 15 | 16 | def test_read_delta(simple_table): 17 | df = ddt.read_deltalake(simple_table) 18 | 19 | assert df.columns.tolist() == ["id", "count", "temperature", "newColumn"] 20 | assert df.compute().shape == (200, 4) 21 | 22 | 23 | def test_read_delta_types_mapper(simple_table): 24 | """Provide a custom types mapper""" 25 | 26 | def types_mapper(pyarrow_dtype): 27 | if pyarrow_dtype == pa.int64(): 28 | return pd.Int32Dtype() 29 | 30 | df = ddt.read_deltalake( 31 | simple_table, pyarrow_to_pandas={"types_mapper": types_mapper} 32 | ) 33 | assert df.dtypes["id"] == "Int32" 34 | assert df.dtypes["count"] == "Int32" 35 | res = df.compute() 36 | assert res.dtypes["id"] == "Int32" 37 | assert res.dtypes["count"] == "Int32" 38 | 39 | 40 | def test_read_delta_categories(simple_table): 41 | """Provide a list of categories""" 42 | df = ddt.read_deltalake(simple_table, pyarrow_to_pandas={"categories": ["id"]}) 43 | assert df.dtypes["id"] == "category" 44 | res = df.compute() 45 | assert res.dtypes["id"] == "category" 46 | 47 | 48 | def test_read_delta_with_different_versions(simple_table): 49 | print(simple_table) 50 | df = ddt.read_deltalake(simple_table, version=0) 51 | assert df.compute().shape == (100, 3) 52 | 53 | df = ddt.read_deltalake(simple_table, version=1) 54 | assert df.compute().shape == (200, 4) 55 | 56 | 57 | def test_row_filter(simple_table): 58 | # row filter 59 | df = ddt.read_deltalake( 60 | simple_table, 61 | version=0, 62 | filter=[("count", ">", 30)], 63 | ) 64 | assert df.compute().shape == (61, 3) 65 | 66 | 67 | def test_different_columns(simple_table): 68 | df = ddt.read_deltalake(simple_table, columns=["count", "temperature"]) 69 | assert df.columns.tolist() == ["count", "temperature"] 70 | 71 | 72 | def test_different_schema(simple_table): 73 | # testing schema evolution 74 | 75 | df = ddt.read_deltalake(simple_table, version=0) 76 | assert df.columns.tolist() == ["id", "count", "temperature"] 77 | 78 | df = ddt.read_deltalake(simple_table, version=1) 79 | assert df.columns.tolist() == ["id", "count", "temperature", "newColumn"] 80 | 81 | 82 | @pytest.mark.parametrize( 83 | "kwargs,shape", 84 | [ 85 | (dict(version=0, filter=[("col1", "==", 1)]), (21, 3)), 86 | (dict(filter=[("col1", "==", 1), ("col2", "<", 0.5)]), (11, 4)), 87 | (dict(filter=[[("col1", "==", 1)], [("col1", "==", 2)]]), (39, 4)), 88 | (dict(filter=[("col1", "!=", 1), ("id", "<", 5)]), (6, 4)), 89 | (dict(filter=[[("col1", "!=", 1)], [("id", "<", 5)]]), (99, 4)), 90 | ], 91 | ) 92 | def test_partition_filter(partition_table, kwargs, shape): 93 | """partition filter""" 94 | df = ddt.read_deltalake(partition_table, **kwargs) 95 | filter_expr = pq.filters_to_expression(kwargs["filter"]) 96 | dt = DeltaTable(partition_table, version=kwargs.get("version")) 97 | expected_partitions = len( 98 | 
list(dt.to_pyarrow_dataset().get_fragments(filter=filter_expr)) 99 | ) 100 | assert df.npartitions == expected_partitions 101 | assert df.compute().shape == shape 102 | 103 | 104 | def test_empty(empty_table1, empty_table2): 105 | df = ddt.read_deltalake(empty_table1, version=4) 106 | assert df.compute().shape == (0, 2) 107 | 108 | df = ddt.read_deltalake(empty_table1, version=0) 109 | assert df.compute().shape == (5, 2) 110 | 111 | df = ddt.read_deltalake(empty_table2) 112 | assert df.compute().shape == (0, 4) 113 | 114 | df = ddt.read_deltalake(empty_table2, columns=["some_struct", "value"]) 115 | assert df.compute().shape == (0, 2) 116 | 117 | df = ddt.read_deltalake(empty_table2, columns=[]) 118 | assert df.compute().shape == (0, 0) 119 | 120 | 121 | def test_checkpoint(checkpoint_table): 122 | df = ddt.read_deltalake(checkpoint_table, checkpoint=0, version=4) 123 | assert df.compute().shape[0] == 25 124 | 125 | df = ddt.read_deltalake(checkpoint_table, checkpoint=10, version=12) 126 | assert df.compute().shape[0] == 65 127 | 128 | df = ddt.read_deltalake(checkpoint_table, checkpoint=20, version=22) 129 | assert df.compute().shape[0] == 115 130 | 131 | with pytest.raises(Exception): 132 | # Parquet file with the given checkpoint 30 does not exists: 133 | # File {checkpoint_path} not found" 134 | _ = ddt.read_deltalake(checkpoint_table, checkpoint=30, version=33) 135 | 136 | 137 | def test_out_of_version_error(simple_table): 138 | # Cannot time travel Delta table to version 4 , Available versions for given 139 | # checkpoint 0 are [0,1] 140 | with pytest.raises(Exception): 141 | _ = ddt.read_deltalake(simple_table, version=4) 142 | 143 | 144 | def test_load_with_datetime(simple_table2): 145 | log_dir = f"{simple_table2}_delta_log" 146 | log_mtime_pair = [ 147 | ("00000000000000000000.json", 1588398451.0), 148 | ("00000000000000000001.json", 1588484851.0), 149 | ("00000000000000000002.json", 1588571251.0), 150 | ("00000000000000000003.json", 1588657651.0), 151 | ("00000000000000000004.json", 1588744051.0), 152 | ] 153 | for file_name, dt_epoch in log_mtime_pair: 154 | file_path = os.path.join(log_dir, file_name) 155 | os.utime(file_path, (dt_epoch, dt_epoch)) 156 | 157 | expected = ddt.read_deltalake(simple_table2, version=0).compute() 158 | result = ddt.read_deltalake( 159 | simple_table2, datetime="2020-05-01T00:47:31-07:00" 160 | ).compute() 161 | assert expected.equals(result) 162 | # assert_frame_equal(expected,result) 163 | 164 | expected = ddt.read_deltalake(simple_table2, version=1).compute() 165 | result = ddt.read_deltalake( 166 | simple_table2, datetime="2020-05-02T22:47:31-07:00" 167 | ).compute() 168 | assert expected.equals(result) 169 | 170 | expected = ddt.read_deltalake(simple_table2, version=4).compute() 171 | result = ddt.read_deltalake( 172 | simple_table2, datetime="2020-05-25T22:47:31-07:00" 173 | ).compute() 174 | assert expected.equals(result) 175 | 176 | 177 | def test_read_delta_with_error(): 178 | with pytest.raises(ValueError) as exc_info: 179 | ddt.read_deltalake() 180 | assert str(exc_info.value) == "Please Provide Delta Table path" 181 | 182 | 183 | def test_catalog_with_error(): 184 | with pytest.raises(ValueError) as exc_info: 185 | ddt.read_deltalake(catalog="glue") 186 | assert ( 187 | str(exc_info.value) 188 | == "Since Catalog was provided, please provide Database and table name" 189 | ) 190 | 191 | 192 | @pytest.mark.skip( 193 | reason="DeltaTable.from_data_catalog was removed in delta-rs v0.15.0. 
" 194 | "Skip until _read_from_catalog is adapted to this change." 195 | ) 196 | def test_catalog(simple_table): 197 | dt = MagicMock() 198 | 199 | def delta_mock(**kwargs): 200 | files = glob.glob(simple_table + "/*parquet") 201 | dt.file_uris = MagicMock(return_value=files) 202 | return dt 203 | 204 | with patch("deltalake.DeltaTable.from_data_catalog", side_effect=delta_mock): 205 | os.environ["AWS_ACCESS_KEY_ID"] = "apple" 206 | os.environ["AWS_SECRET_ACCESS_KEY"] = "greatsecret" 207 | df = ddt.read_deltalake( 208 | catalog="glue", database_name="stores", table_name="orders" 209 | ) 210 | assert df.compute().shape == (200, 3) 211 | -------------------------------------------------------------------------------- /dask_deltatable/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from collections.abc import Iterator, Mapping 5 | from datetime import date, datetime 6 | from decimal import Decimal 7 | from math import inf 8 | from typing import Any, cast 9 | from urllib.parse import unquote 10 | 11 | from deltalake import DeltaTable 12 | 13 | from .types import Filter, Filters 14 | 15 | 16 | def get_bucket_region(path: str): 17 | import boto3 18 | 19 | if not path.startswith("s3://"): 20 | raise ValueError(f"'{path}' is not an S3 path") 21 | bucket = path.replace("s3://", "").split("/")[0] 22 | resp = boto3.client("s3").get_bucket_location(Bucket=bucket) 23 | # Buckets in region 'us-east-1' results in None, b/c why not. 24 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/get_bucket_location.html#S3.Client.get_bucket_location 25 | return resp["LocationConstraint"] or "us-east-1" 26 | 27 | 28 | def maybe_set_aws_credentials(path: Any, options: dict[str, Any]) -> dict[str, Any]: 29 | """ 30 | Maybe set AWS credentials into ``options`` if existing AWS specific keys 31 | not found in it and path is s3:// format. 32 | 33 | Parameters 34 | ---------- 35 | path : Any 36 | If it's a string, we'll check if it starts with 's3://' then determine bucket 37 | region if the AWS credentials should be set. 38 | options : dict[str, Any] 39 | Options, any kwargs to be supplied to things like S3FileSystem or similar 40 | that may accept AWS credentials set. A copy is made and returned if modified. 41 | 42 | Returns 43 | ------- 44 | dict 45 | Either the original options if not modified, or a copied and updated options 46 | with AWS credentials inserted. 47 | """ 48 | 49 | is_s3_path = getattr(path, "startswith", lambda _: False)("s3://") 50 | if not is_s3_path: 51 | return options 52 | 53 | # Avoid overwriting already provided credentials 54 | keys = ("AWS_ACCESS_KEY", "AWS_SECRET_ACCESS_KEY", "access_key", "secret_key") 55 | if not any(k in (options or ()) for k in keys): 56 | # defers installing boto3 upfront, xref _read_from_catalog 57 | import boto3 58 | 59 | session = boto3.session.Session() 60 | credentials = session.get_credentials() 61 | if credentials is None: 62 | return options 63 | region = get_bucket_region(path) 64 | 65 | options = (options or {}).copy() 66 | options.update( 67 | # Capitalized is used in delta specific API and lowercase is for S3FileSystem 68 | dict( 69 | # TODO: w/o this, we need to configure a LockClient which seems to require dynamodb. 
70 | AWS_S3_ALLOW_UNSAFE_RENAME="true", 71 | AWS_SECRET_ACCESS_KEY=credentials.secret_key, 72 | AWS_ACCESS_KEY_ID=credentials.access_key, 73 | AWS_SESSION_TOKEN=credentials.token, 74 | AWS_REGION=region, 75 | secret_key=credentials.secret_key, 76 | access_key=credentials.access_key, 77 | token=credentials.token, 78 | region=region, 79 | ) 80 | ) 81 | return options 82 | 83 | 84 | def get_partition_filters( 85 | partition_columns: list[str], filters: Filters 86 | ) -> list[list[Filter]] | None: 87 | """Retrieve only filters on partition columns. If there are any row filters in the outer 88 | list (the OR list), return None, because we have to search through all partitions to apply 89 | row filters 90 | 91 | Parameters 92 | ---------- 93 | partition_columns : List[str] 94 | List of partitioned columns 95 | 96 | filters : List[Tuple[str, str, Any]] | List[List[Tuple[str, str, Any]]] 97 | List of filters. Examples: 98 | 1) (x == a) and (y == 3): 99 | [("x", "==", "a"), ("y", "==", 3)] 100 | 2) (x == a) or (y == 3) 101 | [[("x", "==", "a")], [("y", "==", 3)]] 102 | 103 | Returns 104 | ------- 105 | List[List[Tuple[str, str, Any]]] | None 106 | List of partition filters, None if we can't apply a filter on partitions because 107 | row filters are present 108 | """ 109 | if filters is None or len(filters) == 0: 110 | return None 111 | 112 | if isinstance(filters[0][0], str): 113 | filters = cast(list[list[Filter]], [filters]) 114 | filters = cast(list[list[Filter]], filters) 115 | 116 | allowed_ops = { 117 | "=": "=", 118 | "==": "=", 119 | "!=": "!=", 120 | "!==": "!=", 121 | "in": "in", 122 | "not in": "not in", 123 | ">": ">", 124 | "<": "<", 125 | ">=": ">=", 126 | "<=": "<=", 127 | } 128 | 129 | expressions = [] 130 | for disjunction in filters: 131 | inner_expressions = [] 132 | for col, op, val in disjunction: 133 | if col in partition_columns: 134 | normalized_op = allowed_ops[op] 135 | inner_expressions.append((col, normalized_op, val)) 136 | if inner_expressions: 137 | expressions.append(inner_expressions) 138 | else: 139 | return None 140 | 141 | return expressions if expressions else None 142 | 143 | 144 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 145 | def get_partitions_from_path(path: str) -> tuple[str, dict[str, str | None]]: 146 | if path[0] == "/": 147 | path = path[1:] 148 | parts = path.split("/") 149 | parts.pop() # remove filename 150 | out: dict[str, str | None] = {} 151 | for part in parts: 152 | if part == "": 153 | continue 154 | key, value = part.split("=", maxsplit=1) 155 | if value == "__HIVE_DEFAULT_PARTITION__": 156 | out[key] = None 157 | else: 158 | out[key] = unquote(value) 159 | return path, out 160 | 161 | 162 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 163 | def get_file_stats_from_metadata( 164 | metadata: Any, 165 | num_indexed_cols: int, 166 | columns_to_collect_stats: list[str] | None, 167 | ) -> dict[str, int | dict[str, Any]]: 168 | """Get Delta's file stats from PyArrow's Parquet file metadata.""" 169 | stats = { 170 | "numRecords": metadata.num_rows, 171 | "minValues": {}, 172 | "maxValues": {}, 173 | "nullCount": {}, 174 | } 175 | 176 | def iter_groups(metadata: Any) -> Iterator[Any]: 177 | for i in range(metadata.num_row_groups): 178 | if metadata.row_group(i).num_rows > 0: 179 | yield metadata.row_group(i) 180 | 181 | schema_columns = metadata.schema.names 182 | if columns_to_collect_stats is not None: 183 | idx_to_iterate = [] 184 
| for col in columns_to_collect_stats: 185 | try: 186 | idx_to_iterate.append(schema_columns.index(col)) 187 | except ValueError: 188 | pass 189 | elif num_indexed_cols == -1: 190 | idx_to_iterate = list(range(metadata.num_columns)) 191 | elif num_indexed_cols >= 0: 192 | idx_to_iterate = list(range(min(num_indexed_cols, metadata.num_columns))) 193 | else: 194 | raise ValueError("delta.dataSkippingNumIndexedCols valid values are >=-1") 195 | 196 | for column_idx in idx_to_iterate: 197 | name = metadata.row_group(0).column(column_idx).path_in_schema 198 | 199 | # If stats missing, then we can't know aggregate stats 200 | if all( 201 | group.column(column_idx).is_stats_set for group in iter_groups(metadata) 202 | ): 203 | stats["nullCount"][name] = sum( 204 | group.column(column_idx).statistics.null_count 205 | for group in iter_groups(metadata) 206 | ) 207 | 208 | # Min / max may not exist for some column types, or if all values are null 209 | if any( 210 | group.column(column_idx).statistics.has_min_max 211 | for group in iter_groups(metadata) 212 | ): 213 | # Min and Max are recorded in physical type, not logical type 214 | # https://stackoverflow.com/questions/66753485/decoding-parquet-min-max-statistics-for-decimal-type 215 | # TODO: Add logic to decode physical type for DATE, DECIMAL 216 | 217 | minimums = ( 218 | group.column(column_idx).statistics.min 219 | for group in iter_groups(metadata) 220 | ) 221 | # If some row groups have all null values, their min and max will be null too. 222 | min_value = min(minimum for minimum in minimums if minimum is not None) 223 | # Infinity cannot be serialized to JSON, so we skip it. Saying 224 | # min/max is infinity is equivalent to saying it is null, anyways. 225 | if min_value != -inf: 226 | stats["minValues"][name] = min_value 227 | maximums = ( 228 | group.column(column_idx).statistics.max 229 | for group in iter_groups(metadata) 230 | ) 231 | max_value = max(maximum for maximum in maximums if maximum is not None) 232 | if max_value != inf: 233 | stats["maxValues"][name] = max_value 234 | return stats 235 | 236 | 237 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 238 | class DeltaJSONEncoder(json.JSONEncoder): 239 | def default(self, obj: Any) -> Any: 240 | if isinstance(obj, bytes): 241 | return obj.decode("unicode_escape", "backslashreplace") 242 | elif isinstance(obj, date): 243 | return obj.isoformat() 244 | elif isinstance(obj, datetime): 245 | return obj.isoformat() 246 | elif isinstance(obj, Decimal): 247 | return str(obj) 248 | # Let the base class default method raise the TypeError 249 | return json.JSONEncoder.default(self, obj) 250 | 251 | 252 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 253 | def enforce_append_only( 254 | table: DeltaTable | None, 255 | configuration: Mapping[str, str | None] | None, 256 | mode: str, 257 | ) -> None: 258 | """Throw ValueError if table configuration contains delta.appendOnly and mode is not append""" 259 | if table: 260 | configuration = table.metadata().configuration 261 | config_delta_append_only = ( 262 | configuration and configuration.get("delta.appendOnly", "false") == "true" 263 | ) 264 | if config_delta_append_only and mode != "append": 265 | raise ValueError( 266 | "If configuration has delta.appendOnly = 'true', mode must be 'append'." 
267 | f" Mode is currently {mode}" 268 | ) 269 | 270 | 271 | # Inspired from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 272 | def get_num_idx_cols_and_stats_columns( 273 | table: DeltaTable | None, configuration: Mapping[str, str | None] | None 274 | ) -> tuple[int, list[str] | None]: 275 | """Get the num_idx_columns and stats_columns from the table configuration in the state 276 | 277 | If table does not exist (only can occur in the first write action) it takes 278 | the configuration that was passed. 279 | """ 280 | if table is not None: 281 | configuration = table.metadata().configuration 282 | if configuration is None: 283 | num_idx_cols = -1 284 | stats_columns = None 285 | else: 286 | # Parse configuration 287 | dataSkippingNumIndexedCols = configuration.get( 288 | "delta.dataSkippingNumIndexedCols", "-1" 289 | ) 290 | num_idx_cols = ( 291 | int(dataSkippingNumIndexedCols) 292 | if dataSkippingNumIndexedCols is not None 293 | else -1 294 | ) 295 | columns = configuration.get("delta.dataSkippingStatsColumns", None) 296 | if columns is not None: 297 | stats_columns = [col.strip() for col in columns.split(",")] 298 | else: 299 | stats_columns = None 300 | return num_idx_cols, stats_columns 301 | -------------------------------------------------------------------------------- /dask_deltatable/_schema.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | """ 4 | Most of this code was taken from 5 | 6 | https://github.com/data-engineering-collective/plateau 7 | 8 | https://github.com/data-engineering-collective/plateau/blob/d4c4522f5a829d43e3368fc82e1568c91fa352f3/plateau/core/common_metadata.py 9 | 10 | and adapted to this project 11 | 12 | under the original license 13 | 14 | MIT License 15 | 16 | Copyright (c) 2022 The plateau contributors. 17 | Copyright (c) 2020-2021 The kartothek contributors. 18 | Copyright (c) 2019 JDA Software, Inc 19 | 20 | Permission is hereby granted, free of charge, to any person obtaining a copy 21 | of this software and associated documentation files (the "Software"), to deal 22 | in the Software without restriction, including without limitation the rights 23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | copies of the Software, and to permit persons to whom the Software is 25 | furnished to do so, subject to the following conditions: 26 | 27 | The above copyright notice and this permission notice shall be included in all 28 | copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | SOFTWARE. 
37 | 38 | """ 39 | import difflib 40 | import json 41 | import logging 42 | import pprint 43 | from collections.abc import Iterable 44 | from copy import deepcopy 45 | 46 | import pandas as pd 47 | import pyarrow as pa 48 | import pyarrow.parquet as pq 49 | 50 | _logger = logging.getLogger() 51 | 52 | 53 | class SchemaWrapper: 54 | def __init__(self, schema: pa.Schema): 55 | self.schema = schema 56 | 57 | def __hash__(self): 58 | # FIXME: pyarrow raises a "cannot hash type dict" error 59 | return hash(_schema2bytes(self.schema)) 60 | 61 | 62 | def pyarrow_to_deltalake(schema: pa.Schema) -> pa.Schema: 63 | """Adjust data types to make schema compatible with Delta Lake dtypes. 64 | Not all Arrow data types are supported by Delta Lake. See also 65 | ``deltalake.schema.delta_arrow_schema_from_pandas``. 66 | 67 | Notes 68 | ----- 69 | We shouldn't need this when https://github.com/delta-io/delta-rs/issues/686 is closed 70 | """ 71 | schema_out = [] 72 | for field in schema: 73 | if isinstance(field.type, pa.TimestampType): 74 | f = pa.field( 75 | name=field.name, 76 | type=pa.timestamp("us"), 77 | nullable=field.nullable, 78 | metadata=field.metadata, 79 | ) 80 | schema_out.append(f) 81 | else: 82 | schema_out.append(field) 83 | return pa.schema(schema_out, metadata=schema.metadata) 84 | 85 | 86 | def _pandas_in_schemas(schemas): 87 | """Check if any schema contains pandas metadata.""" 88 | has_pandas = False 89 | for schema in schemas: 90 | if schema.metadata and b"pandas" in schema.metadata: 91 | has_pandas = True 92 | return has_pandas 93 | 94 | 95 | def _determine_schemas_to_compare( 96 | schemas: Iterable[pa.Schema], ignore_pandas: bool 97 | ) -> tuple[pa.Schema | None, list[tuple[pa.Schema, list[str]]]]: 98 | """Iterate over a list of `pyarrow.Schema` objects and prepares them for 99 | comparison by picking a reference and determining all null columns. 100 | 101 | .. note:: 102 | 103 | If pandas metadata exists, the version stored in the metadata is overwritten with the currently 104 | installed version since we expect to stay backwards compatible 105 | 106 | Returns 107 | ------- 108 | reference: Schema 109 | A reference schema which is picked from the input list. The reference schema is guaranteed 110 | to be a schema having the least number of null columns of all input columns. The set of null 111 | columns is guaranteed to be a true subset of all null columns of all input schemas. If no such 112 | schema can be found, an Exception is raised 113 | list_of_schemas: List[Tuple[Schema, List]] 114 | A list holding pairs of (Schema, null_columns) where the null_columns are all columns which are null and 115 | must be removed before comparing the schemas 116 | """ 117 | has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas 118 | schemas_to_evaluate: list[tuple[pa.Schema, list[str]]] = [] 119 | reference = None 120 | null_cols_in_reference = set() 121 | # Hashing the schemas is a very fast way to reduce the number of schemas to 122 | # actually compare since in most circumstances this reduces to very few 123 | # (which differ in e.g. null columns) 124 | for schema_wrapped in set(map(SchemaWrapper, schemas)): 125 | schema = schema_wrapped.schema 126 | del schema_wrapped 127 | if has_pandas: 128 | metadata = schema.metadata 129 | if metadata is None or b"pandas" not in metadata: 130 | raise ValueError( 131 | "Pandas and non-Pandas schemas are not comparable. " 132 | "Use ignore_pandas=True if you only want to compare " 133 | "on Arrow level." 
134 | ) 135 | pandas_metadata = json.loads(metadata[b"pandas"].decode("utf8")) 136 | 137 | # we don't care about the pandas version, since we assume it's safe 138 | # to read datasets that were written by older or newer versions. 139 | pandas_metadata["pandas_version"] = f"{pd.__version__}" 140 | 141 | metadata_clean = deepcopy(metadata) 142 | metadata_clean[b"pandas"] = _dict_to_binary(pandas_metadata) 143 | current = pa.schema(schema, metadata_clean) 144 | else: 145 | current = schema 146 | 147 | # If a field is null we cannot compare it and must therefore reject it 148 | null_columns = {field.name for field in current if field.type == pa.null()} 149 | 150 | # Determine a valid reference schema. A valid reference schema is considered to be the schema 151 | # of all input schemas with the least empty columns. 152 | # The reference schema ought to be a schema whose empty columns are a true subset for all sets 153 | # of empty columns. This ensures that the actual reference schema is the schema with the most 154 | # information possible. A schema which doesn't fulfil this requirement would weaken the 155 | # comparison and would allow for false positives 156 | 157 | # Trivial case 158 | if reference is None: 159 | reference = current 160 | null_cols_in_reference = null_columns 161 | # The reference has enough information to validate against current schema. 162 | # Append it to the list of schemas to be verified 163 | elif null_cols_in_reference.issubset(null_columns): 164 | schemas_to_evaluate.append((current, list(null_columns))) 165 | # current schema includes all information of reference and more. 166 | # Add reference to schemas_to_evaluate and update reference 167 | elif null_columns.issubset(null_cols_in_reference): 168 | schemas_to_evaluate.append((reference, list(null_cols_in_reference))) 169 | reference = current 170 | null_cols_in_reference = null_columns 171 | # If there is no clear subset available elect the schema with the least null columns as `reference`. 172 | # Iterate over the null columns of `reference` and replace it with a non-null field of the `current` 173 | # schema which recovers the loop invariant (null columns of `reference` is subset of `current`) 174 | else: 175 | if len(null_columns) < len(null_cols_in_reference): 176 | reference, current = current, reference 177 | null_cols_in_reference, null_columns = ( 178 | null_columns, 179 | null_cols_in_reference, 180 | ) 181 | 182 | for col in null_cols_in_reference - null_columns: 183 | # Enrich the information in the reference by grabbing the missing fields 184 | # from the current iteration. This assumes that we only check for global validity and 185 | # isn't relevant where the reference comes from. 
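                # Illustration (hypothetical column names): with null columns {"a", "c"}
                # in the reference and {"b", "c"} in the current schema, neither set is a
                # subset of the other; the loop pulls the non-null "a" field from `current`
                # into `reference`, shrinking its null set to {"c"} and restoring the
                # subset invariant.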
186 | reference = _swap_fields_by_name(reference, current, col) 187 | null_cols_in_reference.remove(col) 188 | schemas_to_evaluate.append((current, list(null_columns))) 189 | 190 | assert (reference is not None) or (not schemas_to_evaluate) 191 | 192 | return reference, schemas_to_evaluate 193 | 194 | 195 | def _swap_fields_by_name(reference, current, field_name): 196 | current_field = current.field(field_name) 197 | reference_index = reference.get_field_index(field_name) 198 | return reference.set(reference_index, current_field) 199 | 200 | 201 | def _strip_columns_from_schema(schema, field_names): 202 | stripped_schema = schema 203 | 204 | for name in field_names: 205 | ix = stripped_schema.get_field_index(name) 206 | if ix >= 0: 207 | stripped_schema = stripped_schema.remove(ix) 208 | else: 209 | # If the returned index is negative, the field doesn't exist in the schema. 210 | # This is most likely an indicator for incompatible schemas and we refuse to strip the schema 211 | # to not obfurscate the validation result 212 | _logger.warning( 213 | "Unexpected field `%s` encountered while trying to strip `null` columns.\n" 214 | "Schema was:\n\n`%s`" % (name, schema) 215 | ) 216 | return schema 217 | return stripped_schema 218 | 219 | 220 | def _schema2bytes(schema: SchemaWrapper) -> bytes: 221 | buf = pa.BufferOutputStream() 222 | pq.write_metadata(schema, buf, coerce_timestamps="us") 223 | return buf.getvalue().to_pybytes() 224 | 225 | 226 | def _remove_diff_header(diff): 227 | diff = list(diff) 228 | for ix, el in enumerate(diff): 229 | # This marks the first actual entry of the diff 230 | # e.g. @@ -1,5 + 2,5 @@ 231 | if el.startswith("@"): 232 | return diff[ix:] 233 | return diff 234 | 235 | 236 | def _diff_schemas(first, second): 237 | # see https://issues.apache.org/jira/browse/ARROW-4176 238 | 239 | first_pyarrow_info = str(first.remove_metadata()) 240 | second_pyarrow_info = str(second.remove_metadata()) 241 | pyarrow_diff = _remove_diff_header( 242 | difflib.unified_diff( 243 | str(first_pyarrow_info).splitlines(), str(second_pyarrow_info).splitlines() 244 | ) 245 | ) 246 | 247 | first_pandas_info = first.pandas_metadata 248 | second_pandas_info = second.pandas_metadata 249 | pandas_meta_diff = _remove_diff_header( 250 | difflib.unified_diff( 251 | pprint.pformat(first_pandas_info).splitlines(), 252 | pprint.pformat(second_pandas_info).splitlines(), 253 | ) 254 | ) 255 | 256 | diff_string = ( 257 | "Arrow schema:\n" 258 | + "\n".join(pyarrow_diff) 259 | + "\n\nPandas_metadata:\n" 260 | + "\n".join(pandas_meta_diff) 261 | ) 262 | 263 | return diff_string 264 | 265 | 266 | def validate_compatible( 267 | schemas: Iterable[pa.Schema], ignore_pandas: bool = False 268 | ) -> pa.Schema: 269 | """Validate that all schemas in a given list are compatible. 270 | 271 | Apart from the pandas version preserved in the schema metadata, schemas must be completely identical. That includes 272 | a perfect match of the whole metadata (except the pandas version) and pyarrow types. 273 | 274 | In the case that all schemas don't contain any pandas metadata, we will check the Arrow 275 | schemas directly for compatibility. 276 | 277 | Parameters 278 | ---------- 279 | schemas: List[Schema] 280 | Schema information from multiple sources, e.g. multiple partitions. List may be empty. 281 | ignore_pandas: bool 282 | Ignore the schema information given by Pandas an always use the Arrow schema. 
283 | 284 | Returns 285 | ------- 286 | schema: SchemaWrapper 287 | The reference schema which was tested against 288 | 289 | Raises 290 | ------ 291 | ValueError 292 | At least two schemas are incompatible. 293 | """ 294 | reference, schemas_to_evaluate = _determine_schemas_to_compare( 295 | schemas, ignore_pandas 296 | ) 297 | 298 | for current, null_columns in schemas_to_evaluate: 299 | # We have schemas so the reference schema should be non-none. 300 | assert reference is not None 301 | # Compare each schema to the reference but ignore the null_cols and the Pandas schema information. 302 | reference_to_compare = _strip_columns_from_schema( 303 | reference, null_columns 304 | ).remove_metadata() 305 | current_to_compare = _strip_columns_from_schema( 306 | current, null_columns 307 | ).remove_metadata() 308 | 309 | def _fmt_origin(origin): 310 | origin = sorted(origin) 311 | # dask cuts of exception messages at 1k chars: 312 | # https://github.com/dask/distributed/blob/6e0c0a6b90b1d3c/distributed/core.py#L964 313 | # therefore, we cut the the maximum length 314 | max_len = 200 315 | inner_msg = ", ".join(origin) 316 | ellipsis = "..." 317 | if len(inner_msg) > max_len + len(ellipsis): 318 | inner_msg = inner_msg[:max_len] + ellipsis 319 | return f"{{{inner_msg}}}" 320 | 321 | if reference_to_compare != current_to_compare: 322 | schema_diff = _diff_schemas(reference, current) 323 | exception_message = """Schema violation 324 | 325 | Origin schema: {origin_schema} 326 | Origin reference: {origin_reference} 327 | 328 | Diff: 329 | {schema_diff} 330 | 331 | Reference schema: 332 | {reference}""".format( 333 | schema_diff=schema_diff, 334 | reference=str(reference), 335 | origin_schema=_fmt_origin(current.origin), 336 | origin_reference=_fmt_origin(reference.origin), 337 | ) 338 | raise ValueError(exception_message) 339 | 340 | # add all origins to result AFTER error checking, otherwise the error message would be pretty misleading due to the 341 | # reference containing all origins. 342 | if reference is None: 343 | return None 344 | else: 345 | return reference 346 | 347 | 348 | def _dict_to_binary(dct): 349 | return json.dumps(dct, sort_keys=True).encode("utf8") 350 | -------------------------------------------------------------------------------- /dask_deltatable/write.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import uuid 5 | from collections.abc import Mapping 6 | from datetime import datetime 7 | from pathlib import Path 8 | from typing import Any, Literal 9 | 10 | import dask 11 | import dask.dataframe as dd 12 | import pyarrow as pa 13 | import pyarrow.dataset as ds 14 | import pyarrow.fs as pa_fs 15 | from dask.core import flatten 16 | from deltalake import CommitProperties, DeltaTable 17 | from deltalake import Schema as DeltaSchema 18 | from deltalake.exceptions import DeltaProtocolError 19 | from deltalake.fs import DeltaStorageHandler 20 | from deltalake.table import MAX_SUPPORTED_PYARROW_WRITER_VERSION 21 | from deltalake.transaction import AddAction, create_table_with_add_actions 22 | from deltalake.writer.writer import try_get_table_and_table_uri 23 | from toolz.itertoolz import pluck 24 | 25 | from . 
import utils 26 | from ._schema import pyarrow_to_deltalake, validate_compatible 27 | 28 | PYARROW_MAJOR_VERSION = int(pa.__version__.split(".")[0]) 29 | 30 | 31 | def to_deltalake( 32 | table_or_uri: str | Path | DeltaTable, 33 | df: dd.DataFrame, 34 | *, 35 | schema: pa.Schema | None = None, 36 | partition_by: list[str] | str | None = None, 37 | filesystem: pa_fs.FileSystem | None = None, 38 | mode: Literal["error", "append", "overwrite", "ignore"] = "error", 39 | file_options: Mapping[str, Any] | None = None, 40 | max_partitions: int | None = None, 41 | max_open_files: int = 1024, 42 | max_rows_per_file: int = 10 * 1024 * 1024, 43 | min_rows_per_group: int = 64 * 1024, 44 | max_rows_per_group: int = 128 * 1024, 45 | name: str | None = None, 46 | description: str | None = None, 47 | configuration: Mapping[str, str | None] | None = None, 48 | overwrite_schema: bool = False, 49 | storage_options: dict[str, str] | None = None, 50 | partition_filters: list[tuple[str, str, Any]] | None = None, 51 | compute: bool = True, 52 | custom_metadata: dict[str, str] | None = None, 53 | ): 54 | """Write a given dask.DataFrame to a delta table. The returned value is a Dask Scalar, 55 | and the writing operation is only triggered when calling ``.compute()`` 56 | 57 | Parameters 58 | ---------- 59 | table_or_uri: str | Path | DeltaTable 60 | URI of a table or a DeltaTable object. 61 | df: dd.DataFrame 62 | Data to write 63 | schema : pa.Schema | None. Default None 64 | Optional schema to write. 65 | partition_by : list[str] | str | None. Default None 66 | List of columns to partition the table by. Only required 67 | when creating a new table 68 | filesystem : pa_fs.FileSystem | None. Default None 69 | Optional filesystem to pass to PyArrow. If not provided will 70 | be inferred from uri. The file system has to be rooted in the table root. 71 | Use the pyarrow.fs.SubTreeFileSystem, to adopt the root of pyarrow file systems. 72 | mode : Literal["error", "append", "overwrite", "ignore"]. Default "error" 73 | How to handle existing data. Default is to error if table already exists. 74 | If 'append', will add new data. 75 | If 'overwrite', will replace table with new data. 76 | If 'ignore', will not write anything if table already exists. 77 | file_options : Mapping[str, Any] | None. Default None 78 | Optional dict of options that can be used to initialize ParquetFileWriteOptions. 79 | Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx 80 | for the list of available options 81 | max_partitions : int | None. Default None 82 | The maximum number of partitions that will be used. 83 | max_open_files : int. Default 1024 84 | Limits the maximum number of 85 | files that can be left open while writing. If an attempt is made to open 86 | too many files then the least recently used file will be closed. 87 | If this setting is set too low you may end up fragmenting your 88 | data into many small files. 89 | max_rows_per_file : int. Default 10 * 1024 * 1024 90 | Maximum number of rows per file. 91 | If greater than 0 then this will limit how many rows are placed in any single file. 92 | Otherwise there will be no limit and one file will be created in each output directory 93 | unless files need to be closed to respect max_open_files 94 | min_rows_per_group : int. Default 64 * 1024 95 | Minimum number of rows per group. When the value is set, 96 | the dataset writer will batch incoming data and only write the row groups to the disk 97 | when sufficient rows have accumulated. 
98 | max_rows_per_group : int. Default 128 * 1024 99 | Maximum number of rows per group. 100 | If the value is set, then the dataset writer may split up large incoming batches into multiple row groups. 101 | If this value is set, then min_rows_per_group should also be set 102 | name: str | None. Default None 103 | User-provided identifier for this table. 104 | description : str | None. Default None 105 | User-provided description for this table 106 | configuration : Mapping[str, str | None] | None. Default None 107 | A map containing configuration options for the metadata action. 108 | overwrite_schema : bool. Default False 109 | If True, allows updating the schema of the table. 110 | storage_options : dict[str, str] | None. Default None 111 | Options passed to the native delta filesystem. Unused if 'filesystem' is defined 112 | partition_filters : list[tuple[str, str, Any]] | None. Default None 113 | The partition filters that will be used for partition overwrite. 114 | compute : bool. Default True 115 | Whether to trigger the writing operation immediately 116 | 117 | Returns 118 | ------- 119 | dask.Scalar 120 | """ 121 | storage_options = utils.maybe_set_aws_credentials(table_or_uri, storage_options) # type: ignore 122 | table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options) 123 | 124 | # We need to write against the latest table version 125 | if table: 126 | table.update_incremental() 127 | 128 | utils.enforce_append_only(table=table, configuration=configuration, mode=mode) 129 | 130 | if filesystem is None: 131 | if table is not None: 132 | storage_options = table._storage_options or {} 133 | storage_options.update(storage_options or {}) 134 | 135 | storage_options = utils.maybe_set_aws_credentials(table_uri, storage_options) 136 | filesystem = pa_fs.PyFileSystem(DeltaStorageHandler(table_uri, storage_options)) 137 | 138 | if isinstance(partition_by, str): 139 | partition_by = [partition_by] 140 | 141 | if schema is not None: 142 | schema = pyarrow_to_deltalake(schema) 143 | 144 | if table: # already exists 145 | if ( 146 | schema is not None 147 | and schema != pa.schema(table.schema()) 148 | and not (mode == "overwrite" and overwrite_schema) 149 | ): 150 | raise ValueError( 151 | "Schema of data does not match table schema\n" 152 | f"Table schema:\n{schema}\nData Schema:\n{table.schema().to_arrow()}" 153 | ) 154 | 155 | if mode == "error": 156 | raise AssertionError("DeltaTable already exists.") 157 | elif mode == "ignore": 158 | return 159 | 160 | current_version = table.version() 161 | 162 | if partition_by: 163 | assert partition_by == table.metadata().partition_columns 164 | else: 165 | partition_by = table.metadata().partition_columns 166 | 167 | if table.protocol().min_writer_version > MAX_SUPPORTED_PYARROW_WRITER_VERSION: 168 | raise DeltaProtocolError( 169 | "This table's min_writer_version is " 170 | f"{table.protocol().min_writer_version}, " 171 | f"but this method only supports version {MAX_SUPPORTED_PYARROW_WRITER_VERSION}." 
172 | ) 173 | else: # creating a new table 174 | current_version = -1 175 | 176 | # FIXME: schema is only known at this point if provided by the user 177 | if partition_by and schema: 178 | partition_schema = pa.schema([schema.field(name) for name in partition_by]) 179 | partitioning = ds.partitioning(partition_schema, flavor="hive") 180 | else: 181 | if partition_by: 182 | raise NotImplementedError("Have to provide schema when using partition_by") 183 | partitioning = None 184 | if mode == "overwrite": 185 | # FIXME: There are a couple of checks that are not migrated yet 186 | raise NotImplementedError("mode='overwrite' is not implemented") 187 | written = df.map_partitions( 188 | _write_partition, 189 | schema=schema, 190 | partitioning=partitioning, 191 | current_version=current_version, 192 | file_options=file_options, 193 | max_open_files=max_open_files, 194 | max_rows_per_file=max_rows_per_file, 195 | min_rows_per_group=min_rows_per_group, 196 | max_rows_per_group=max_rows_per_group, 197 | filesystem=filesystem, 198 | max_partitions=max_partitions, 199 | meta=(None, object), 200 | table=DaskDeltaTable.from_delta_table(table) if table else None, 201 | configuration=configuration, 202 | ) 203 | result = dask.delayed(_commit, name="deltatable-commit")( 204 | DaskDeltaTable.from_delta_table(table) if table else None, 205 | written, 206 | table_uri, 207 | schema, 208 | mode, 209 | partition_by, 210 | name, 211 | description, 212 | configuration, 213 | storage_options, 214 | partition_filters, 215 | custom_metadata, 216 | ) 217 | 218 | if compute: 219 | result = result.compute() 220 | return result 221 | 222 | 223 | def _commit( 224 | table, 225 | schemas_add_actions_nested, 226 | table_uri, 227 | schema, 228 | mode, 229 | partition_by, 230 | name, 231 | description, 232 | configuration, 233 | storage_options, 234 | partition_filters, 235 | custom_metadata, 236 | ): 237 | schemas = list(flatten(pluck(0, schemas_add_actions_nested))) 238 | add_actions = list(flatten(pluck(1, schemas_add_actions_nested))) 239 | # TODO: What should the behavior be if the schema is provided? Cast the 240 | # data? 
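    # The schemas gathered from every written partition (plus the user-provided
    # schema, if any) are reduced to a single reference schema below; if they are
    # not compatible, validate_compatible raises before anything is committed.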
241 | if schema: 242 | schemas.append(schema) 243 | 244 | # TODO: This is applying a potentially stricter schema control than what 245 | # Delta requires but if this passes, it should be good to go 246 | schema = validate_compatible(schemas) 247 | assert schema 248 | delta_schema = DeltaSchema.from_arrow(schema) 249 | commit_properties = CommitProperties(custom_metadata=custom_metadata) 250 | if table is None: 251 | storage_options = utils.maybe_set_aws_credentials(table_uri, storage_options) 252 | create_table_with_add_actions( 253 | table_uri, 254 | delta_schema, 255 | add_actions, 256 | mode, 257 | partition_by or [], 258 | name, 259 | description, 260 | configuration, 261 | storage_options, 262 | commit_properties, 263 | ) 264 | else: 265 | table._table.create_write_transaction( 266 | add_actions, 267 | mode, 268 | partition_by or [], 269 | delta_schema, 270 | partition_filters, 271 | ) 272 | table.update_incremental() 273 | 274 | 275 | def _write_partition( 276 | df, 277 | *, 278 | schema, 279 | partitioning, 280 | current_version, 281 | file_options, 282 | max_open_files, 283 | max_rows_per_file, 284 | min_rows_per_group, 285 | max_rows_per_group, 286 | filesystem, 287 | max_partitions, 288 | table, 289 | configuration, 290 | ) -> tuple[pa.Schema, list[AddAction]]: 291 | if schema is None: 292 | # 293 | schema = pyarrow_to_deltalake(pa.Schema.from_pandas(df)) 294 | data = pa.Table.from_pandas(df, schema=schema) 295 | 296 | add_actions: list[AddAction] = [] 297 | 298 | def visitor(written_file: Any) -> None: 299 | num_indexed_cols, stats_cols = utils.get_num_idx_cols_and_stats_columns( 300 | table if table is not None else None, configuration 301 | ) 302 | path, partition_values = utils.get_partitions_from_path(written_file.path) 303 | stats = utils.get_file_stats_from_metadata( 304 | written_file.metadata, num_indexed_cols, stats_cols 305 | ) 306 | 307 | # PyArrow added support for written_file.size in 9.0.0 308 | if PYARROW_MAJOR_VERSION >= 9: 309 | size = written_file.size 310 | else: 311 | size = filesystem.get_file_info([path])[0].size 312 | 313 | add_actions.append( 314 | AddAction( 315 | path, 316 | size, 317 | partition_values, 318 | int(datetime.now().timestamp() * 1000), 319 | True, 320 | json.dumps(stats, cls=utils.DeltaJSONEncoder), 321 | ) 322 | ) 323 | 324 | if file_options is not None: 325 | file_options = ds.ParquetFileFormat().make_write_options(**file_options) 326 | 327 | ds.write_dataset( 328 | data, 329 | base_dir="/", 330 | basename_template=f"{current_version + 1}-{uuid.uuid4()}-{{i}}.parquet", 331 | format="parquet", 332 | partitioning=partitioning, 333 | # It will not accept a schema if using a RBR 334 | schema=schema, 335 | existing_data_behavior="overwrite_or_ignore", 336 | file_options=file_options, 337 | max_open_files=max_open_files, 338 | file_visitor=visitor, 339 | max_rows_per_file=max_rows_per_file, 340 | min_rows_per_group=min_rows_per_group, 341 | max_rows_per_group=max_rows_per_group, 342 | filesystem=filesystem, 343 | max_partitions=max_partitions, 344 | ) 345 | return schema, add_actions 346 | 347 | 348 | class DaskDeltaTable(DeltaTable): 349 | @classmethod 350 | def from_delta_table( 351 | cls, 352 | table: DeltaTable, 353 | ) -> DaskDeltaTable: 354 | config = table.table_config 355 | return cls( 356 | table_uri=table.table_uri, 357 | version=table.version(), 358 | storage_options=table._storage_options, 359 | without_files=config.without_files, 360 | log_buffer_size=config.log_buffer_size, 361 | ) 362 | 363 | def __reduce__(self) -> tuple[type, 
tuple[Any, ...]]: 364 | """ 365 | This allows DeltaTable to be pickled. 366 | """ 367 | config = self.table_config 368 | return ( 369 | self.__class__, 370 | ( 371 | self.table_uri, 372 | self.version(), 373 | self._storage_options, 374 | config.without_files, 375 | config.log_buffer_size, 376 | ), 377 | ) 378 | -------------------------------------------------------------------------------- /dask_deltatable/core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from collections.abc import Sequence 5 | from typing import Any, Callable, cast 6 | 7 | import dask 8 | import dask.dataframe as dd 9 | import pyarrow as pa 10 | import pyarrow.parquet as pq 11 | from dask.base import tokenize 12 | from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine 13 | from dask.dataframe.utils import make_meta 14 | from deltalake import DeltaTable 15 | from fsspec.core import get_fs_token_paths 16 | from packaging.version import Version 17 | from pyarrow import dataset as pa_ds 18 | 19 | from . import utils 20 | from .types import Filters 21 | 22 | if Version(pa.__version__) >= Version("10.0.0"): 23 | filters_to_expression = pq.filters_to_expression 24 | else: 25 | # fallback to older internal method 26 | filters_to_expression = pq._filters_to_expression 27 | 28 | 29 | def _get_pq_files(dt: DeltaTable, filter: Filters = None) -> list[str]: 30 | """ 31 | Get the list of parquet files after loading the 32 | current datetime version 33 | 34 | Parameters 35 | ---------- 36 | dt : DeltaTable 37 | DeltaTable instance 38 | filter : list[tuple[str, str, Any]] | list[list[tuple[str, str, Any]]] | None 39 | Filters in DNF form. 40 | 41 | Returns 42 | ------- 43 | list[str] 44 | List of files matching optional filter. 
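
    Examples
    --------
    For illustration only (``"year"`` is an assumed partition column, not one
    defined by this project): ``filter=[("year", "==", 2021)]`` prunes the file
    listing to matching partitions, whereas an AND-group made up purely of
    non-partition (row-level) filters disables pruning and every file is returned.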
45 | """ 46 | partition_filters = utils.get_partition_filters( 47 | dt.metadata().partition_columns, filter 48 | ) 49 | if not partition_filters: 50 | # can't filter 51 | return sorted(dt.file_uris()) 52 | file_uris = set() 53 | for filter_set in partition_filters: 54 | file_uris.update(dt.file_uris(partition_filters=filter_set)) 55 | return sorted(list(file_uris)) 56 | 57 | 58 | def _read_delta_partition( 59 | filename: str, 60 | schema: pa.Schema, 61 | fs: Any, 62 | columns: Sequence[str] | None, 63 | filter: Filters = None, 64 | pyarrow_to_pandas: dict[str, Any] | None = None, 65 | **_kwargs: dict[str, Any], 66 | ): 67 | filter_expression = filters_to_expression(filter) if filter else None 68 | if pyarrow_to_pandas is None: 69 | pyarrow_to_pandas = {} 70 | pyarrow_to_pandas["types_mapper"] = _get_type_mapper( 71 | pyarrow_to_pandas.get("types_mapper") 72 | ) 73 | pyarrow_to_pandas["ignore_metadata"] = pyarrow_to_pandas.get( 74 | "ignore_metadata", False 75 | ) 76 | table = pa_ds.dataset( 77 | source=filename, 78 | schema=schema, 79 | filesystem=fs, 80 | format="parquet", 81 | partitioning="hive", 82 | ).to_table(filter=filter_expression, columns=columns) 83 | return table.to_pandas(**pyarrow_to_pandas) 84 | 85 | 86 | def _read_from_filesystem( 87 | path: str, 88 | version: int | None, 89 | columns: Sequence[str] | None, 90 | datetime: str | None = None, 91 | storage_options: dict[str, str] | None = None, 92 | delta_storage_options: dict[str, str] | None = None, 93 | **kwargs: dict[str, Any], 94 | ) -> dd.DataFrame: 95 | """ 96 | Reads the list of parquet files in parallel 97 | """ 98 | storage_options = utils.maybe_set_aws_credentials(path, storage_options) # type: ignore 99 | delta_storage_options = utils.maybe_set_aws_credentials(path, delta_storage_options) # type: ignore 100 | 101 | fs, fs_token, _ = get_fs_token_paths(path, storage_options=storage_options) 102 | dt = DeltaTable( 103 | table_uri=path, version=version, storage_options=delta_storage_options 104 | ) 105 | if datetime is not None: 106 | dt.load_as_version(datetime) 107 | 108 | schema = pa.schema(dt.schema()) 109 | 110 | filter_value = cast(Filters, kwargs.get("filter", None)) 111 | pq_files = _get_pq_files(dt, filter=filter_value) 112 | 113 | mapper_kwargs = kwargs.get("pyarrow_to_pandas", {}) 114 | mapper_kwargs["types_mapper"] = _get_type_mapper( 115 | mapper_kwargs.get("types_mapper", None) 116 | ) 117 | meta = make_meta(pa.table(schema.empty_table()).to_pandas(**mapper_kwargs)) 118 | if columns: 119 | meta = meta[columns] 120 | 121 | if not dd._dask_expr_enabled(): 122 | # Setting token not supported in dask-expr 123 | kwargs["token"] = tokenize(path, fs_token, **kwargs) # type: ignore 124 | 125 | if len(pq_files) == 0: 126 | df = schema.empty_table().to_pandas() 127 | if columns is not None: 128 | df = df[columns] 129 | return dd.from_pandas(df, npartitions=1) 130 | else: 131 | return dd.from_map( 132 | _read_delta_partition, 133 | pq_files, 134 | fs=fs, 135 | columns=columns, 136 | schema=schema, 137 | meta=meta, 138 | label="read-delta-table", 139 | **kwargs, 140 | ) 141 | 142 | 143 | def _get_type_mapper( 144 | user_types_mapper: dict[str, Any] | None, 145 | ) -> Callable[[Any], Any] | None: 146 | """ 147 | Set the type mapper for the schema 148 | """ 149 | convert_string = dask.config.get("dataframe.convert-string", True) 150 | if convert_string is None: 151 | convert_string = True 152 | return ArrowDatasetEngine._determine_type_mapper( 153 | dtype_backend=None, 154 | convert_string=convert_string, 155 | 
arrow_to_pandas={"types_mapper": user_types_mapper}, 156 | ) 157 | 158 | 159 | def read_deltalake( 160 | path: str | None = None, 161 | catalog: str | None = None, 162 | database_name: str | None = None, 163 | table_name: str | None = None, 164 | version: int | None = None, 165 | columns: list[str] | None = None, 166 | storage_options: dict[str, str] | None = None, 167 | datetime: str | None = None, 168 | delta_storage_options: dict[str, str] | None = None, 169 | **kwargs, 170 | ): 171 | """ 172 | Read a Delta Table into a Dask DataFrame 173 | 174 | This reads a list of Parquet files in delta table directory into a 175 | Dask.dataframe. 176 | 177 | Parameters 178 | ---------- 179 | path: Optional[str] 180 | path of Delta table directory 181 | catalog: Optional[str] 182 | Currently supports only AWS Glue Catalog 183 | if catalog is provided, user has to provide database and table name, and 184 | delta-rs will fetch the metadata from glue catalog, this is used by dask to read 185 | the parquet tables 186 | database_name: Optional[str] 187 | database name present in the catalog 188 | tablename: Optional[str] 189 | table name present in the database of the Catalog 190 | version: int, default None 191 | DeltaTable Version, used for Time Travelling across the 192 | different versions of the parquet datasets 193 | datetime: str, default None 194 | Time travel Delta table to the latest version that's created at or 195 | before provided `datetime_string` argument. 196 | The `datetime_string` argument should be an RFC 3339 and ISO 8601 date 197 | and time string. 198 | 199 | Examples: 200 | `2018-01-26T18:30:09Z` 201 | `2018-12-19T16:39:57-08:00` 202 | `2018-01-26T18:30:09.453+00:00` 203 | #(copied from delta-rs docs) 204 | columns: None or list(str) 205 | Columns to load. If None, loads all. 206 | storage_options : dict, default None 207 | Key/value pairs to be passed on to the fsspec backend, if any. 208 | delta_storage_options : dict, default None 209 | Key/value pairs to be passed on to the delta-rs filesystem, if any. 210 | kwargs: dict,optional 211 | Some most used parameters can be passed here are: 212 | 1. schema 213 | 2. filter 214 | 3. pyarrow_to_pandas 215 | 216 | schema: pyarrow.Schema 217 | Used to maintain schema evolution in deltatable. 218 | delta protocol stores the schema string in the json log files which is 219 | converted into pyarrow.Schema and used for schema evolution 220 | i.e Based on particular version, some columns can be 221 | shown or not shown. 222 | 223 | filter: Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None 224 | List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``. 225 | Can act as both partition as well as row based filter, above list of filters 226 | converted into pyarrow.dataset.Expression built using pyarrow.dataset.Field 227 | example: 228 | [("x",">",400)] --> pyarrow.dataset.field("x")>400 229 | 230 | pyarrow_to_pandas: dict 231 | Options to pass directly to pyarrow.Table.to_pandas. 232 | Common options include: 233 | * categories: list[str] 234 | List of columns to treat as pandas.Categorical 235 | * strings_to_categorical: bool 236 | Encode string (UTF8) and binary types to pandas.Categorical. 237 | * types_mapper: Callable 238 | A function mapping a pyarrow DataType to a pandas ExtensionDtype 239 | 240 | See https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas 241 | for more. 
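        For example (illustrative only), ``pyarrow_to_pandas={"types_mapper": {pa.string(): pd.StringDtype()}.get}``
        maps Arrow strings to the pandas nullable string dtype; any callable accepted
        by ``pyarrow.Table.to_pandas`` works.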
242 | 243 | Returns 244 | ------- 245 | Dask.DataFrame 246 | 247 | Examples 248 | -------- 249 | >>> import dask_deltatable as ddt 250 | >>> df = ddt.read_deltalake('s3://bucket/my-delta-table') # doctest: +SKIP 251 | 252 | """ 253 | if catalog is not None: 254 | if (database_name is None) or (table_name is None): 255 | raise ValueError( 256 | "Since Catalog was provided, please provide Database and table name" 257 | ) 258 | else: 259 | raise NotImplementedError( 260 | "Reading from a catalog used to be supported ", 261 | "but was removed from the upstream dependency delta-rs>=1.0.", 262 | ) 263 | else: 264 | if path is None: 265 | raise ValueError("Please Provide Delta Table path") 266 | 267 | delta_storage_options = utils.maybe_set_aws_credentials(path, delta_storage_options) # type: ignore 268 | resultdf = _read_from_filesystem( 269 | path=path, 270 | version=version, 271 | columns=columns, 272 | storage_options=storage_options, 273 | datetime=datetime, 274 | delta_storage_options=delta_storage_options, 275 | **kwargs, 276 | ) 277 | return resultdf 278 | 279 | 280 | def read_unity_catalog( 281 | catalog_name: str, 282 | schema_name: str, 283 | table_name: str, 284 | **kwargs, 285 | ) -> dd.DataFrame: 286 | """ 287 | Read a Delta Table from Databricks Unity Catalog into a Dask DataFrame. 288 | 289 | This function connects to Databricks using the WorkspaceClient and retrieves 290 | temporary credentials to access the specified Unity Catalog table. It then 291 | reads the Delta table's Parquet files into a Dask DataFrame. 292 | 293 | Parameters 294 | ---------- 295 | catalog_name : str 296 | Name of the Unity Catalog catalog. 297 | schema_name : str 298 | Name of the schema within the catalog. 299 | table_name : str 300 | Name of the table within the catalog schema. 301 | **kwargs 302 | Additional keyword arguments passed to `dask.dataframe.read_parquet`. 303 | Some most used parameters can be passed here are: 304 | 1. schema 305 | 2. filter 306 | 3. pyarrow_to_pandas 307 | 4. databricks_host 308 | 5. databricks_token 309 | 310 | schema: pyarrow.Schema 311 | Used to maintain schema evolution in deltatable. 312 | delta protocol stores the schema string in the json log files which is 313 | converted into pyarrow.Schema and used for schema evolution 314 | i.e Based on particular version, some columns can be 315 | shown or not shown. 316 | 317 | filter: Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None 318 | List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``. 319 | Can act as both partition as well as row based filter, above list of filters 320 | converted into pyarrow.dataset.Expression built using pyarrow.dataset.Field 321 | example: 322 | [("x",">",400)] --> pyarrow.dataset.field("x")>400 323 | 324 | pyarrow_to_pandas: dict 325 | Options to pass directly to pyarrow.Table.to_pandas. 326 | Common options include: 327 | * categories: list[str] 328 | List of columns to treat as pandas.Categorical 329 | * strings_to_categorical: bool 330 | Encode string (UTF8) and binary types to pandas.Categorical. 331 | * types_mapper: Callable 332 | A function mapping a pyarrow DataType to a pandas ExtensionDtype 333 | 334 | See https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas 335 | for more. 336 | 337 | databricks_host: str 338 | The Databricks workspace URL hosting the Unity Catalog. 339 | 340 | databricks_token: str 341 | A Databricks personal access token with at least read access on the catalog. 
342 | 343 | Returns 344 | ------- 345 | dask.dataframe.DataFrame 346 | A Dask DataFrame representing the Delta table. 347 | 348 | Notes 349 | ----- 350 | Requires the following to be set as either environment variables or in `kwargs` as 351 | lower case: 352 | - DATABRICKS_HOST: The Databricks workspace URL hosting the Unity Catalog. 353 | - DATABRICKS_TOKEN: A Databricks personal access token with at least read access on 354 | the catalog. 355 | 356 | Example 357 | ------- 358 | >>> ddf = read_unity_catalog( 359 | ... catalog_name="main", 360 | ... schema_name="my_schema", 361 | ... table_name="my_table", 362 | ... ) 363 | """ 364 | from databricks.sdk import WorkspaceClient 365 | from databricks.sdk.service.catalog import TableOperation 366 | 367 | try: 368 | workspace_client = WorkspaceClient( 369 | host=os.environ.get("DATABRICKS_HOST", kwargs["databricks_host"]), 370 | token=os.environ.get("DATABRICKS_TOKEN", kwargs["databricks_token"]), 371 | ) 372 | except KeyError: 373 | raise ValueError( 374 | "Please set `DATABRICKS_HOST` and `DATABRICKS_TOKEN` either as environment" 375 | " variables or as part of `kwargs` with lowercase" 376 | ) 377 | uc_full_url = f"{catalog_name}.{schema_name}.{table_name}" 378 | table = workspace_client.tables.get(uc_full_url) 379 | temp_credentials = workspace_client.temporary_table_credentials.generate_temporary_table_credentials( 380 | operation=TableOperation.READ, 381 | table_id=table.table_id, 382 | ) 383 | storage_options = { 384 | "sas_token": temp_credentials.azure_user_delegation_sas.sas_token 385 | } 386 | delta_table = DeltaTable( 387 | table_uri=table.storage_location, storage_options=storage_options 388 | ) 389 | ddf = dd.read_parquet( 390 | path=delta_table.file_uris(), 391 | storage_options=storage_options, 392 | **kwargs, 393 | ) 394 | return ddf 395 | --------------------------------------------------------------------------------
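
For reference, a minimal, hedged usage sketch of the public API collected above (``to_deltalake`` and ``read_deltalake``). The table path and the ``year``/``value`` columns are illustrative assumptions, not anything defined by this project, and the snippet assumes a fresh local directory.

# Illustrative only; the path and column names are placeholders.
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa

import dask_deltatable as ddt

ddf = dd.from_pandas(
    pd.DataFrame({"year": [2020, 2020, 2021], "value": [1.0, 2.0, 3.0]}),
    npartitions=2,
)

# Creating a brand-new partitioned table requires an explicit schema;
# to_deltalake raises NotImplementedError if partition_by is set without one.
schema = pa.schema([("year", pa.int64()), ("value", pa.float64())])
ddt.to_deltalake(
    "/tmp/demo_delta_table",  # assumed table URI
    ddf,
    schema=schema,
    partition_by="year",
    mode="error",  # the default: fail if the table already exists
)

# Filters on partition columns prune files via get_partition_filters before
# any data is read; filters on other columns are applied row-wise per file.
result = ddt.read_deltalake(
    "/tmp/demo_delta_table",
    columns=["value"],
    filter=[("year", "==", 2021)],
)
print(result.compute())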