├── dask_deltatable
│   ├── py.typed
│   ├── types.py
│   ├── __init__.py
│   ├── utils.py
│   ├── _schema.py
│   ├── write.py
│   └── core.py
├── MANIFEST.in
├── requirements.txt
├── tests
│   ├── data
│   │   ├── empty1.zip
│   │   ├── empty2.zip
│   │   ├── simple.zip
│   │   ├── simple2.zip
│   │   ├── partition.zip
│   │   └── checkpoint.zip
│   ├── test_distributed.py
│   ├── test_write.py
│   ├── test_utils.py
│   ├── test_acceptance.py
│   └── test_core.py
├── dev_requirements.txt
├── .flake8
├── continous_integeration
│   ├── environment-3.10.yaml
│   ├── environment-3.11.yaml
│   ├── environment-3.12.yaml
│   └── environment-3.9.yaml
├── .github
│   └── workflows
│       ├── pre-commit.yaml
│       ├── deploy.yaml
│       └── tests.yaml
├── pyproject.toml
├── setup.cfg
├── .pre-commit-config.yaml
├── setup.py
├── LICENSE
├── conftest.py
├── .gitignore
└── README.md

/dask_deltatable/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include dask_deltatable/py.typed
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dask[dataframe]
2 | deltalake>=1.1.0
3 | fsspec
4 | pyarrow
--------------------------------------------------------------------------------
/tests/data/empty1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/empty1.zip
--------------------------------------------------------------------------------
/tests/data/empty2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/empty2.zip
--------------------------------------------------------------------------------
/tests/data/simple.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/simple.zip
--------------------------------------------------------------------------------
/tests/data/simple2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/simple2.zip
--------------------------------------------------------------------------------
/tests/data/partition.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/partition.zip
--------------------------------------------------------------------------------
/tests/data/checkpoint.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dask-contrib/dask-deltatable/main/tests/data/checkpoint.zip
--------------------------------------------------------------------------------
/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | mypy==0.991
2 | mypy-extensions==0.4.3
3 | pytest==7.2.0
4 | pytest-cov==4.0.0
5 | black==22.3.0
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | # flake8 doesn't support pyproject.toml yet
https://github.com/PyCQA/flake8/issues/234 2 | [flake8] 3 | max-line-length = 120 4 | -------------------------------------------------------------------------------- /dask_deltatable/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Union 4 | 5 | Filter = tuple[str, str, Any] 6 | Filters = Union[list[Filter], list[list[Filter]], None] 7 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.10.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.11.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.12.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.12 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /continous_integeration/environment-3.9.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.9 6 | - dask 7 | - pyarrow 8 | - pytest 9 | - pytest-cov 10 | -------------------------------------------------------------------------------- /dask_deltatable/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = [ 4 | "read_deltalake", 5 | "read_unity_catalog", 6 | "to_deltalake", 7 | ] 8 | 9 | from .core import read_deltalake as read_deltalake 10 | from .core import read_unity_catalog as read_unity_catalog 11 | from .write import to_deltalake as to_deltalake 12 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Python Style Check 3 | on: [push,pull_request] 4 | 5 | jobs: 6 | checks: 7 | name: pre-commit hooks 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3.5.2 11 | - uses: actions/setup-python@v4 12 | with: 13 | python-version: '3.9' 14 | - uses: pre-commit/action@v3.0.0 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.mypy] 2 | strict = true 3 | no_implicit_reexport = false 4 | allow_incomplete_defs = true 5 | allow_untyped_defs = true 6 | warn_return_any = false 7 | disallow_untyped_calls = false 8 | ignore_missing_imports = true 9 | 10 | [[tool.mypy.overrides]] 11 | module = "pyarrow.*" 12 | ignore_missing_imports = true 13 | 14 | [tool.isort] 15 | profile = "black" 16 | add_imports = ["from __future__ import annotations"] 17 | 18 | [tool.black] 19 | 
target-version = ['py310'] 20 | include = '\.pyi?$' 21 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # flake8 doesn't support pyproject.toml yet https://github.com/PyCQA/flake8/issues/234 2 | [flake8] 3 | exclude = __init__.py 4 | max-line-length = 120 5 | ignore = 6 | # Extra space in brackets 7 | E20 8 | # Multiple spaces around "," 9 | E231,E241 10 | # Comments 11 | E26 12 | # Import formatting 13 | E4 14 | # Comparing types instead of isinstance 15 | E721 16 | # Assigning lambda expression 17 | E731 18 | # Ambiguous variable names 19 | E741 20 | # Line break before binary operator 21 | W503 22 | # Line break after binary operator 23 | W504 24 | # Redefinition of unused 'loop' from line 10 25 | F811 26 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created, updated] 6 | 7 | jobs: 8 | deploy: 9 | name: Deploy 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: conda-incubator/setup-miniconda@v2 15 | with: 16 | miniforge-variant: Mambaforge 17 | use-mamba: true 18 | python-version: 3.9 19 | - name: Install dependencies 20 | shell: bash -l {0} 21 | run: | 22 | pip install setuptools wheel twine 23 | which python 24 | pip list 25 | conda list 26 | - name: Build and publish 27 | shell: bash -l {0} 28 | env: 29 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 30 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 31 | run: | 32 | python setup.py sdist bdist_wheel 33 | twine upload dist/* 34 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 25.1.0 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | - repo: https://github.com/pycqa/isort 8 | rev: 6.0.1 9 | hooks: 10 | - id: isort 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v5.0.0 13 | hooks: 14 | - id: trailing-whitespace 15 | - id: end-of-file-fixer 16 | - id: check-yaml 17 | - id: check-added-large-files 18 | - repo: https://github.com/asottile/pyupgrade 19 | rev: v3.20.0 20 | hooks: 21 | - id: pyupgrade 22 | args: 23 | - --py39-plus 24 | - repo: https://github.com/pre-commit/mirrors-mypy 25 | rev: v1.16.1 26 | hooks: 27 | - id: mypy 28 | # Override default --ignore-missing-imports 29 | # Use pyproject.toml if possible instead of adding command line parameters here 30 | args: [--warn-unused-configs] 31 | additional_dependencies: 32 | # Type stubs 33 | - boto3-stubs 34 | - dask 35 | - deltalake>=0.16 36 | - pandas-stubs 37 | - pytest 38 | - types-setuptools 39 | - repo: https://github.com/pycqa/flake8 40 | rev: 7.3.0 41 | hooks: 42 | - id: flake8 43 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import annotations 4 | 5 | from setuptools import setup 6 | 7 | with open("README.md", encoding="utf-8") as f: 8 | long_description = f.read() 9 | 10 | setup( 11 | name="dask-deltatable", 12 | version="0.4.0", 13 | description="Dask + Delta Table ", 
14 | url="https://github.com/dask-contrib/dask-deltatable/", 15 | maintainer="rajagurunath", 16 | maintainer_email="gurunathrajagopal@gmail.com", 17 | license="BSD-3-Clause", 18 | packages=["dask_deltatable"], 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | python_requires=">=3.9", 22 | install_requires=open("requirements.txt").read().strip().split("\n"), 23 | extras_require={ 24 | "dev": ["pytest", "requests", "pytest-cov>=2.10.1"], 25 | "s3": ["s3fs", "boto3"], 26 | "uc": ["adlfs", "databricks-sdk"], 27 | }, 28 | classifiers=[ 29 | "Development Status :: 5 - Production/Stable", 30 | "Intended Audience :: Developers", 31 | "Intended Audience :: Science/Research", 32 | "Topic :: Database", 33 | "Topic :: Scientific/Engineering", 34 | "License :: OSI Approved :: BSD License", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3 :: Only", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | ], 41 | include_package_data=True, 42 | zip_safe=False, 43 | ) 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Dask contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import zipfile 5 | 6 | import pytest 7 | 8 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 9 | DATA_DIR = os.path.join(ROOT_DIR, "tests", "data") 10 | 11 | 12 | @pytest.fixture() 13 | def simple_table(tmpdir): 14 | output_dir = tmpdir 15 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/simple.zip") 16 | deltaf.extractall(output_dir) 17 | return str(output_dir) + "/test1/" 18 | 19 | 20 | @pytest.fixture() 21 | def simple_table2(tmpdir): 22 | output_dir = tmpdir 23 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/simple2.zip") 24 | deltaf.extractall(output_dir) 25 | return str(output_dir) + "/simple_table/" 26 | 27 | 28 | @pytest.fixture() 29 | def partition_table(tmpdir): 30 | output_dir = tmpdir 31 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/partition.zip") 32 | deltaf.extractall(output_dir) 33 | return str(output_dir) + "/test2/" 34 | 35 | 36 | @pytest.fixture() 37 | def empty_table1(tmpdir): 38 | output_dir = tmpdir 39 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/empty1.zip") 40 | deltaf.extractall(output_dir) 41 | return str(output_dir) + "/empty/" 42 | 43 | 44 | @pytest.fixture() 45 | def empty_table2(tmpdir): 46 | output_dir = tmpdir 47 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/empty2.zip") 48 | deltaf.extractall(output_dir) 49 | return str(output_dir) + "/empty2/" 50 | 51 | 52 | @pytest.fixture() 53 | def checkpoint_table(tmpdir): 54 | output_dir = tmpdir 55 | deltaf = zipfile.ZipFile(f"{DATA_DIR}/checkpoint.zip") 56 | deltaf.extractall(output_dir) 57 | return str(output_dir) + "/checkpoint/" 58 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | # When this workflow is queued, automatically cancel any previous running 6 | # or pending jobs from the same branch 7 | concurrency: 8 | group: tests-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | test: 13 | runs-on: ${{ matrix.os }} 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | strategy: 18 | matrix: 19 | os: ["windows-latest", "ubuntu-latest", "macos-latest"] 20 | python-version: ["3.9", "3.10", "3.11", "3.12"] 21 | 22 | steps: 23 | - name: Checkout source 24 | uses: actions/checkout@v3.5.3 25 | with: 26 | fetch-depth: 0 # Needed by codecov.io 27 | 28 | - name: Setup Conda Environment 29 | uses: conda-incubator/setup-miniconda@v3.2.0 30 | with: 31 | miniforge-version: latest 32 | channel-priority: strict 33 | python-version: ${{ matrix.python-version }} 34 | environment-file: continous_integeration/environment-${{ matrix.python-version }}.yaml 35 | activate-environment: test-environment 36 | auto-activate-base: false 37 | 38 | - name: Install dask-deltatable 39 | run: python -m pip install -e ".[dev]" 40 | 41 | - name: conda list 42 | run: conda list 43 | 44 | - name: Run tests 45 | id: run_tests 46 | run: | 47 | set -o pipefail 48 | mkdir reports 49 | 50 | python -m pytest tests \ 51 | --junitxml=reports/test-results.xml \ 52 | --cov-report=xml \ 53 | --cov dask_deltatable \ 54 | | tee reports/stdout 55 | 56 | - name: Upload test results 57 | # ensure this runs even if pytest fails 58 | if: > 59 | always() && 60 | (steps.run_tests.outcome == 'success' || steps.run_tests.outcome == 'failure') 61 | 
uses: actions/upload-artifact@v4 62 | with: 63 | name: my-artifacts-${{ strategy.job-index }} 64 | path: reports 65 | 66 | - name: Upload coverage to Codecov 67 | uses: codecov/codecov-action@v3 68 | with: 69 | token: ${{ secrets.CODECOV_TOKEN }} 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dask-worker-space/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # PyCharm project settings 123 | .idea 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | .DS_Store 136 | junit/ 137 | 138 | # downloaded DAT files: https://github.com/delta-incubator/dat 139 | tests/out/ 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Dask-DeltaTable 2 | 3 | Reading and writing to Delta Lake using Dask engine. 
4 |
5 | ### Installation
6 |
7 | `dask-deltatable` is available on PyPI:
8 |
9 | ```
10 | pip install dask-deltatable
11 | ```
12 |
13 | And from conda-forge:
14 |
15 | ```
16 | conda install -c conda-forge dask-deltatable
17 | ```
18 |
19 | ### Features
20 |
21 | 1. Read the parquet files from Delta Lake and parallelize with Dask
22 | 2. Write Dask dataframes to Delta Lake (limited support)
23 | 3. Supports multiple filesystems (s3, azurefs, gcsfs)
24 | 4. Subset of Delta Lake features:
25 |    - Time Travel
26 |    - Schema evolution
27 |    - Parquet filters (see the sketch appended at the end of this README)
28 |      - row filter
29 |      - partition filter
30 |
31 | ### Not supported
32 |
33 | 1. Writing to Delta Lake is still in development.
34 | 2. `optimize` API to run a bin-packing operation on a Delta Table.
35 |
36 | ### Reading from Delta Lake
37 |
38 | ```python
39 | import dask_deltatable as ddt
40 |
41 | # read delta table
42 | df = ddt.read_deltalake("delta_path")
43 |
44 | # with specific version
45 | df = ddt.read_deltalake("delta_path", version=3)
46 |
47 | # with specific datetime
48 | df = ddt.read_deltalake("delta_path", datetime="2018-12-19T16:39:57-08:00")
49 | ```
50 |
51 | `df` is a Dask DataFrame that you can work with in the same way you normally would. See
52 | [the Dask DataFrame documentation](https://docs.dask.org/en/stable/dataframe.html) for
53 | available operations.
54 |
55 | ### Accessing remote file systems
56 |
57 | To be able to read from S3, Azure, GCS, and other remote filesystems, make sure
58 | the credentials are properly configured in environment variables
59 | or config files. For AWS, you may need `~/.aws/credentials`; for gcsfs,
60 | `GOOGLE_APPLICATION_CREDENTIALS`. Refer to your cloud provider's documentation
61 | to configure these.
62 |
63 | ```python
64 | ddt.read_deltalake("s3://bucket_name/delta_path", version=3)
65 | ```
66 |
67 | ### Accessing AWS Glue catalog
68 |
69 | `dask-deltatable` can connect to the AWS Glue catalog to read a Delta table.
70 | The method will look for the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`
71 | environment variables and, if those are not available, fall back to
72 | `~/.aws/credentials`.
73 |
74 | Example:
75 |
76 | ```python
77 | ddt.read_deltalake(catalog="glue", database_name="science", table_name="physics")
78 | ```
79 |
80 | ### Accessing Unity catalog
81 |
82 | `dask-deltatable` can connect to Unity Catalog to read a Delta table.
83 | The method will look for the `DATABRICKS_HOST` and `DATABRICKS_TOKEN` environment
84 | variables, or for `kwargs` with the same names in lowercase.
85 |
86 | Example:
87 |
88 | ```python
89 | ddt.read_unity_catalog(
90 |     catalog_name="projects",
91 |     schema_name="science",
92 |     table_name="physics"
93 | )
94 | ```
95 |
96 | ### Writing to Delta Lake
97 |
98 | To write a Dask dataframe to Delta Lake, use the `to_deltalake` method.
99 |
100 | ```python
101 | import dask.dataframe as dd
102 | import dask_deltatable as ddt
103 |
104 | df = dd.read_csv("s3://bucket_name/data.csv")
105 | # do some processing on the dataframe...
106 | ddt.to_deltalake("s3://bucket_name/delta_path", df)
107 | ```
108 |
109 | Writing to Delta Lake is still in development, so be aware that some features
110 | may not work.
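The feature list above mentions row and partition filters but the README does not show them. Below is a minimal sketch based on the `filter=` and `columns=` keywords exercised in `tests/test_core.py`; the table path and column names (`count`, `col1`, `temperature`) are placeholders borrowed from the test tables, not a prescribed schema.

```python
import dask_deltatable as ddt

# Row filter: only rows matching the predicate are read. The filter syntax
# uses pyarrow-style (column, operator, value) tuples, as in the test suite.
df = ddt.read_deltalake("delta_path", filter=[("count", ">", 30)])

# Partition filter: predicates on partition columns prune whole partitions
# before any parquet file is opened. An OR of predicates is expressed as a
# nested list of filter lists.
df = ddt.read_deltalake(
    "delta_path", filter=[[("col1", "==", 1)], [("col1", "==", 2)]]
)

# Column projection, analogous to dask.dataframe.read_parquet(columns=...).
df = ddt.read_deltalake("delta_path", columns=["count", "temperature"])
```

Filters that mix partition and non-partition columns still work: the partition predicates are used for pruning, and the remaining predicates are applied row-wise when the parquet files are read.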
111 | -------------------------------------------------------------------------------- /tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | distributed = pytest.importorskip("distributed") 6 | 7 | import os # noqa: E402 8 | import sys # noqa: E402 9 | 10 | import pyarrow as pa # noqa: E402 11 | import pyarrow.dataset as pa_ds # noqa: E402 12 | import pyarrow.parquet as pq # noqa: E402 13 | from dask.datasets import timeseries # noqa: E402 14 | from distributed.utils_test import cleanup # noqa F401 15 | from distributed.utils_test import ( # noqa F401 16 | client, 17 | cluster, 18 | cluster_fixture, 19 | gen_cluster, 20 | loop, 21 | loop_in_thread, 22 | popen, 23 | varying, 24 | ) 25 | 26 | import dask_deltatable as ddt # noqa: E402 27 | 28 | pytestmark = pytest.mark.skipif( 29 | sys.platform == "win32", 30 | reason=( 31 | "The teardown of distributed.utils_test.cluster_fixture " 32 | "fails on windows CI currently" 33 | ), 34 | ) 35 | 36 | 37 | def test_write(client, tmpdir): 38 | ddf = timeseries( 39 | start="2023-01-01", 40 | end="2023-01-03", 41 | freq="1H", 42 | partition_freq="1D", 43 | dtypes={"str": object, "float": float, "int": int}, 44 | ).reset_index() 45 | ddt.to_deltalake(f"{tmpdir}", ddf) 46 | 47 | 48 | def test_append(client, tmpdir): 49 | """Ensure that a DeltaTable can be pickled and sent over to a worker for appending.""" 50 | ddf = timeseries( 51 | start="2023-01-01", 52 | end="2023-01-03", 53 | freq="1H", 54 | partition_freq="1D", 55 | dtypes={"str": object, "float": float, "int": int}, 56 | ).reset_index() 57 | ddt.to_deltalake(f"{tmpdir}", ddf) 58 | ddt.to_deltalake(f"{tmpdir}", ddf, mode="append") 59 | 60 | 61 | def test_write_with_options(client, tmpdir): 62 | file_options = dict(compression="gzip") 63 | ddf = timeseries( 64 | start="2023-01-01", 65 | end="2023-01-03", 66 | freq="1H", 67 | partition_freq="1D", 68 | dtypes={"str": object, "float": float, "int": int}, 69 | ).reset_index() 70 | ddt.to_deltalake(f"{tmpdir}", ddf, file_options=file_options) 71 | parquet_filename = [f for f in os.listdir(tmpdir) if f.endswith(".parquet")][0] 72 | parquet_file = pq.ParquetFile(f"{tmpdir}/{parquet_filename}") 73 | assert parquet_file.metadata.row_group(0).column(0).compression == "GZIP" 74 | 75 | 76 | def test_write_with_schema(client, tmpdir): 77 | ddf = timeseries( 78 | start="2023-01-01", 79 | end="2023-01-03", 80 | freq="1H", 81 | partition_freq="1D", 82 | dtypes={"str": object, "float": float, "int": int}, 83 | ).reset_index() 84 | schema = pa.schema( 85 | [ 86 | pa.field("timestamp", pa.timestamp("us")), 87 | pa.field("str", pa.string()), 88 | pa.field("float", pa.float32()), 89 | pa.field("int", pa.int32()), 90 | ] 91 | ) 92 | ddt.to_deltalake(f"{tmpdir}", ddf, schema=schema) 93 | ds = pa_ds.dataset(str(tmpdir)) 94 | assert ds.schema == schema 95 | 96 | 97 | def test_read(client, simple_table): 98 | df = ddt.read_deltalake(simple_table) 99 | assert df.columns.tolist() == ["id", "count", "temperature", "newColumn"] 100 | assert df.compute().shape == (200, 4) 101 | -------------------------------------------------------------------------------- /tests/test_write.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import unittest.mock as mock 5 | 6 | import dask.dataframe as dd 7 | import pandas as pd 8 | import pyarrow as pa 9 | import pytest 10 | from 
dask.dataframe.utils import assert_eq 11 | from dask.datasets import timeseries 12 | from deltalake import DeltaTable 13 | 14 | from dask_deltatable import read_deltalake 15 | from dask_deltatable.write import to_deltalake 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "with_index", 20 | [ 21 | pytest.param( 22 | True, 23 | marks=[ 24 | pytest.mark.xfail( 25 | reason="TS index is always ns resolution but delta can only handle us" 26 | ) 27 | ], 28 | ), 29 | False, 30 | ], 31 | ) 32 | @pytest.mark.parametrize("freq,partition_freq", [("1H", "1D"), ("1H", "1w")]) 33 | def test_roundtrip(tmpdir, with_index, freq, partition_freq): 34 | dtypes = { 35 | "str": object, 36 | # FIXME: Categorical data does not work 37 | # "category": "category", 38 | "float": float, 39 | "int": int, 40 | } 41 | tmpdir = str(tmpdir) 42 | ddf = timeseries( 43 | start="2023-01-01", 44 | end="2023-01-15", 45 | freq=freq, 46 | partition_freq=partition_freq, 47 | dtypes=dtypes, 48 | ) 49 | 50 | ddf = ddf.reset_index() 51 | if with_index: 52 | ddf = ddf.set_index("timestamp") 53 | 54 | out = to_deltalake(tmpdir, ddf, compute=False) 55 | assert not os.listdir(tmpdir) 56 | out.compute() 57 | assert len(os.listdir(tmpdir)) > 0 58 | 59 | ddf_read = read_deltalake(tmpdir) 60 | ddf_dask = dd.read_parquet(tmpdir) 61 | 62 | assert ddf.npartitions == ddf_read.npartitions 63 | # By default, arrow reads with ns resolution 64 | assert_eq(ddf_read, ddf_dask) 65 | 66 | 67 | @mock.patch("dask_deltatable.utils.maybe_set_aws_credentials") 68 | def test_writer_check_aws_credentials(maybe_set_aws_credentials, tmpdir): 69 | # The full functionality of maybe_set_aws_credentials tests in test_utils 70 | # we only need to ensure it's called here when writing with a str path 71 | maybe_set_aws_credentials.return_value = dict() 72 | 73 | df = pd.DataFrame({"col1": range(10)}) 74 | ddf = dd.from_pandas(df, npartitions=2) 75 | to_deltalake(str(tmpdir), ddf) 76 | maybe_set_aws_credentials.assert_called() 77 | 78 | 79 | @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) 80 | def test_datetime(tmpdir, unit): 81 | """Ensure we can write datetime with different resolutions, 82 | at least one-way only""" 83 | tmpdir = str(tmpdir) 84 | ts = pd.date_range("2023-01-01", periods=10, freq="1D", unit=unit) 85 | df = pd.DataFrame({"ts": pd.Series(ts)}) 86 | ddf = dd.from_pandas(df, npartitions=2) 87 | to_deltalake(tmpdir, ddf) 88 | ddf_read = read_deltalake(tmpdir) 89 | ddf_dask = dd.read_parquet(tmpdir) 90 | assert_eq(ddf_read, ddf_dask, check_index=False) 91 | 92 | 93 | def test_custom_metadata(tmpdir): 94 | tmpdir = str(tmpdir) 95 | df = pd.DataFrame({"a": [1, 2, 3, 4]}) 96 | ddf = dd.from_pandas(df, npartitions=2) 97 | to_deltalake(tmpdir, ddf, custom_metadata={"foo": "bar"}) 98 | dt = DeltaTable(tmpdir) 99 | assert "foo" in dt.history()[-1] 100 | assert dt.history()[-1]["foo"] == "bar" 101 | 102 | 103 | def test_append_with_schema(tmpdir): 104 | """Ensure we can append to a table with a schema""" 105 | tmpdir = str(tmpdir) 106 | df = pd.DataFrame({"a": [1, 2, 3, 4]}) 107 | ddf = dd.from_pandas(df, npartitions=2) 108 | schema = pa.Schema.from_pandas(df) 109 | to_deltalake(tmpdir, ddf, schema=schema) 110 | to_deltalake(tmpdir, ddf, schema=schema, mode="append") 111 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | import unittest.mock as mock 5 | 6 | 
import pytest 7 | 8 | from dask_deltatable.utils import ( 9 | get_bucket_region, 10 | get_partition_filters, 11 | maybe_set_aws_credentials, 12 | ) 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "cols,filters,expected", 17 | [ 18 | [[], None, None], 19 | [[], [("part", ">", "a")], None], 20 | [["part"], [("part", ">", "a"), ("x", "==", 1)], [[("part", ">", "a")]]], 21 | [["part"], [[("part", ">", "a")], [("x", "==", 1)]], None], 22 | [ 23 | ["m", "d"], 24 | [("m", ">", 5), ("d", "=", 1), ("x", "==", "a")], 25 | [[("m", ">", 5), ("d", "=", 1)]], 26 | ], 27 | [ 28 | ["m", "d"], 29 | [[("m", ">", 5)], [("d", "=", 1)], [("x", "==", "a")]], 30 | None, 31 | ], 32 | ], 33 | ) 34 | def test_partition_filters(cols, filters, expected): 35 | res = get_partition_filters(cols, filters) 36 | assert res == expected 37 | if isinstance(filters, list): 38 | # make sure it works with additional level of wrapping 39 | res = get_partition_filters(cols, filters) 40 | assert res == expected 41 | 42 | 43 | @mock.patch("dask_deltatable.utils.get_bucket_region") 44 | @pytest.mark.parametrize( 45 | "options", 46 | ( 47 | None, 48 | dict(), 49 | dict(AWS_ACCESS_KEY_ID="foo", AWS_SECRET_ACCESS_KEY="bar"), 50 | dict(access_key="foo", secret_key="bar"), 51 | ), 52 | ) 53 | @pytest.mark.parametrize("path", ("s3://path", "/another/path", pathlib.Path("."))) 54 | def test_maybe_set_aws_credentials( 55 | mocked_get_bucket_region, 56 | options, 57 | path, 58 | ): 59 | pytest.importorskip("boto3") 60 | 61 | mocked_get_bucket_region.return_value = "foo-region" 62 | 63 | mock_creds = mock.MagicMock() 64 | type(mock_creds).token = mock.PropertyMock(return_value="token") 65 | type(mock_creds).access_key = mock.PropertyMock(return_value="access-key") 66 | type(mock_creds).secret_key = mock.PropertyMock(return_value="secret-key") 67 | 68 | def mock_get_credentials(): 69 | return mock_creds 70 | 71 | with mock.patch("boto3.session.Session") as mocked_session: 72 | session = mocked_session.return_value 73 | session.get_credentials.side_effect = mock_get_credentials 74 | 75 | opts = maybe_set_aws_credentials(path, options) 76 | 77 | if options and not any(k in options for k in ("AWS_ACCESS_KEY_ID", "access_key")): 78 | assert opts["AWS_ACCESS_KEY_ID"] == "access-key" 79 | assert opts["AWS_SECRET_ACCESS_KEY"] == "secret-key" 80 | assert opts["AWS_SESSION_TOKEN"] == "token" 81 | assert opts["AWS_REGION"] == "foo-region" 82 | 83 | assert opts["access_key"] == "access-key" 84 | assert opts["secret_key"] == "secret-key" 85 | assert opts["token"] == "token" 86 | assert opts["region"] == "foo-region" 87 | 88 | # Did not alter input options if credentials were supplied by user 89 | elif options: 90 | assert options == opts 91 | 92 | 93 | @pytest.mark.parametrize("location", (None, "region-foo")) 94 | @pytest.mark.parametrize( 95 | "path,bucket", 96 | (("s3://foo/bar", "foo"), ("s3://fizzbuzz", "fizzbuzz"), ("/not/s3", None)), 97 | ) 98 | def test_get_bucket_region(location, path, bucket): 99 | pytest.importorskip("boto3") 100 | 101 | with mock.patch("boto3.client") as mock_client: 102 | mock_client = mock_client.return_value 103 | mock_client.get_bucket_location.return_value = {"LocationConstraint": location} 104 | 105 | if not path.startswith("s3://"): 106 | with pytest.raises(ValueError, match="is not an S3 path"): 107 | get_bucket_region(path) 108 | return 109 | 110 | region = get_bucket_region(path) 111 | 112 | # AWS returns None if bucket located in us-east-1... 
113 | location = location if location else "us-east-1" 114 | assert region == location 115 | 116 | mock_client.get_bucket_location.assert_has_calls([mock.call(Bucket=bucket)]) 117 | -------------------------------------------------------------------------------- /tests/test_acceptance.py: -------------------------------------------------------------------------------- 1 | """Delta Acceptance Testing (DAT) 2 | 3 | https://github.com/delta-incubator/dat 4 | 5 | The DAT project provides test cases to verify different implementations of Delta Lake all behave 6 | consistently. The expected behavior is described in the Delta Lake Protocol. 7 | 8 | The tests cases are packaged into releases, which can be downloaded into CI jobs for automatic 9 | testing. The test cases in this repo are represented using a standard file structure, so they 10 | don't require any particular dependency or programming language. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import os 16 | import shutil 17 | import unittest.mock as mock 18 | from urllib.request import urlretrieve 19 | 20 | import dask.dataframe as dd 21 | import pytest 22 | from dask.dataframe.utils import assert_eq 23 | 24 | import dask_deltatable as ddt 25 | 26 | DATA_VERSION = "0.0.2" 27 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 28 | DATA_DIR = os.path.join(ROOT_DIR, "out", "reader_tests", "generated") 29 | 30 | 31 | @pytest.fixture(autouse=True, scope="session") 32 | def download_data(): 33 | """Download the data for the tests.""" 34 | if not os.path.exists(DATA_DIR): 35 | filename = f"deltalake-dat-v{DATA_VERSION}.tar.gz" 36 | dest_filename = os.path.join(ROOT_DIR, filename) 37 | urlretrieve( 38 | f"https://github.com/delta-incubator/dat/releases/download/v{DATA_VERSION}/{filename}", 39 | dest_filename, 40 | ) 41 | shutil.unpack_archive(dest_filename, ROOT_DIR) 42 | os.remove(dest_filename) 43 | assert os.path.exists(DATA_DIR) 44 | 45 | 46 | @mock.patch("dask_deltatable.utils.maybe_set_aws_credentials") 47 | def test_reader_check_aws_credentials(maybe_set_aws_credentials): 48 | # The full functionality of maybe_set_aws_credentials tests in test_utils 49 | # we only need to ensure it's called here when reading with a str path 50 | maybe_set_aws_credentials.return_value = dict() 51 | ddt.read_deltalake(f"{DATA_DIR}/all_primitive_types/delta") 52 | maybe_set_aws_credentials.assert_called() 53 | 54 | 55 | def test_reader_all_primitive_types(): 56 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/all_primitive_types/delta") 57 | expected_ddf = dd.read_parquet( 58 | f"{DATA_DIR}/all_primitive_types/expected/latest/table_content/*parquet" 59 | ) 60 | # Dask and delta go through different parquet parsers which read the 61 | # timestamp differently. This is likely a bug in arrow but the delta result 62 | # is "more correct". 
63 | expected_ddf["timestamp"] = expected_ddf["timestamp"].astype("datetime64[us]") 64 | expected_ddf["timestamp"] = expected_ddf["timestamp"].dt.tz_localize("UTC") 65 | assert_eq(actual_ddf, expected_ddf) 66 | 67 | 68 | @pytest.mark.parametrize("version,subdir", [(None, "latest"), (0, "v0"), (1, "v1")]) 69 | def test_reader_basic_append(version, subdir): 70 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/basic_append/delta", version=version) 71 | expected_ddf = dd.read_parquet( 72 | f"{DATA_DIR}/basic_append/expected/{subdir}/table_content/*parquet" 73 | ) 74 | assert_eq(actual_ddf, expected_ddf, check_index=False) 75 | 76 | 77 | @pytest.mark.parametrize("version,subdir", [(None, "latest"), (0, "v0"), (1, "v1")]) 78 | def test_reader_basic_partitioned(version, subdir): 79 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/basic_partitioned/delta") 80 | expected_ddf = dd.read_parquet( 81 | f"{DATA_DIR}/basic_partitioned/expected/latest/table_content/*parquet" 82 | ) 83 | assert_eq(actual_ddf, expected_ddf, check_index=False) 84 | 85 | 86 | @pytest.mark.xfail(reason="https://github.com/delta-io/delta-rs/issues/1533") 87 | @pytest.mark.parametrize( 88 | "version,subdir", [(None, "latest"), (0, "v0"), (1, "v1"), (2, "v2")] 89 | ) 90 | def test_reader_multi_partitioned(version, subdir): 91 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/multi_partitioned/delta") 92 | expected_ddf = dd.read_parquet( 93 | f"{DATA_DIR}/multi_partitioned/expected/{subdir}/table_content/*parquet" 94 | ) 95 | assert_eq(actual_ddf, expected_ddf, check_index=False) 96 | 97 | 98 | @pytest.mark.xfail(reason="https://github.com/delta-io/delta-rs/issues/1533") 99 | def test_reader_multi_partitioned_2(): 100 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/multi_partitioned_2/delta") 101 | expected_ddf = dd.read_parquet( 102 | f"{DATA_DIR}/multi_partitioned_2/expected/latest/table_content/*parquet" 103 | ) 104 | assert_eq(actual_ddf, expected_ddf) 105 | 106 | 107 | def test_reader_nested_types(): 108 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/nested_types/delta") 109 | expected_ddf = dd.read_parquet( 110 | f"{DATA_DIR}/nested_types/expected/latest/table_content/*parquet" 111 | ) 112 | assert_eq(actual_ddf, expected_ddf) 113 | 114 | 115 | def test_reader_no_replay(): 116 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/no_replay/delta") 117 | expected_ddf = dd.read_parquet( 118 | f"{DATA_DIR}/no_replay/expected/latest/table_content/*parquet" 119 | ) 120 | assert_eq(actual_ddf, expected_ddf) 121 | 122 | 123 | def test_reader_no_stats(): 124 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/no_stats/delta") 125 | expected_ddf = dd.read_parquet( 126 | f"{DATA_DIR}/no_stats/expected/latest/table_content/*parquet" 127 | ) 128 | assert_eq(actual_ddf, expected_ddf) 129 | 130 | 131 | def test_reader_stats_as_structs(): 132 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/stats_as_struct/delta") 133 | expected_ddf = dd.read_parquet( 134 | f"{DATA_DIR}/stats_as_struct/expected/latest/table_content/*parquet" 135 | ) 136 | assert_eq(actual_ddf, expected_ddf) 137 | 138 | 139 | def test_reader_with_checkpoint(): 140 | actual_ddf = ddt.read_deltalake(f"{DATA_DIR}/with_checkpoint/delta") 141 | expected_ddf = dd.read_parquet( 142 | f"{DATA_DIR}/with_checkpoint/expected/latest/table_content/*parquet" 143 | ) 144 | assert_eq(actual_ddf, expected_ddf) 145 | 146 | 147 | @pytest.mark.parametrize("version,subdir", [(None, "latest"), (1, "v1")]) 148 | def test_reader_with_schema_change(version, subdir): 149 | actual_ddf = 
ddt.read_deltalake(f"{DATA_DIR}/with_schema_change/delta") 150 | expected_ddf = dd.read_parquet( 151 | f"{DATA_DIR}/with_schema_change/expected/{subdir}/table_content/*parquet" 152 | ) 153 | assert_eq(actual_ddf, expected_ddf) 154 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import glob 4 | import os 5 | from unittest.mock import MagicMock, patch 6 | 7 | import pandas as pd 8 | import pyarrow as pa 9 | import pyarrow.parquet as pq 10 | import pytest 11 | from deltalake import DeltaTable 12 | 13 | import dask_deltatable as ddt 14 | 15 | 16 | def test_read_delta(simple_table): 17 | df = ddt.read_deltalake(simple_table) 18 | 19 | assert df.columns.tolist() == ["id", "count", "temperature", "newColumn"] 20 | assert df.compute().shape == (200, 4) 21 | 22 | 23 | def test_read_delta_types_mapper(simple_table): 24 | """Provide a custom types mapper""" 25 | 26 | def types_mapper(pyarrow_dtype): 27 | if pyarrow_dtype == pa.int64(): 28 | return pd.Int32Dtype() 29 | 30 | df = ddt.read_deltalake( 31 | simple_table, pyarrow_to_pandas={"types_mapper": types_mapper} 32 | ) 33 | assert df.dtypes["id"] == "Int32" 34 | assert df.dtypes["count"] == "Int32" 35 | res = df.compute() 36 | assert res.dtypes["id"] == "Int32" 37 | assert res.dtypes["count"] == "Int32" 38 | 39 | 40 | def test_read_delta_categories(simple_table): 41 | """Provide a list of categories""" 42 | df = ddt.read_deltalake(simple_table, pyarrow_to_pandas={"categories": ["id"]}) 43 | assert df.dtypes["id"] == "category" 44 | res = df.compute() 45 | assert res.dtypes["id"] == "category" 46 | 47 | 48 | def test_read_delta_with_different_versions(simple_table): 49 | print(simple_table) 50 | df = ddt.read_deltalake(simple_table, version=0) 51 | assert df.compute().shape == (100, 3) 52 | 53 | df = ddt.read_deltalake(simple_table, version=1) 54 | assert df.compute().shape == (200, 4) 55 | 56 | 57 | def test_row_filter(simple_table): 58 | # row filter 59 | df = ddt.read_deltalake( 60 | simple_table, 61 | version=0, 62 | filter=[("count", ">", 30)], 63 | ) 64 | assert df.compute().shape == (61, 3) 65 | 66 | 67 | def test_different_columns(simple_table): 68 | df = ddt.read_deltalake(simple_table, columns=["count", "temperature"]) 69 | assert df.columns.tolist() == ["count", "temperature"] 70 | 71 | 72 | def test_different_schema(simple_table): 73 | # testing schema evolution 74 | 75 | df = ddt.read_deltalake(simple_table, version=0) 76 | assert df.columns.tolist() == ["id", "count", "temperature"] 77 | 78 | df = ddt.read_deltalake(simple_table, version=1) 79 | assert df.columns.tolist() == ["id", "count", "temperature", "newColumn"] 80 | 81 | 82 | @pytest.mark.parametrize( 83 | "kwargs,shape", 84 | [ 85 | (dict(version=0, filter=[("col1", "==", 1)]), (21, 3)), 86 | (dict(filter=[("col1", "==", 1), ("col2", "<", 0.5)]), (11, 4)), 87 | (dict(filter=[[("col1", "==", 1)], [("col1", "==", 2)]]), (39, 4)), 88 | (dict(filter=[("col1", "!=", 1), ("id", "<", 5)]), (6, 4)), 89 | (dict(filter=[[("col1", "!=", 1)], [("id", "<", 5)]]), (99, 4)), 90 | ], 91 | ) 92 | def test_partition_filter(partition_table, kwargs, shape): 93 | """partition filter""" 94 | df = ddt.read_deltalake(partition_table, **kwargs) 95 | filter_expr = pq.filters_to_expression(kwargs["filter"]) 96 | dt = DeltaTable(partition_table, version=kwargs.get("version")) 97 | expected_partitions = len( 98 | 
list(dt.to_pyarrow_dataset().get_fragments(filter=filter_expr)) 99 | ) 100 | assert df.npartitions == expected_partitions 101 | assert df.compute().shape == shape 102 | 103 | 104 | def test_empty(empty_table1, empty_table2): 105 | df = ddt.read_deltalake(empty_table1, version=4) 106 | assert df.compute().shape == (0, 2) 107 | 108 | df = ddt.read_deltalake(empty_table1, version=0) 109 | assert df.compute().shape == (5, 2) 110 | 111 | df = ddt.read_deltalake(empty_table2) 112 | assert df.compute().shape == (0, 4) 113 | 114 | df = ddt.read_deltalake(empty_table2, columns=["some_struct", "value"]) 115 | assert df.compute().shape == (0, 2) 116 | 117 | df = ddt.read_deltalake(empty_table2, columns=[]) 118 | assert df.compute().shape == (0, 0) 119 | 120 | 121 | def test_checkpoint(checkpoint_table): 122 | df = ddt.read_deltalake(checkpoint_table, checkpoint=0, version=4) 123 | assert df.compute().shape[0] == 25 124 | 125 | df = ddt.read_deltalake(checkpoint_table, checkpoint=10, version=12) 126 | assert df.compute().shape[0] == 65 127 | 128 | df = ddt.read_deltalake(checkpoint_table, checkpoint=20, version=22) 129 | assert df.compute().shape[0] == 115 130 | 131 | with pytest.raises(Exception): 132 | # Parquet file with the given checkpoint 30 does not exists: 133 | # File {checkpoint_path} not found" 134 | _ = ddt.read_deltalake(checkpoint_table, checkpoint=30, version=33) 135 | 136 | 137 | def test_out_of_version_error(simple_table): 138 | # Cannot time travel Delta table to version 4 , Available versions for given 139 | # checkpoint 0 are [0,1] 140 | with pytest.raises(Exception): 141 | _ = ddt.read_deltalake(simple_table, version=4) 142 | 143 | 144 | def test_load_with_datetime(simple_table2): 145 | log_dir = f"{simple_table2}_delta_log" 146 | log_mtime_pair = [ 147 | ("00000000000000000000.json", 1588398451.0), 148 | ("00000000000000000001.json", 1588484851.0), 149 | ("00000000000000000002.json", 1588571251.0), 150 | ("00000000000000000003.json", 1588657651.0), 151 | ("00000000000000000004.json", 1588744051.0), 152 | ] 153 | for file_name, dt_epoch in log_mtime_pair: 154 | file_path = os.path.join(log_dir, file_name) 155 | os.utime(file_path, (dt_epoch, dt_epoch)) 156 | 157 | expected = ddt.read_deltalake(simple_table2, version=0).compute() 158 | result = ddt.read_deltalake( 159 | simple_table2, datetime="2020-05-01T00:47:31-07:00" 160 | ).compute() 161 | assert expected.equals(result) 162 | # assert_frame_equal(expected,result) 163 | 164 | expected = ddt.read_deltalake(simple_table2, version=1).compute() 165 | result = ddt.read_deltalake( 166 | simple_table2, datetime="2020-05-02T22:47:31-07:00" 167 | ).compute() 168 | assert expected.equals(result) 169 | 170 | expected = ddt.read_deltalake(simple_table2, version=4).compute() 171 | result = ddt.read_deltalake( 172 | simple_table2, datetime="2020-05-25T22:47:31-07:00" 173 | ).compute() 174 | assert expected.equals(result) 175 | 176 | 177 | def test_read_delta_with_error(): 178 | with pytest.raises(ValueError) as exc_info: 179 | ddt.read_deltalake() 180 | assert str(exc_info.value) == "Please Provide Delta Table path" 181 | 182 | 183 | def test_catalog_with_error(): 184 | with pytest.raises(ValueError) as exc_info: 185 | ddt.read_deltalake(catalog="glue") 186 | assert ( 187 | str(exc_info.value) 188 | == "Since Catalog was provided, please provide Database and table name" 189 | ) 190 | 191 | 192 | @pytest.mark.skip( 193 | reason="DeltaTable.from_data_catalog was removed in delta-rs v0.15.0. 
" 194 | "Skip until _read_from_catalog is adapted to this change." 195 | ) 196 | def test_catalog(simple_table): 197 | dt = MagicMock() 198 | 199 | def delta_mock(**kwargs): 200 | files = glob.glob(simple_table + "/*parquet") 201 | dt.file_uris = MagicMock(return_value=files) 202 | return dt 203 | 204 | with patch("deltalake.DeltaTable.from_data_catalog", side_effect=delta_mock): 205 | os.environ["AWS_ACCESS_KEY_ID"] = "apple" 206 | os.environ["AWS_SECRET_ACCESS_KEY"] = "greatsecret" 207 | df = ddt.read_deltalake( 208 | catalog="glue", database_name="stores", table_name="orders" 209 | ) 210 | assert df.compute().shape == (200, 3) 211 | -------------------------------------------------------------------------------- /dask_deltatable/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from collections.abc import Iterator, Mapping 5 | from datetime import date, datetime 6 | from decimal import Decimal 7 | from math import inf 8 | from typing import Any, cast 9 | from urllib.parse import unquote 10 | 11 | from deltalake import DeltaTable 12 | 13 | from .types import Filter, Filters 14 | 15 | 16 | def get_bucket_region(path: str): 17 | import boto3 18 | 19 | if not path.startswith("s3://"): 20 | raise ValueError(f"'{path}' is not an S3 path") 21 | bucket = path.replace("s3://", "").split("/")[0] 22 | resp = boto3.client("s3").get_bucket_location(Bucket=bucket) 23 | # Buckets in region 'us-east-1' results in None, b/c why not. 24 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/get_bucket_location.html#S3.Client.get_bucket_location 25 | return resp["LocationConstraint"] or "us-east-1" 26 | 27 | 28 | def maybe_set_aws_credentials(path: Any, options: dict[str, Any]) -> dict[str, Any]: 29 | """ 30 | Maybe set AWS credentials into ``options`` if existing AWS specific keys 31 | not found in it and path is s3:// format. 32 | 33 | Parameters 34 | ---------- 35 | path : Any 36 | If it's a string, we'll check if it starts with 's3://' then determine bucket 37 | region if the AWS credentials should be set. 38 | options : dict[str, Any] 39 | Options, any kwargs to be supplied to things like S3FileSystem or similar 40 | that may accept AWS credentials set. A copy is made and returned if modified. 41 | 42 | Returns 43 | ------- 44 | dict 45 | Either the original options if not modified, or a copied and updated options 46 | with AWS credentials inserted. 47 | """ 48 | 49 | is_s3_path = getattr(path, "startswith", lambda _: False)("s3://") 50 | if not is_s3_path: 51 | return options 52 | 53 | # Avoid overwriting already provided credentials 54 | keys = ("AWS_ACCESS_KEY", "AWS_SECRET_ACCESS_KEY", "access_key", "secret_key") 55 | if not any(k in (options or ()) for k in keys): 56 | # defers installing boto3 upfront, xref _read_from_catalog 57 | import boto3 58 | 59 | session = boto3.session.Session() 60 | credentials = session.get_credentials() 61 | if credentials is None: 62 | return options 63 | region = get_bucket_region(path) 64 | 65 | options = (options or {}).copy() 66 | options.update( 67 | # Capitalized is used in delta specific API and lowercase is for S3FileSystem 68 | dict( 69 | # TODO: w/o this, we need to configure a LockClient which seems to require dynamodb. 
70 | AWS_S3_ALLOW_UNSAFE_RENAME="true", 71 | AWS_SECRET_ACCESS_KEY=credentials.secret_key, 72 | AWS_ACCESS_KEY_ID=credentials.access_key, 73 | AWS_SESSION_TOKEN=credentials.token, 74 | AWS_REGION=region, 75 | secret_key=credentials.secret_key, 76 | access_key=credentials.access_key, 77 | token=credentials.token, 78 | region=region, 79 | ) 80 | ) 81 | return options 82 | 83 | 84 | def get_partition_filters( 85 | partition_columns: list[str], filters: Filters 86 | ) -> list[list[Filter]] | None: 87 | """Retrieve only filters on partition columns. If there are any row filters in the outer 88 | list (the OR list), return None, because we have to search through all partitions to apply 89 | row filters 90 | 91 | Parameters 92 | ---------- 93 | partition_columns : List[str] 94 | List of partitioned columns 95 | 96 | filters : List[Tuple[str, str, Any]] | List[List[Tuple[str, str, Any]]] 97 | List of filters. Examples: 98 | 1) (x == a) and (y == 3): 99 | [("x", "==", "a"), ("y", "==", 3)] 100 | 2) (x == a) or (y == 3) 101 | [[("x", "==", "a")], [("y", "==", 3)]] 102 | 103 | Returns 104 | ------- 105 | List[List[Tuple[str, str, Any]]] | None 106 | List of partition filters, None if we can't apply a filter on partitions because 107 | row filters are present 108 | """ 109 | if filters is None or len(filters) == 0: 110 | return None 111 | 112 | if isinstance(filters[0][0], str): 113 | filters = cast(list[list[Filter]], [filters]) 114 | filters = cast(list[list[Filter]], filters) 115 | 116 | allowed_ops = { 117 | "=": "=", 118 | "==": "=", 119 | "!=": "!=", 120 | "!==": "!=", 121 | "in": "in", 122 | "not in": "not in", 123 | ">": ">", 124 | "<": "<", 125 | ">=": ">=", 126 | "<=": "<=", 127 | } 128 | 129 | expressions = [] 130 | for disjunction in filters: 131 | inner_expressions = [] 132 | for col, op, val in disjunction: 133 | if col in partition_columns: 134 | normalized_op = allowed_ops[op] 135 | inner_expressions.append((col, normalized_op, val)) 136 | if inner_expressions: 137 | expressions.append(inner_expressions) 138 | else: 139 | return None 140 | 141 | return expressions if expressions else None 142 | 143 | 144 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 145 | def get_partitions_from_path(path: str) -> tuple[str, dict[str, str | None]]: 146 | if path[0] == "/": 147 | path = path[1:] 148 | parts = path.split("/") 149 | parts.pop() # remove filename 150 | out: dict[str, str | None] = {} 151 | for part in parts: 152 | if part == "": 153 | continue 154 | key, value = part.split("=", maxsplit=1) 155 | if value == "__HIVE_DEFAULT_PARTITION__": 156 | out[key] = None 157 | else: 158 | out[key] = unquote(value) 159 | return path, out 160 | 161 | 162 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 163 | def get_file_stats_from_metadata( 164 | metadata: Any, 165 | num_indexed_cols: int, 166 | columns_to_collect_stats: list[str] | None, 167 | ) -> dict[str, int | dict[str, Any]]: 168 | """Get Delta's file stats from PyArrow's Parquet file metadata.""" 169 | stats = { 170 | "numRecords": metadata.num_rows, 171 | "minValues": {}, 172 | "maxValues": {}, 173 | "nullCount": {}, 174 | } 175 | 176 | def iter_groups(metadata: Any) -> Iterator[Any]: 177 | for i in range(metadata.num_row_groups): 178 | if metadata.row_group(i).num_rows > 0: 179 | yield metadata.row_group(i) 180 | 181 | schema_columns = metadata.schema.names 182 | if columns_to_collect_stats is not None: 183 | idx_to_iterate = [] 184 
| for col in columns_to_collect_stats: 185 | try: 186 | idx_to_iterate.append(schema_columns.index(col)) 187 | except ValueError: 188 | pass 189 | elif num_indexed_cols == -1: 190 | idx_to_iterate = list(range(metadata.num_columns)) 191 | elif num_indexed_cols >= 0: 192 | idx_to_iterate = list(range(min(num_indexed_cols, metadata.num_columns))) 193 | else: 194 | raise ValueError("delta.dataSkippingNumIndexedCols valid values are >=-1") 195 | 196 | for column_idx in idx_to_iterate: 197 | name = metadata.row_group(0).column(column_idx).path_in_schema 198 | 199 | # If stats missing, then we can't know aggregate stats 200 | if all( 201 | group.column(column_idx).is_stats_set for group in iter_groups(metadata) 202 | ): 203 | stats["nullCount"][name] = sum( 204 | group.column(column_idx).statistics.null_count 205 | for group in iter_groups(metadata) 206 | ) 207 | 208 | # Min / max may not exist for some column types, or if all values are null 209 | if any( 210 | group.column(column_idx).statistics.has_min_max 211 | for group in iter_groups(metadata) 212 | ): 213 | # Min and Max are recorded in physical type, not logical type 214 | # https://stackoverflow.com/questions/66753485/decoding-parquet-min-max-statistics-for-decimal-type 215 | # TODO: Add logic to decode physical type for DATE, DECIMAL 216 | 217 | minimums = ( 218 | group.column(column_idx).statistics.min 219 | for group in iter_groups(metadata) 220 | ) 221 | # If some row groups have all null values, their min and max will be null too. 222 | min_value = min(minimum for minimum in minimums if minimum is not None) 223 | # Infinity cannot be serialized to JSON, so we skip it. Saying 224 | # min/max is infinity is equivalent to saying it is null, anyways. 225 | if min_value != -inf: 226 | stats["minValues"][name] = min_value 227 | maximums = ( 228 | group.column(column_idx).statistics.max 229 | for group in iter_groups(metadata) 230 | ) 231 | max_value = max(maximum for maximum in maximums if maximum is not None) 232 | if max_value != inf: 233 | stats["maxValues"][name] = max_value 234 | return stats 235 | 236 | 237 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 238 | class DeltaJSONEncoder(json.JSONEncoder): 239 | def default(self, obj: Any) -> Any: 240 | if isinstance(obj, bytes): 241 | return obj.decode("unicode_escape", "backslashreplace") 242 | elif isinstance(obj, date): 243 | return obj.isoformat() 244 | elif isinstance(obj, datetime): 245 | return obj.isoformat() 246 | elif isinstance(obj, Decimal): 247 | return str(obj) 248 | # Let the base class default method raise the TypeError 249 | return json.JSONEncoder.default(self, obj) 250 | 251 | 252 | # Copied from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 253 | def enforce_append_only( 254 | table: DeltaTable | None, 255 | configuration: Mapping[str, str | None] | None, 256 | mode: str, 257 | ) -> None: 258 | """Throw ValueError if table configuration contains delta.appendOnly and mode is not append""" 259 | if table: 260 | configuration = table.metadata().configuration 261 | config_delta_append_only = ( 262 | configuration and configuration.get("delta.appendOnly", "false") == "true" 263 | ) 264 | if config_delta_append_only and mode != "append": 265 | raise ValueError( 266 | "If configuration has delta.appendOnly = 'true', mode must be 'append'." 
267 | f" Mode is currently {mode}" 268 | ) 269 | 270 | 271 | # Inspired from delta-rs v0.25.5 (https://github.com/delta-io/delta-rs/blob/python-v0.25.5/LICENSE.txt) 272 | def get_num_idx_cols_and_stats_columns( 273 | table: DeltaTable | None, configuration: Mapping[str, str | None] | None 274 | ) -> tuple[int, list[str] | None]: 275 | """Get the num_idx_columns and stats_columns from the table configuration in the state 276 | 277 | If table does not exist (only can occur in the first write action) it takes 278 | the configuration that was passed. 279 | """ 280 | if table is not None: 281 | configuration = table.metadata().configuration 282 | if configuration is None: 283 | num_idx_cols = -1 284 | stats_columns = None 285 | else: 286 | # Parse configuration 287 | dataSkippingNumIndexedCols = configuration.get( 288 | "delta.dataSkippingNumIndexedCols", "-1" 289 | ) 290 | num_idx_cols = ( 291 | int(dataSkippingNumIndexedCols) 292 | if dataSkippingNumIndexedCols is not None 293 | else -1 294 | ) 295 | columns = configuration.get("delta.dataSkippingStatsColumns", None) 296 | if columns is not None: 297 | stats_columns = [col.strip() for col in columns.split(",")] 298 | else: 299 | stats_columns = None 300 | return num_idx_cols, stats_columns 301 | -------------------------------------------------------------------------------- /dask_deltatable/_schema.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | """ 4 | Most of this code was taken from 5 | 6 | https://github.com/data-engineering-collective/plateau 7 | 8 | https://github.com/data-engineering-collective/plateau/blob/d4c4522f5a829d43e3368fc82e1568c91fa352f3/plateau/core/common_metadata.py 9 | 10 | and adapted to this project 11 | 12 | under the original license 13 | 14 | MIT License 15 | 16 | Copyright (c) 2022 The plateau contributors. 17 | Copyright (c) 2020-2021 The kartothek contributors. 18 | Copyright (c) 2019 JDA Software, Inc 19 | 20 | Permission is hereby granted, free of charge, to any person obtaining a copy 21 | of this software and associated documentation files (the "Software"), to deal 22 | in the Software without restriction, including without limitation the rights 23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | copies of the Software, and to permit persons to whom the Software is 25 | furnished to do so, subject to the following conditions: 26 | 27 | The above copyright notice and this permission notice shall be included in all 28 | copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | SOFTWARE. 
37 | 38 | """ 39 | import difflib 40 | import json 41 | import logging 42 | import pprint 43 | from collections.abc import Iterable 44 | from copy import deepcopy 45 | 46 | import pandas as pd 47 | import pyarrow as pa 48 | import pyarrow.parquet as pq 49 | 50 | _logger = logging.getLogger() 51 | 52 | 53 | class SchemaWrapper: 54 | def __init__(self, schema: pa.Schema): 55 | self.schema = schema 56 | 57 | def __hash__(self): 58 | # FIXME: pyarrow raises a "cannot hash type dict" error 59 | return hash(_schema2bytes(self.schema)) 60 | 61 | 62 | def pyarrow_to_deltalake(schema: pa.Schema) -> pa.Schema: 63 | """Adjust data types to make schema compatible with Delta Lake dtypes. 64 | Not all Arrow data types are supported by Delta Lake. See also 65 | ``deltalake.schema.delta_arrow_schema_from_pandas``. 66 | 67 | Notes 68 | ----- 69 | We shouldn't need this when https://github.com/delta-io/delta-rs/issues/686 is closed 70 | """ 71 | schema_out = [] 72 | for field in schema: 73 | if isinstance(field.type, pa.TimestampType): 74 | f = pa.field( 75 | name=field.name, 76 | type=pa.timestamp("us"), 77 | nullable=field.nullable, 78 | metadata=field.metadata, 79 | ) 80 | schema_out.append(f) 81 | else: 82 | schema_out.append(field) 83 | return pa.schema(schema_out, metadata=schema.metadata) 84 | 85 | 86 | def _pandas_in_schemas(schemas): 87 | """Check if any schema contains pandas metadata.""" 88 | has_pandas = False 89 | for schema in schemas: 90 | if schema.metadata and b"pandas" in schema.metadata: 91 | has_pandas = True 92 | return has_pandas 93 | 94 | 95 | def _determine_schemas_to_compare( 96 | schemas: Iterable[pa.Schema], ignore_pandas: bool 97 | ) -> tuple[pa.Schema | None, list[tuple[pa.Schema, list[str]]]]: 98 | """Iterate over a list of `pyarrow.Schema` objects and prepares them for 99 | comparison by picking a reference and determining all null columns. 100 | 101 | .. note:: 102 | 103 | If pandas metadata exists, the version stored in the metadata is overwritten with the currently 104 | installed version since we expect to stay backwards compatible 105 | 106 | Returns 107 | ------- 108 | reference: Schema 109 | A reference schema which is picked from the input list. The reference schema is guaranteed 110 | to be a schema having the least number of null columns of all input columns. The set of null 111 | columns is guaranteed to be a true subset of all null columns of all input schemas. If no such 112 | schema can be found, an Exception is raised 113 | list_of_schemas: List[Tuple[Schema, List]] 114 | A list holding pairs of (Schema, null_columns) where the null_columns are all columns which are null and 115 | must be removed before comparing the schemas 116 | """ 117 | has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas 118 | schemas_to_evaluate: list[tuple[pa.Schema, list[str]]] = [] 119 | reference = None 120 | null_cols_in_reference = set() 121 | # Hashing the schemas is a very fast way to reduce the number of schemas to 122 | # actually compare since in most circumstances this reduces to very few 123 | # (which differ in e.g. null columns) 124 | for schema_wrapped in set(map(SchemaWrapper, schemas)): 125 | schema = schema_wrapped.schema 126 | del schema_wrapped 127 | if has_pandas: 128 | metadata = schema.metadata 129 | if metadata is None or b"pandas" not in metadata: 130 | raise ValueError( 131 | "Pandas and non-Pandas schemas are not comparable. " 132 | "Use ignore_pandas=True if you only want to compare " 133 | "on Arrow level." 
134 | ) 135 | pandas_metadata = json.loads(metadata[b"pandas"].decode("utf8")) 136 | 137 | # we don't care about the pandas version, since we assume it's safe 138 | # to read datasets that were written by older or newer versions. 139 | pandas_metadata["pandas_version"] = f"{pd.__version__}" 140 | 141 | metadata_clean = deepcopy(metadata) 142 | metadata_clean[b"pandas"] = _dict_to_binary(pandas_metadata) 143 | current = pa.schema(schema, metadata_clean) 144 | else: 145 | current = schema 146 | 147 | # If a field is null we cannot compare it and must therefore reject it 148 | null_columns = {field.name for field in current if field.type == pa.null()} 149 | 150 | # Determine a valid reference schema. A valid reference schema is considered to be the schema 151 | # of all input schemas with the least empty columns. 152 | # The reference schema ought to be a schema whose empty columns are a true subset for all sets 153 | # of empty columns. This ensures that the actual reference schema is the schema with the most 154 | # information possible. A schema which doesn't fulfil this requirement would weaken the 155 | # comparison and would allow for false positives 156 | 157 | # Trivial case 158 | if reference is None: 159 | reference = current 160 | null_cols_in_reference = null_columns 161 | # The reference has enough information to validate against current schema. 162 | # Append it to the list of schemas to be verified 163 | elif null_cols_in_reference.issubset(null_columns): 164 | schemas_to_evaluate.append((current, list(null_columns))) 165 | # current schema includes all information of reference and more. 166 | # Add reference to schemas_to_evaluate and update reference 167 | elif null_columns.issubset(null_cols_in_reference): 168 | schemas_to_evaluate.append((reference, list(null_cols_in_reference))) 169 | reference = current 170 | null_cols_in_reference = null_columns 171 | # If there is no clear subset available elect the schema with the least null columns as `reference`. 172 | # Iterate over the null columns of `reference` and replace it with a non-null field of the `current` 173 | # schema which recovers the loop invariant (null columns of `reference` is subset of `current`) 174 | else: 175 | if len(null_columns) < len(null_cols_in_reference): 176 | reference, current = current, reference 177 | null_cols_in_reference, null_columns = ( 178 | null_columns, 179 | null_cols_in_reference, 180 | ) 181 | 182 | for col in null_cols_in_reference - null_columns: 183 | # Enrich the information in the reference by grabbing the missing fields 184 | # from the current iteration. This assumes that we only check for global validity and 185 | # isn't relevant where the reference comes from. 
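                # Illustration (hypothetical column names): with null columns {"a", "c"}
                # in the reference and {"b", "c"} in the current schema, neither set is a
                # subset of the other; the loop pulls the non-null "a" field from `current`
                # into `reference`, shrinking its null set to {"c"} and restoring the
                # subset invariant.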
186 | reference = _swap_fields_by_name(reference, current, col) 187 | null_cols_in_reference.remove(col) 188 | schemas_to_evaluate.append((current, list(null_columns))) 189 | 190 | assert (reference is not None) or (not schemas_to_evaluate) 191 | 192 | return reference, schemas_to_evaluate 193 | 194 | 195 | def _swap_fields_by_name(reference, current, field_name): 196 | current_field = current.field(field_name) 197 | reference_index = reference.get_field_index(field_name) 198 | return reference.set(reference_index, current_field) 199 | 200 | 201 | def _strip_columns_from_schema(schema, field_names): 202 | stripped_schema = schema 203 | 204 | for name in field_names: 205 | ix = stripped_schema.get_field_index(name) 206 | if ix >= 0: 207 | stripped_schema = stripped_schema.remove(ix) 208 | else: 209 | # If the returned index is negative, the field doesn't exist in the schema. 210 | # This is most likely an indicator for incompatible schemas and we refuse to strip the schema 211 | # to not obfurscate the validation result 212 | _logger.warning( 213 | "Unexpected field `%s` encountered while trying to strip `null` columns.\n" 214 | "Schema was:\n\n`%s`" % (name, schema) 215 | ) 216 | return schema 217 | return stripped_schema 218 | 219 | 220 | def _schema2bytes(schema: SchemaWrapper) -> bytes: 221 | buf = pa.BufferOutputStream() 222 | pq.write_metadata(schema, buf, coerce_timestamps="us") 223 | return buf.getvalue().to_pybytes() 224 | 225 | 226 | def _remove_diff_header(diff): 227 | diff = list(diff) 228 | for ix, el in enumerate(diff): 229 | # This marks the first actual entry of the diff 230 | # e.g. @@ -1,5 + 2,5 @@ 231 | if el.startswith("@"): 232 | return diff[ix:] 233 | return diff 234 | 235 | 236 | def _diff_schemas(first, second): 237 | # see https://issues.apache.org/jira/browse/ARROW-4176 238 | 239 | first_pyarrow_info = str(first.remove_metadata()) 240 | second_pyarrow_info = str(second.remove_metadata()) 241 | pyarrow_diff = _remove_diff_header( 242 | difflib.unified_diff( 243 | str(first_pyarrow_info).splitlines(), str(second_pyarrow_info).splitlines() 244 | ) 245 | ) 246 | 247 | first_pandas_info = first.pandas_metadata 248 | second_pandas_info = second.pandas_metadata 249 | pandas_meta_diff = _remove_diff_header( 250 | difflib.unified_diff( 251 | pprint.pformat(first_pandas_info).splitlines(), 252 | pprint.pformat(second_pandas_info).splitlines(), 253 | ) 254 | ) 255 | 256 | diff_string = ( 257 | "Arrow schema:\n" 258 | + "\n".join(pyarrow_diff) 259 | + "\n\nPandas_metadata:\n" 260 | + "\n".join(pandas_meta_diff) 261 | ) 262 | 263 | return diff_string 264 | 265 | 266 | def validate_compatible( 267 | schemas: Iterable[pa.Schema], ignore_pandas: bool = False 268 | ) -> pa.Schema: 269 | """Validate that all schemas in a given list are compatible. 270 | 271 | Apart from the pandas version preserved in the schema metadata, schemas must be completely identical. That includes 272 | a perfect match of the whole metadata (except the pandas version) and pyarrow types. 273 | 274 | In the case that all schemas don't contain any pandas metadata, we will check the Arrow 275 | schemas directly for compatibility. 276 | 277 | Parameters 278 | ---------- 279 | schemas: List[Schema] 280 | Schema information from multiple sources, e.g. multiple partitions. List may be empty. 281 | ignore_pandas: bool 282 | Ignore the schema information given by Pandas an always use the Arrow schema. 
283 | 284 | Returns 285 | ------- 286 | schema: SchemaWrapper 287 | The reference schema which was tested against 288 | 289 | Raises 290 | ------ 291 | ValueError 292 | At least two schemas are incompatible. 293 | """ 294 | reference, schemas_to_evaluate = _determine_schemas_to_compare( 295 | schemas, ignore_pandas 296 | ) 297 | 298 | for current, null_columns in schemas_to_evaluate: 299 | # We have schemas so the reference schema should be non-none. 300 | assert reference is not None 301 | # Compare each schema to the reference but ignore the null_cols and the Pandas schema information. 302 | reference_to_compare = _strip_columns_from_schema( 303 | reference, null_columns 304 | ).remove_metadata() 305 | current_to_compare = _strip_columns_from_schema( 306 | current, null_columns 307 | ).remove_metadata() 308 | 309 | def _fmt_origin(origin): 310 | origin = sorted(origin) 311 | # dask cuts of exception messages at 1k chars: 312 | # https://github.com/dask/distributed/blob/6e0c0a6b90b1d3c/distributed/core.py#L964 313 | # therefore, we cut the the maximum length 314 | max_len = 200 315 | inner_msg = ", ".join(origin) 316 | ellipsis = "..." 317 | if len(inner_msg) > max_len + len(ellipsis): 318 | inner_msg = inner_msg[:max_len] + ellipsis 319 | return f"{{{inner_msg}}}" 320 | 321 | if reference_to_compare != current_to_compare: 322 | schema_diff = _diff_schemas(reference, current) 323 | exception_message = """Schema violation 324 | 325 | Origin schema: {origin_schema} 326 | Origin reference: {origin_reference} 327 | 328 | Diff: 329 | {schema_diff} 330 | 331 | Reference schema: 332 | {reference}""".format( 333 | schema_diff=schema_diff, 334 | reference=str(reference), 335 | origin_schema=_fmt_origin(current.origin), 336 | origin_reference=_fmt_origin(reference.origin), 337 | ) 338 | raise ValueError(exception_message) 339 | 340 | # add all origins to result AFTER error checking, otherwise the error message would be pretty misleading due to the 341 | # reference containing all origins. 342 | if reference is None: 343 | return None 344 | else: 345 | return reference 346 | 347 | 348 | def _dict_to_binary(dct): 349 | return json.dumps(dct, sort_keys=True).encode("utf8") 350 | -------------------------------------------------------------------------------- /dask_deltatable/write.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import uuid 5 | from collections.abc import Mapping 6 | from datetime import datetime 7 | from pathlib import Path 8 | from typing import Any, Literal 9 | 10 | import dask 11 | import dask.dataframe as dd 12 | import pyarrow as pa 13 | import pyarrow.dataset as ds 14 | import pyarrow.fs as pa_fs 15 | from dask.core import flatten 16 | from deltalake import CommitProperties, DeltaTable 17 | from deltalake import Schema as DeltaSchema 18 | from deltalake.exceptions import DeltaProtocolError 19 | from deltalake.fs import DeltaStorageHandler 20 | from deltalake.table import MAX_SUPPORTED_PYARROW_WRITER_VERSION 21 | from deltalake.transaction import AddAction, create_table_with_add_actions 22 | from deltalake.writer.writer import try_get_table_and_table_uri 23 | from toolz.itertoolz import pluck 24 | 25 | from . 
import utils 26 | from ._schema import pyarrow_to_deltalake, validate_compatible 27 | 28 | PYARROW_MAJOR_VERSION = int(pa.__version__.split(".")[0]) 29 | 30 | 31 | def to_deltalake( 32 | table_or_uri: str | Path | DeltaTable, 33 | df: dd.DataFrame, 34 | *, 35 | schema: pa.Schema | None = None, 36 | partition_by: list[str] | str | None = None, 37 | filesystem: pa_fs.FileSystem | None = None, 38 | mode: Literal["error", "append", "overwrite", "ignore"] = "error", 39 | file_options: Mapping[str, Any] | None = None, 40 | max_partitions: int | None = None, 41 | max_open_files: int = 1024, 42 | max_rows_per_file: int = 10 * 1024 * 1024, 43 | min_rows_per_group: int = 64 * 1024, 44 | max_rows_per_group: int = 128 * 1024, 45 | name: str | None = None, 46 | description: str | None = None, 47 | configuration: Mapping[str, str | None] | None = None, 48 | overwrite_schema: bool = False, 49 | storage_options: dict[str, str] | None = None, 50 | partition_filters: list[tuple[str, str, Any]] | None = None, 51 | compute: bool = True, 52 | custom_metadata: dict[str, str] | None = None, 53 | ): 54 | """Write a given dask.DataFrame to a delta table. The returned value is a Dask Scalar, 55 | and the writing operation is only triggered when calling ``.compute()`` 56 | 57 | Parameters 58 | ---------- 59 | table_or_uri: str | Path | DeltaTable 60 | URI of a table or a DeltaTable object. 61 | df: dd.DataFrame 62 | Data to write 63 | schema : pa.Schema | None. Default None 64 | Optional schema to write. 65 | partition_by : list[str] | str | None. Default None 66 | List of columns to partition the table by. Only required 67 | when creating a new table 68 | filesystem : pa_fs.FileSystem | None. Default None 69 | Optional filesystem to pass to PyArrow. If not provided will 70 | be inferred from uri. The file system has to be rooted in the table root. 71 | Use the pyarrow.fs.SubTreeFileSystem, to adopt the root of pyarrow file systems. 72 | mode : Literal["error", "append", "overwrite", "ignore"]. Default "error" 73 | How to handle existing data. Default is to error if table already exists. 74 | If 'append', will add new data. 75 | If 'overwrite', will replace table with new data. 76 | If 'ignore', will not write anything if table already exists. 77 | file_options : Mapping[str, Any] | None. Default None 78 | Optional dict of options that can be used to initialize ParquetFileWriteOptions. 79 | Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx 80 | for the list of available options 81 | max_partitions : int | None. Default None 82 | The maximum number of partitions that will be used. 83 | max_open_files : int. Default 1024 84 | Limits the maximum number of 85 | files that can be left open while writing. If an attempt is made to open 86 | too many files then the least recently used file will be closed. 87 | If this setting is set too low you may end up fragmenting your 88 | data into many small files. 89 | max_rows_per_file : int. Default 10 * 1024 * 1024 90 | Maximum number of rows per file. 91 | If greater than 0 then this will limit how many rows are placed in any single file. 92 | Otherwise there will be no limit and one file will be created in each output directory 93 | unless files need to be closed to respect max_open_files 94 | min_rows_per_group : int. Default 64 * 1024 95 | Minimum number of rows per group. When the value is set, 96 | the dataset writer will batch incoming data and only write the row groups to the disk 97 | when sufficient rows have accumulated. 
98 | max_rows_per_group : int. Default 128 * 1024 99 | Maximum number of rows per group. 100 | If the value is set, then the dataset writer may split up large incoming batches into multiple row groups. 101 | If this value is set, then min_rows_per_group should also be set 102 | name: str | None. Default None 103 | User-provided identifier for this table. 104 | description : str | None. Default None 105 | User-provided description for this table 106 | configuration : Mapping[str, str | None] | None. Default None 107 | A map containing configuration options for the metadata action. 108 | overwrite_schema : bool. Default False 109 | If True, allows updating the schema of the table. 110 | storage_options : dict[str, str] | None. Default None 111 | Options passed to the native delta filesystem. Unused if 'filesystem' is defined 112 | partition_filters : list[tuple[str, str, Any]] | None. Default None 113 | The partition filters that will be used for partition overwrite. 114 | compute : bool. Default True 115 | Whether to trigger the writing operation immediately 116 | 117 | Returns 118 | ------- 119 | dask.Scalar 120 | """ 121 | storage_options = utils.maybe_set_aws_credentials(table_or_uri, storage_options) # type: ignore 122 | table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options) 123 | 124 | # We need to write against the latest table version 125 | if table: 126 | table.update_incremental() 127 | 128 | utils.enforce_append_only(table=table, configuration=configuration, mode=mode) 129 | 130 | if filesystem is None: 131 | if table is not None: 132 | storage_options = table._storage_options or {} 133 | storage_options.update(storage_options or {}) 134 | 135 | storage_options = utils.maybe_set_aws_credentials(table_uri, storage_options) 136 | filesystem = pa_fs.PyFileSystem(DeltaStorageHandler(table_uri, storage_options)) 137 | 138 | if isinstance(partition_by, str): 139 | partition_by = [partition_by] 140 | 141 | if schema is not None: 142 | schema = pyarrow_to_deltalake(schema) 143 | 144 | if table: # already exists 145 | if ( 146 | schema is not None 147 | and schema != pa.schema(table.schema()) 148 | and not (mode == "overwrite" and overwrite_schema) 149 | ): 150 | raise ValueError( 151 | "Schema of data does not match table schema\n" 152 | f"Table schema:\n{schema}\nData Schema:\n{table.schema().to_arrow()}" 153 | ) 154 | 155 | if mode == "error": 156 | raise AssertionError("DeltaTable already exists.") 157 | elif mode == "ignore": 158 | return 159 | 160 | current_version = table.version() 161 | 162 | if partition_by: 163 | assert partition_by == table.metadata().partition_columns 164 | else: 165 | partition_by = table.metadata().partition_columns 166 | 167 | if table.protocol().min_writer_version > MAX_SUPPORTED_PYARROW_WRITER_VERSION: 168 | raise DeltaProtocolError( 169 | "This table's min_writer_version is " 170 | f"{table.protocol().min_writer_version}, " 171 | f"but this method only supports version {MAX_SUPPORTED_PYARROW_WRITER_VERSION}." 
172 | ) 173 | else: # creating a new table 174 | current_version = -1 175 | 176 | # FIXME: schema is only known at this point if provided by the user 177 | if partition_by and schema: 178 | partition_schema = pa.schema([schema.field(name) for name in partition_by]) 179 | partitioning = ds.partitioning(partition_schema, flavor="hive") 180 | else: 181 | if partition_by: 182 | raise NotImplementedError("Have to provide schema when using partition_by") 183 | partitioning = None 184 | if mode == "overwrite": 185 | # FIXME: There are a couple of checks that are not migrated yet 186 | raise NotImplementedError("mode='overwrite' is not implemented") 187 | written = df.map_partitions( 188 | _write_partition, 189 | schema=schema, 190 | partitioning=partitioning, 191 | current_version=current_version, 192 | file_options=file_options, 193 | max_open_files=max_open_files, 194 | max_rows_per_file=max_rows_per_file, 195 | min_rows_per_group=min_rows_per_group, 196 | max_rows_per_group=max_rows_per_group, 197 | filesystem=filesystem, 198 | max_partitions=max_partitions, 199 | meta=(None, object), 200 | table=DaskDeltaTable.from_delta_table(table) if table else None, 201 | configuration=configuration, 202 | ) 203 | result = dask.delayed(_commit, name="deltatable-commit")( 204 | DaskDeltaTable.from_delta_table(table) if table else None, 205 | written, 206 | table_uri, 207 | schema, 208 | mode, 209 | partition_by, 210 | name, 211 | description, 212 | configuration, 213 | storage_options, 214 | partition_filters, 215 | custom_metadata, 216 | ) 217 | 218 | if compute: 219 | result = result.compute() 220 | return result 221 | 222 | 223 | def _commit( 224 | table, 225 | schemas_add_actions_nested, 226 | table_uri, 227 | schema, 228 | mode, 229 | partition_by, 230 | name, 231 | description, 232 | configuration, 233 | storage_options, 234 | partition_filters, 235 | custom_metadata, 236 | ): 237 | schemas = list(flatten(pluck(0, schemas_add_actions_nested))) 238 | add_actions = list(flatten(pluck(1, schemas_add_actions_nested))) 239 | # TODO: What should the behavior be if the schema is provided? Cast the 240 | # data? 
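    # The schemas gathered from every written partition (plus the user-provided
    # schema, if any) are reduced to a single reference schema below; if they are
    # not compatible, validate_compatible raises before anything is committed.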
241 | if schema: 242 | schemas.append(schema) 243 | 244 | # TODO: This is applying a potentially stricter schema control than what 245 | # Delta requires but if this passes, it should be good to go 246 | schema = validate_compatible(schemas) 247 | assert schema 248 | delta_schema = DeltaSchema.from_arrow(schema) 249 | commit_properties = CommitProperties(custom_metadata=custom_metadata) 250 | if table is None: 251 | storage_options = utils.maybe_set_aws_credentials(table_uri, storage_options) 252 | create_table_with_add_actions( 253 | table_uri, 254 | delta_schema, 255 | add_actions, 256 | mode, 257 | partition_by or [], 258 | name, 259 | description, 260 | configuration, 261 | storage_options, 262 | commit_properties, 263 | ) 264 | else: 265 | table._table.create_write_transaction( 266 | add_actions, 267 | mode, 268 | partition_by or [], 269 | delta_schema, 270 | partition_filters, 271 | ) 272 | table.update_incremental() 273 | 274 | 275 | def _write_partition( 276 | df, 277 | *, 278 | schema, 279 | partitioning, 280 | current_version, 281 | file_options, 282 | max_open_files, 283 | max_rows_per_file, 284 | min_rows_per_group, 285 | max_rows_per_group, 286 | filesystem, 287 | max_partitions, 288 | table, 289 | configuration, 290 | ) -> tuple[pa.Schema, list[AddAction]]: 291 | if schema is None: 292 | # 293 | schema = pyarrow_to_deltalake(pa.Schema.from_pandas(df)) 294 | data = pa.Table.from_pandas(df, schema=schema) 295 | 296 | add_actions: list[AddAction] = [] 297 | 298 | def visitor(written_file: Any) -> None: 299 | num_indexed_cols, stats_cols = utils.get_num_idx_cols_and_stats_columns( 300 | table if table is not None else None, configuration 301 | ) 302 | path, partition_values = utils.get_partitions_from_path(written_file.path) 303 | stats = utils.get_file_stats_from_metadata( 304 | written_file.metadata, num_indexed_cols, stats_cols 305 | ) 306 | 307 | # PyArrow added support for written_file.size in 9.0.0 308 | if PYARROW_MAJOR_VERSION >= 9: 309 | size = written_file.size 310 | else: 311 | size = filesystem.get_file_info([path])[0].size 312 | 313 | add_actions.append( 314 | AddAction( 315 | path, 316 | size, 317 | partition_values, 318 | int(datetime.now().timestamp() * 1000), 319 | True, 320 | json.dumps(stats, cls=utils.DeltaJSONEncoder), 321 | ) 322 | ) 323 | 324 | if file_options is not None: 325 | file_options = ds.ParquetFileFormat().make_write_options(**file_options) 326 | 327 | ds.write_dataset( 328 | data, 329 | base_dir="/", 330 | basename_template=f"{current_version + 1}-{uuid.uuid4()}-{{i}}.parquet", 331 | format="parquet", 332 | partitioning=partitioning, 333 | # It will not accept a schema if using a RBR 334 | schema=schema, 335 | existing_data_behavior="overwrite_or_ignore", 336 | file_options=file_options, 337 | max_open_files=max_open_files, 338 | file_visitor=visitor, 339 | max_rows_per_file=max_rows_per_file, 340 | min_rows_per_group=min_rows_per_group, 341 | max_rows_per_group=max_rows_per_group, 342 | filesystem=filesystem, 343 | max_partitions=max_partitions, 344 | ) 345 | return schema, add_actions 346 | 347 | 348 | class DaskDeltaTable(DeltaTable): 349 | @classmethod 350 | def from_delta_table( 351 | cls, 352 | table: DeltaTable, 353 | ) -> DaskDeltaTable: 354 | config = table.table_config 355 | return cls( 356 | table_uri=table.table_uri, 357 | version=table.version(), 358 | storage_options=table._storage_options, 359 | without_files=config.without_files, 360 | log_buffer_size=config.log_buffer_size, 361 | ) 362 | 363 | def __reduce__(self) -> tuple[type, 
tuple[Any, ...]]: 364 | """ 365 | This allows DeltaTable to be pickled. 366 | """ 367 | config = self.table_config 368 | return ( 369 | self.__class__, 370 | ( 371 | self.table_uri, 372 | self.version(), 373 | self._storage_options, 374 | config.without_files, 375 | config.log_buffer_size, 376 | ), 377 | ) 378 | -------------------------------------------------------------------------------- /dask_deltatable/core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from collections.abc import Sequence 5 | from typing import Any, Callable, cast 6 | 7 | import dask 8 | import dask.dataframe as dd 9 | import pyarrow as pa 10 | import pyarrow.parquet as pq 11 | from dask.base import tokenize 12 | from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine 13 | from dask.dataframe.utils import make_meta 14 | from deltalake import DeltaTable 15 | from fsspec.core import get_fs_token_paths 16 | from packaging.version import Version 17 | from pyarrow import dataset as pa_ds 18 | 19 | from . import utils 20 | from .types import Filters 21 | 22 | if Version(pa.__version__) >= Version("10.0.0"): 23 | filters_to_expression = pq.filters_to_expression 24 | else: 25 | # fallback to older internal method 26 | filters_to_expression = pq._filters_to_expression 27 | 28 | 29 | def _get_pq_files(dt: DeltaTable, filter: Filters = None) -> list[str]: 30 | """ 31 | Get the list of parquet files after loading the 32 | current datetime version 33 | 34 | Parameters 35 | ---------- 36 | dt : DeltaTable 37 | DeltaTable instance 38 | filter : list[tuple[str, str, Any]] | list[list[tuple[str, str, Any]]] | None 39 | Filters in DNF form. 40 | 41 | Returns 42 | ------- 43 | list[str] 44 | List of files matching optional filter. 
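
    Examples
    --------
    For illustration only (``"year"`` is an assumed partition column, not one
    defined by this project): ``filter=[("year", "==", 2021)]`` prunes the file
    listing to matching partitions, whereas an AND-group made up purely of
    non-partition (row-level) filters disables pruning and every file is returned.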
45 | """ 46 | partition_filters = utils.get_partition_filters( 47 | dt.metadata().partition_columns, filter 48 | ) 49 | if not partition_filters: 50 | # can't filter 51 | return sorted(dt.file_uris()) 52 | file_uris = set() 53 | for filter_set in partition_filters: 54 | file_uris.update(dt.file_uris(partition_filters=filter_set)) 55 | return sorted(list(file_uris)) 56 | 57 | 58 | def _read_delta_partition( 59 | filename: str, 60 | schema: pa.Schema, 61 | fs: Any, 62 | columns: Sequence[str] | None, 63 | filter: Filters = None, 64 | pyarrow_to_pandas: dict[str, Any] | None = None, 65 | **_kwargs: dict[str, Any], 66 | ): 67 | filter_expression = filters_to_expression(filter) if filter else None 68 | if pyarrow_to_pandas is None: 69 | pyarrow_to_pandas = {} 70 | pyarrow_to_pandas["types_mapper"] = _get_type_mapper( 71 | pyarrow_to_pandas.get("types_mapper") 72 | ) 73 | pyarrow_to_pandas["ignore_metadata"] = pyarrow_to_pandas.get( 74 | "ignore_metadata", False 75 | ) 76 | table = pa_ds.dataset( 77 | source=filename, 78 | schema=schema, 79 | filesystem=fs, 80 | format="parquet", 81 | partitioning="hive", 82 | ).to_table(filter=filter_expression, columns=columns) 83 | return table.to_pandas(**pyarrow_to_pandas) 84 | 85 | 86 | def _read_from_filesystem( 87 | path: str, 88 | version: int | None, 89 | columns: Sequence[str] | None, 90 | datetime: str | None = None, 91 | storage_options: dict[str, str] | None = None, 92 | delta_storage_options: dict[str, str] | None = None, 93 | **kwargs: dict[str, Any], 94 | ) -> dd.DataFrame: 95 | """ 96 | Reads the list of parquet files in parallel 97 | """ 98 | storage_options = utils.maybe_set_aws_credentials(path, storage_options) # type: ignore 99 | delta_storage_options = utils.maybe_set_aws_credentials(path, delta_storage_options) # type: ignore 100 | 101 | fs, fs_token, _ = get_fs_token_paths(path, storage_options=storage_options) 102 | dt = DeltaTable( 103 | table_uri=path, version=version, storage_options=delta_storage_options 104 | ) 105 | if datetime is not None: 106 | dt.load_as_version(datetime) 107 | 108 | schema = pa.schema(dt.schema()) 109 | 110 | filter_value = cast(Filters, kwargs.get("filter", None)) 111 | pq_files = _get_pq_files(dt, filter=filter_value) 112 | 113 | mapper_kwargs = kwargs.get("pyarrow_to_pandas", {}) 114 | mapper_kwargs["types_mapper"] = _get_type_mapper( 115 | mapper_kwargs.get("types_mapper", None) 116 | ) 117 | meta = make_meta(pa.table(schema.empty_table()).to_pandas(**mapper_kwargs)) 118 | if columns: 119 | meta = meta[columns] 120 | 121 | if not dd._dask_expr_enabled(): 122 | # Setting token not supported in dask-expr 123 | kwargs["token"] = tokenize(path, fs_token, **kwargs) # type: ignore 124 | 125 | if len(pq_files) == 0: 126 | df = schema.empty_table().to_pandas() 127 | if columns is not None: 128 | df = df[columns] 129 | return dd.from_pandas(df, npartitions=1) 130 | else: 131 | return dd.from_map( 132 | _read_delta_partition, 133 | pq_files, 134 | fs=fs, 135 | columns=columns, 136 | schema=schema, 137 | meta=meta, 138 | label="read-delta-table", 139 | **kwargs, 140 | ) 141 | 142 | 143 | def _get_type_mapper( 144 | user_types_mapper: dict[str, Any] | None, 145 | ) -> Callable[[Any], Any] | None: 146 | """ 147 | Set the type mapper for the schema 148 | """ 149 | convert_string = dask.config.get("dataframe.convert-string", True) 150 | if convert_string is None: 151 | convert_string = True 152 | return ArrowDatasetEngine._determine_type_mapper( 153 | dtype_backend=None, 154 | convert_string=convert_string, 155 | 
arrow_to_pandas={"types_mapper": user_types_mapper}, 156 | ) 157 | 158 | 159 | def read_deltalake( 160 | path: str | None = None, 161 | catalog: str | None = None, 162 | database_name: str | None = None, 163 | table_name: str | None = None, 164 | version: int | None = None, 165 | columns: list[str] | None = None, 166 | storage_options: dict[str, str] | None = None, 167 | datetime: str | None = None, 168 | delta_storage_options: dict[str, str] | None = None, 169 | **kwargs, 170 | ): 171 | """ 172 | Read a Delta Table into a Dask DataFrame 173 | 174 | This reads a list of Parquet files in delta table directory into a 175 | Dask.dataframe. 176 | 177 | Parameters 178 | ---------- 179 | path: Optional[str] 180 | path of Delta table directory 181 | catalog: Optional[str] 182 | Currently supports only AWS Glue Catalog 183 | if catalog is provided, user has to provide database and table name, and 184 | delta-rs will fetch the metadata from glue catalog, this is used by dask to read 185 | the parquet tables 186 | database_name: Optional[str] 187 | database name present in the catalog 188 | tablename: Optional[str] 189 | table name present in the database of the Catalog 190 | version: int, default None 191 | DeltaTable Version, used for Time Travelling across the 192 | different versions of the parquet datasets 193 | datetime: str, default None 194 | Time travel Delta table to the latest version that's created at or 195 | before provided `datetime_string` argument. 196 | The `datetime_string` argument should be an RFC 3339 and ISO 8601 date 197 | and time string. 198 | 199 | Examples: 200 | `2018-01-26T18:30:09Z` 201 | `2018-12-19T16:39:57-08:00` 202 | `2018-01-26T18:30:09.453+00:00` 203 | #(copied from delta-rs docs) 204 | columns: None or list(str) 205 | Columns to load. If None, loads all. 206 | storage_options : dict, default None 207 | Key/value pairs to be passed on to the fsspec backend, if any. 208 | delta_storage_options : dict, default None 209 | Key/value pairs to be passed on to the delta-rs filesystem, if any. 210 | kwargs: dict,optional 211 | Some most used parameters can be passed here are: 212 | 1. schema 213 | 2. filter 214 | 3. pyarrow_to_pandas 215 | 216 | schema: pyarrow.Schema 217 | Used to maintain schema evolution in deltatable. 218 | delta protocol stores the schema string in the json log files which is 219 | converted into pyarrow.Schema and used for schema evolution 220 | i.e Based on particular version, some columns can be 221 | shown or not shown. 222 | 223 | filter: Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None 224 | List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``. 225 | Can act as both partition as well as row based filter, above list of filters 226 | converted into pyarrow.dataset.Expression built using pyarrow.dataset.Field 227 | example: 228 | [("x",">",400)] --> pyarrow.dataset.field("x")>400 229 | 230 | pyarrow_to_pandas: dict 231 | Options to pass directly to pyarrow.Table.to_pandas. 232 | Common options include: 233 | * categories: list[str] 234 | List of columns to treat as pandas.Categorical 235 | * strings_to_categorical: bool 236 | Encode string (UTF8) and binary types to pandas.Categorical. 237 | * types_mapper: Callable 238 | A function mapping a pyarrow DataType to a pandas ExtensionDtype 239 | 240 | See https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas 241 | for more. 
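        For example (illustrative only), ``pyarrow_to_pandas={"types_mapper": {pa.string(): pd.StringDtype()}.get}``
        maps Arrow strings to the pandas nullable string dtype; any callable accepted
        by ``pyarrow.Table.to_pandas`` works.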
242 | 243 | Returns 244 | ------- 245 | Dask.DataFrame 246 | 247 | Examples 248 | -------- 249 | >>> import dask_deltatable as ddt 250 | >>> df = ddt.read_deltalake('s3://bucket/my-delta-table') # doctest: +SKIP 251 | 252 | """ 253 | if catalog is not None: 254 | if (database_name is None) or (table_name is None): 255 | raise ValueError( 256 | "Since Catalog was provided, please provide Database and table name" 257 | ) 258 | else: 259 | raise NotImplementedError( 260 | "Reading from a catalog used to be supported ", 261 | "but was removed from the upstream dependency delta-rs>=1.0.", 262 | ) 263 | else: 264 | if path is None: 265 | raise ValueError("Please Provide Delta Table path") 266 | 267 | delta_storage_options = utils.maybe_set_aws_credentials(path, delta_storage_options) # type: ignore 268 | resultdf = _read_from_filesystem( 269 | path=path, 270 | version=version, 271 | columns=columns, 272 | storage_options=storage_options, 273 | datetime=datetime, 274 | delta_storage_options=delta_storage_options, 275 | **kwargs, 276 | ) 277 | return resultdf 278 | 279 | 280 | def read_unity_catalog( 281 | catalog_name: str, 282 | schema_name: str, 283 | table_name: str, 284 | **kwargs, 285 | ) -> dd.DataFrame: 286 | """ 287 | Read a Delta Table from Databricks Unity Catalog into a Dask DataFrame. 288 | 289 | This function connects to Databricks using the WorkspaceClient and retrieves 290 | temporary credentials to access the specified Unity Catalog table. It then 291 | reads the Delta table's Parquet files into a Dask DataFrame. 292 | 293 | Parameters 294 | ---------- 295 | catalog_name : str 296 | Name of the Unity Catalog catalog. 297 | schema_name : str 298 | Name of the schema within the catalog. 299 | table_name : str 300 | Name of the table within the catalog schema. 301 | **kwargs 302 | Additional keyword arguments passed to `dask.dataframe.read_parquet`. 303 | Some most used parameters can be passed here are: 304 | 1. schema 305 | 2. filter 306 | 3. pyarrow_to_pandas 307 | 4. databricks_host 308 | 5. databricks_token 309 | 310 | schema: pyarrow.Schema 311 | Used to maintain schema evolution in deltatable. 312 | delta protocol stores the schema string in the json log files which is 313 | converted into pyarrow.Schema and used for schema evolution 314 | i.e Based on particular version, some columns can be 315 | shown or not shown. 316 | 317 | filter: Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None 318 | List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``. 319 | Can act as both partition as well as row based filter, above list of filters 320 | converted into pyarrow.dataset.Expression built using pyarrow.dataset.Field 321 | example: 322 | [("x",">",400)] --> pyarrow.dataset.field("x")>400 323 | 324 | pyarrow_to_pandas: dict 325 | Options to pass directly to pyarrow.Table.to_pandas. 326 | Common options include: 327 | * categories: list[str] 328 | List of columns to treat as pandas.Categorical 329 | * strings_to_categorical: bool 330 | Encode string (UTF8) and binary types to pandas.Categorical. 331 | * types_mapper: Callable 332 | A function mapping a pyarrow DataType to a pandas ExtensionDtype 333 | 334 | See https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas 335 | for more. 336 | 337 | databricks_host: str 338 | The Databricks workspace URL hosting the Unity Catalog. 339 | 340 | databricks_token: str 341 | A Databricks personal access token with at least read access on the catalog. 
342 | 343 | Returns 344 | ------- 345 | dask.dataframe.DataFrame 346 | A Dask DataFrame representing the Delta table. 347 | 348 | Notes 349 | ----- 350 | Requires the following to be set as either environment variables or in `kwargs` as 351 | lower case: 352 | - DATABRICKS_HOST: The Databricks workspace URL hosting the Unity Catalog. 353 | - DATABRICKS_TOKEN: A Databricks personal access token with at least read access on 354 | the catalog. 355 | 356 | Example 357 | ------- 358 | >>> ddf = read_unity_catalog( 359 | ... catalog_name="main", 360 | ... schema_name="my_schema", 361 | ... table_name="my_table", 362 | ... ) 363 | """ 364 | from databricks.sdk import WorkspaceClient 365 | from databricks.sdk.service.catalog import TableOperation 366 | 367 | try: 368 | workspace_client = WorkspaceClient( 369 | host=os.environ.get("DATABRICKS_HOST", kwargs["databricks_host"]), 370 | token=os.environ.get("DATABRICKS_TOKEN", kwargs["databricks_token"]), 371 | ) 372 | except KeyError: 373 | raise ValueError( 374 | "Please set `DATABRICKS_HOST` and `DATABRICKS_TOKEN` either as environment" 375 | " variables or as part of `kwargs` with lowercase" 376 | ) 377 | uc_full_url = f"{catalog_name}.{schema_name}.{table_name}" 378 | table = workspace_client.tables.get(uc_full_url) 379 | temp_credentials = workspace_client.temporary_table_credentials.generate_temporary_table_credentials( 380 | operation=TableOperation.READ, 381 | table_id=table.table_id, 382 | ) 383 | storage_options = { 384 | "sas_token": temp_credentials.azure_user_delegation_sas.sas_token 385 | } 386 | delta_table = DeltaTable( 387 | table_uri=table.storage_location, storage_options=storage_options 388 | ) 389 | ddf = dd.read_parquet( 390 | path=delta_table.file_uris(), 391 | storage_options=storage_options, 392 | **kwargs, 393 | ) 394 | return ddf 395 | --------------------------------------------------------------------------------
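
For reference, a minimal, hedged usage sketch of the public API collected above (``to_deltalake`` and ``read_deltalake``). The table path and the ``year``/``value`` columns are illustrative assumptions, not anything defined by this project, and the snippet assumes a fresh local directory.

# Illustrative only; the path and column names are placeholders.
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa

import dask_deltatable as ddt

ddf = dd.from_pandas(
    pd.DataFrame({"year": [2020, 2020, 2021], "value": [1.0, 2.0, 3.0]}),
    npartitions=2,
)

# Creating a brand-new partitioned table requires an explicit schema;
# to_deltalake raises NotImplementedError if partition_by is set without one.
schema = pa.schema([("year", pa.int64()), ("value", pa.float64())])
ddt.to_deltalake(
    "/tmp/demo_delta_table",  # assumed table URI
    ddf,
    schema=schema,
    partition_by="year",
    mode="error",  # the default: fail if the table already exists
)

# Filters on partition columns prune files via get_partition_filters before
# any data is read; filters on other columns are applied row-wise per file.
result = ddt.read_deltalake(
    "/tmp/demo_delta_table",
    columns=["value"],
    filter=[("year", "==", 2021)],
)
print(result.compute())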