├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── black.yml │ ├── ci.yml │ ├── mkdocs.yml │ └── ruff.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── docker-compose.yaml ├── docs ├── gen_ref_pages.py └── index.md ├── images └── mack.jpg ├── mack └── __init__.py ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py └── test_public_interface.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | 20 | 21 | **Expected behavior** 22 | A clear and concise description of what you expected to happen. 23 | 24 | **Additional context** 25 | Add any other context about the problem here. 26 | 27 | **Environment information** 28 | 29 | * Delta Lake version: 30 | * Spark version: 31 | * Python version: 32 | 33 | **Willingness to contribute** 34 | 35 | Would you be willing to contribute a fix for this bug to the mack code base? 36 | 37 | - [ ] Yes. I can contribute a fix for this bug independently. 38 | - [ ] Yes. I would be willing to contribute a fix for this bug with guidance from the mack community. 39 | - [ ] No. I cannot contribute a bug fix at this time. 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE REQUEST]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | 22 | **Willingness to contribute** 23 | 24 | Would you be willing to contribute an implementation of this feature? 25 | 26 | - [ ] Yes. I can contribute this feature independently. 27 | - [ ] Yes. I would be willing to contribute this feature with guidance from the mack community. 28 | - [ ] No. I cannot contribute this feature at this time. 
29 | -------------------------------------------------------------------------------- /.github/workflows/black.yml: -------------------------------------------------------------------------------- 1 | name: Black format check 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | lint: 13 | name: lint 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Black Code Formatter Check 18 | uses: psf/black@stable 19 | with: 20 | options: "--verbose --check" 21 | src: "mack" 22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Unit tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | 13 | test: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | with: 19 | fetch-depth: 1 20 | 21 | - name: Set up Python 3.9 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: 3.9 25 | 26 | - name: Install Poetry 27 | uses: snok/install-poetry@v1 28 | 29 | - name: Cache Poetry virtualenv 30 | uses: actions/cache@v1 31 | id: cache 32 | with: 33 | path: ~/.virtualenvs 34 | key: poetry-${{ hashFiles('**/poetry.lock') }} 35 | restore-keys: | 36 | poetry-${{ hashFiles('**/poetry.lock') }} 37 | - name: Install dependencies 38 | run: poetry install 39 | if: steps.cache.outputs.cache-hit != 'true' 40 | 41 | - name: Run mack tests with pytest 42 | run: poetry run pytest 43 | -------------------------------------------------------------------------------- /.github/workflows/mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: MKDocs deploy 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.9 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.9 19 | - name: Set up Poetry 20 | uses: abatilo/actions-poetry@v2 21 | with: 22 | poetry-version: 1.4.0 23 | - name: Cache Poetry virtualenv 24 | uses: actions/cache@v1 25 | id: cache 26 | with: 27 | path: ~/.virtualenvs 28 | key: poetry-${{ hashFiles('**/poetry.lock') }} 29 | restore-keys: | 30 | poetry-${{ hashFiles('**/poetry.lock') }} 31 | - name: Install dependencies 32 | run: 33 | poetry install --with mkdocs 34 | if: steps.cache.outputs.cache-hit != 'true' 35 | - name: Setup GH 36 | run: | 37 | sudo apt update && sudo apt install -y git 38 | git config user.name 'github-actions[bot]' 39 | git config user.email 'github-actions[bot]@users.noreply.github.com' 40 | - name: Build and Deploy 41 | run: 42 | poetry run mkdocs gh-deploy --force 43 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff check 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.9 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.9 23 | - name: ruff Lint 24 | uses: jpetrucciani/ruff-check@main 25 | with: 26 | line-length: "150" 27 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | chispa.egg-info/ 4 | .cache/ 5 | tmp/ 6 | .idea/ 7 | .DS_Store 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | 13 | .pytest_cache/ 14 | 15 | # MKDocs 16 | site 17 | 18 | # Virtualenvs 19 | .venv 20 | .env 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: check-added-large-files 6 | - id: check-ast 7 | - id: check-byte-order-marker 8 | - id: check-case-conflict 9 | - id: check-docstring-first 10 | - id: check-executables-have-shebangs 11 | - id: check-json 12 | - id: check-merge-conflict 13 | - id: check-toml 14 | - id: check-yaml 15 | - id: debug-statements 16 | - id: detect-private-key 17 | - id: end-of-file-fixer 18 | - id: requirements-txt-fixer 19 | - id: trailing-whitespace 20 | - repo: https://github.com/charliermarsh/ruff-pre-commit 21 | rev: 'v0.0.245' 22 | hooks: 23 | - id: ruff 24 | args: ["--fix", "--line-length", "150", "--exit-non-zero-on-fix"] 25 | - repo: https://github.com/psf/black 26 | rev: 22.12.0 27 | hooks: 28 | - id: black 29 | language_version: python3.9 30 | - repo: local 31 | hooks: 32 | - id: pytest 33 | name: pytest 34 | entry: pytest tests/ 35 | pass_filenames: false 36 | language: system 37 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to the mack contributing guide 2 | 3 | ### Issues 4 | 5 | #### Create a new issue 6 | 7 | If you spot a problem with the docs, search if an issue already . If a related issue doesn't exist, you can open a [new issue](https://github.com/MrPowers/mack/issues/new). 8 | 9 | #### Solve an issue 10 | 11 | Scan through our [existing issues](https://github.com/MrPowers/mack/issues) to find one that interests you. If you find an issue to work on, make sure that no one else is already working on it, so you can get assigned. After that, you are welcome to open a PR with a fix. 12 | 13 | ### Pull Request 14 | 15 | When you're finished with the changes, create a pull request, also known as a PR. 16 | - Before you commit, install the pre-commit hooks with `pre-commit install`. 17 | - Make sure that the pre-commit hooks pass on your local machine. 18 | - Don't forget to link PR to issue if you are solving one. 19 | - As you update your PR and apply changes, mark each conversation as resolved. 20 | - If you run into any merge issues, checkout this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues. 
21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | ENV TZ=America/Chicago 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | RUN apt-get update && \ 7 | apt-get -y install --no-install-recommends default-jdk software-properties-common python3-pip python3.9 python3.9-dev libpq-dev build-essential wget libssl-dev libffi-dev vim && \ 8 | apt-get clean 9 | 10 | RUN wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz && \ 11 | tar xvf spark-3.3.2-bin-hadoop3.tgz && \ 12 | mv spark-3.3.2-bin-hadoop3/ /usr/local/spark && \ 13 | ln -s /usr/local/spark spark 14 | 15 | WORKDIR app 16 | COPY . /app 17 | 18 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2 19 | RUN update-alternatives --config python3 20 | 21 | RUN pip3 install poetry delta-spark 22 | RUN poetry install 23 | 24 | ENV PYSPARK_PYTHON=python3 25 | ENV PYSPARK_SUBMIT_ARGS='--packages io.delta:delta-core_2.12:2.2.0 pyspark-shell' 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Matthew Powers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mack 2 | 3 | ![![image](https://github.com/MrPowers/mack/workflows/build/badge.svg)](https://github.com/MrPowers/mack/actions/workflows/ci.yml/badge.svg) 4 | ![![image](https://github.com/MrPowers/mack/workflows/build/badge.svg)](https://github.com/MrPowers/mack/actions/workflows/black.yml/badge.svg) 5 | ![![image](https://github.com/MrPowers/mack/workflows/build/badge.svg)](https://github.com/MrPowers/mack/actions/workflows/ruff.yml/badge.svg) 6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/mack) 7 | [![PyPI version](https://badge.fury.io/py/mack.svg)](https://badge.fury.io/py/mack) 8 | 9 | mack provides a variety of helper methods that make it easy for you to perform common Delta Lake operations. 10 | 11 | ![mack](https://github.com/MrPowers/mack/raw/main/images/mack.jpg) 12 | 13 | ## Setup 14 | 15 | Install mack with `pip install mack`. 
16 | 17 | Here's an example of how you can perform a Type 2 SCD upsert with a single line of code using Mack: 18 | 19 | ```python 20 | import mack 21 | 22 | mack.type_2_scd_upsert(delta_table, updatesDF, "pkey", ["attr1", "attr2"]) 23 | ``` 24 | 25 | ## Type 2 SCD Upserts 26 | 27 | This library provides an opinionated, conventions-over-configuration approach to Type 2 SCD management. Let's look at an example before 28 | covering the conventions required to take advantage of the functionality. 29 | 30 | Suppose you have the following SCD table with the `pkey` primary key: 31 | 32 | ``` 33 | +----+-----+-----+----------+-------------------+--------+ 34 | |pkey|attr1|attr2|is_current| effective_time|end_time| 35 | +----+-----+-----+----------+-------------------+--------+ 36 | | 1| A| A| true|2019-01-01 00:00:00| null| 37 | | 2| B| B| true|2019-01-01 00:00:00| null| 38 | | 4| D| D| true|2019-01-01 00:00:00| null| 39 | +----+-----+-----+----------+-------------------+--------+ 40 | ``` 41 | 42 | You'd like to perform an upsert with this data: 43 | 44 | ``` 45 | +----+-----+-----+-------------------+ 46 | |pkey|attr1|attr2| effective_time| 47 | +----+-----+-----+-------------------+ 48 | | 2| Z| null|2020-01-01 00:00:00| // upsert data 49 | | 3| C| C|2020-09-15 00:00:00| // new pkey 50 | +----+-----+-----+-------------------+ 51 | ``` 52 | 53 | Here's how to perform the upsert: 54 | 55 | ```python 56 | mack.type_2_scd_upsert(delta_table, updatesDF, "pkey", ["attr1", "attr2"]) 57 | ``` 58 | 59 | Here's the table after the upsert: 60 | 61 | ``` 62 | +----+-----+-----+----------+-------------------+-------------------+ 63 | |pkey|attr1|attr2|is_current| effective_time| end_time| 64 | +----+-----+-----+----------+-------------------+-------------------+ 65 | | 2| B| B| false|2019-01-01 00:00:00|2020-01-01 00:00:00| 66 | | 4| D| D| true|2019-01-01 00:00:00| null| 67 | | 1| A| A| true|2019-01-01 00:00:00| null| 68 | | 3| C| C| true|2020-09-15 00:00:00| null| 69 | | 2| Z| null| true|2020-01-01 00:00:00| null| 70 | +----+-----+-----+----------+-------------------+-------------------+ 71 | ``` 72 | 73 | You can leverage the upsert code if your SCD table meets these requirements: 74 | 75 | * Contains a unique primary key column 76 | * Any change in an attribute column triggers an upsert 77 | * SCD logic is exposed via `effective_time`, `end_time` and `is_current` columns (you can also use date or version columns for SCD upserts) 78 | 79 | ## Kill duplicates 80 | 81 | The `kill_duplicates` function completely removes all duplicate rows from a Delta table. 82 | 83 | Suppose you have the following table: 84 | 85 | ``` 86 | +----+----+----+ 87 | |col1|col2|col3| 88 | +----+----+----+ 89 | | 1| A| A| # duplicate 90 | | 2| A| B| 91 | | 3| A| A| # duplicate 92 | | 4| A| A| # duplicate 93 | | 5| B| B| # duplicate 94 | | 6| D| D| 95 | | 9| B| B| # duplicate 96 | +----+----+----+ 97 | ``` 98 | 99 | Run the `kill_duplicates` function: 100 | 101 | ```python 102 | mack.kill_duplicates(deltaTable, ["col2", "col3"]) 103 | ``` 104 | 105 | Here's the ending state of the table: 106 | 107 | ``` 108 | +----+----+----+ 109 | |col1|col2|col3| 110 | +----+----+----+ 111 | | 2| A| B| 112 | | 6| D| D| 113 | +----+----+----+ 114 | ``` 115 | 116 | ## Drop duplicates with Primary Key 117 | 118 | The `drop_duplicates_pkey` function removes all but one duplicate row from a Delta table. 119 | **Warning:** You have to provide a primary column that **must contain unique values**, otherwise the method will default to killing the duplicates.
120 | If you cannot provide a unique primary key, you can use the `drop_duplicates` method. 121 | 122 | Suppose you have the following table: 123 | 124 | ``` 125 | +----+----+----+----+ 126 | |col1|col2|col3|col4| 127 | +----+----+----+----+ 128 | | 1| A| A| C| # duplicate1 129 | | 2| A| B| C| 130 | | 3| A| A| D| # duplicate1 131 | | 4| A| A| E| # duplicate1 132 | | 5| B| B| C| # duplicate2 133 | | 6| D| D| C| 134 | | 9| B| B| E| # duplicate2 135 | +----+----+----+----+ 136 | ``` 137 | 138 | Run the `drop_duplicates_pkey` function: 139 | 140 | ```python 141 | mack.drop_duplicates_pkey(delta_table=deltaTable, primary_key="col1", duplication_columns=["col2", "col3"]) 142 | ``` 143 | 144 | Here's the ending state of the table: 145 | 146 | ``` 147 | +----+----+----+----+ 148 | |col1|col2|col3|col4| 149 | +----+----+----+----+ 150 | | 1| A| A| C| 151 | | 2| A| B| C| 152 | | 5| B| B| C| 153 | | 6| D| D| C| 154 | +----+----+----+----+ 155 | ``` 156 | 157 | ## Drop duplicates 158 | 159 | The `drop_duplicates` function removes all but one duplicate row from a Delta table. It behaves exactly like the `drop_duplicates` DataFrame API. 160 | **Warning:** This method overwrites the whole table, so it is very inefficient. If you can, use the `drop_duplicates_pkey` method instead. 161 | 162 | Suppose you have the following table: 163 | 164 | ``` 165 | +----+----+----+----+ 166 | |col1|col2|col3|col4| 167 | +----+----+----+----+ 168 | | 1| A| A| C| # duplicate 169 | | 1| A| A| C| # duplicate 170 | | 2| A| A| C| 171 | +----+----+----+----+ 172 | ``` 173 | 174 | Run the `drop_duplicates` function: 175 | 176 | ```python 177 | mack.drop_duplicates(delta_table=deltaTable, duplication_columns=["col1"]) 178 | ``` 179 | 180 | Here's the ending state of the table: 181 | 182 | ``` 183 | +----+----+----+----+ 184 | |col1|col2|col3|col4| 185 | +----+----+----+----+ 186 | | 1| A| A| C| 187 | | 2| A| A| C| 188 | +----+----+----+----+ 189 | ``` 190 | 191 | ## Copy table 192 | 193 | The `copy_table` function copies an existing Delta table. 194 | When you copy a table, it gets recreated at a specified target. This target could be a path or a table in a metastore. 195 | Copying includes: 196 | 197 | * Data 198 | * Partitioning 199 | * Table properties 200 | 201 | Copying **does not** include the delta log, which means that you will not be able to restore the new table to an old version of the original 202 | table. 203 | 204 | Here's how to perform the copy: 205 | 206 | ```python 207 | mack.copy_table(delta_table=deltaTable, target_path=path) 208 | ``` 209 | 210 | ## Validate append 211 | 212 | The `validate_append` function provides a mechanism for allowing some columns for schema evolution, but rejecting appends with columns that aren't specifically allowlisted.
213 | 214 | Suppose you have the following Delta table: 215 | 216 | ``` 217 | +----+----+----+ 218 | |col1|col2|col3| 219 | +----+----+----+ 220 | | 2| b| B| 221 | | 1| a| A| 222 | +----+----+----+ 223 | ``` 224 | 225 | Here's a appender function that wraps `validate_append`: 226 | 227 | ```python 228 | def append_fun(delta_table, append_df): 229 | mack.validate_append( 230 | delta_table, 231 | append_df, 232 | required_cols=["col1", "col2"], 233 | optional_cols=["col4"], 234 | ) 235 | ``` 236 | 237 | You can append the following DataFrame that contains the required columns and the optional columns: 238 | 239 | ``` 240 | +----+----+----+ 241 | |col1|col2|col4| 242 | +----+----+----+ 243 | | 3| c| cat| 244 | | 4| d| dog| 245 | +----+----+----+ 246 | ``` 247 | 248 | Here's what the Delta table will contain after that data is appended: 249 | 250 | ``` 251 | +----+----+----+----+ 252 | |col1|col2|col3|col4| 253 | +----+----+----+----+ 254 | | 3| c|null| cat| 255 | | 4| d|null| dog| 256 | | 2| b| B|null| 257 | | 1| a| A|null| 258 | +----+----+----+----+ 259 | ``` 260 | 261 | You cannot append the following DataFrame which contains the required columns, but also contains another column (`col5`) that's not specified as an optional column. 262 | 263 | ``` 264 | +----+----+----+ 265 | |col1|col2|col5| 266 | +----+----+----+ 267 | | 4| b| A| 268 | | 5| y| C| 269 | | 6| z| D| 270 | +----+----+----+ 271 | ``` 272 | 273 | Here's the error you'll get when you attempt this write: "TypeError: The column 'col5' is not part of the current Delta table. If you want to add the column to the table you must set the optional_cols parameter." 274 | 275 | You also cannot append the following DataFrame which is missing one of the required columns. 276 | 277 | ``` 278 | +----+----+ 279 | |col1|col4| 280 | +----+----+ 281 | | 4| A| 282 | | 5| C| 283 | | 6| D| 284 | +----+----+ 285 | ``` 286 | 287 | Here's the error you'll get: "TypeError: The base Delta table has these columns '['col1', 'col4']', but these columns are required '['col1', 'col2']'." 288 | 289 | ## Append data without duplicates 290 | 291 | The `append_without_duplicates` function helps to append records to a existing Delta table without getting duplicates appended to the 292 | record. 293 | 294 | Suppose you have the following Delta table: 295 | 296 | ``` 297 | +----+----+----+ 298 | |col1|col2|col3| 299 | +----+----+----+ 300 | | 1| A| B| 301 | | 2| C| D| 302 | | 3| E| F| 303 | +----+----+----+ 304 | ``` 305 | 306 | Here is data to be appended: 307 | 308 | ``` 309 | +----+----+----+ 310 | |col1|col2|col3| 311 | +----+----+----+ 312 | | 2| R| T| # duplicate col1 313 | | 8| A| B| 314 | | 8| C| D| # duplicate col1 315 | | 10| X| Y| 316 | +----+----+----+ 317 | ``` 318 | 319 | Run the `append_without_duplicates` function: 320 | 321 | ```python 322 | mack.append_without_duplicates(deltaTable, append_df, ["col1"]) 323 | ``` 324 | 325 | Here's the ending result: 326 | 327 | ``` 328 | 329 | +----+----+----+ 330 | |col1|col2|col3| 331 | +----+----+----+ 332 | | 1| A| B| 333 | | 2| C| D| 334 | | 3| E| F| 335 | | 8| A| B| 336 | | 10| X| Y| 337 | +----+----+----+ 338 | ``` 339 | 340 | Notice that the duplicate `col1` value was not appended. If a normal append operation was run, then the Delta table would contain two rows of data with `col1` equal to 2. 341 | 342 | ## Delta File Sizes 343 | 344 | The `delta_file_sizes` function returns a dictionary that contains the total size in bytes, the amount of files and the average file size for a given Delta Table. 
345 | 346 | Suppose you have the following Delta Table, partitioned by `col1`: 347 | 348 | ``` 349 | +----+----+----+ 350 | |col1|col2|col3| 351 | +----+----+----+ 352 | | 1| A| A| 353 | | 2| A| B| 354 | +----+----+----+ 355 | ``` 356 | 357 | Running `mack.delta_file_sizes(delta_table)` on that table will return: 358 | 359 | ``` 360 | {"size_in_bytes": 1320, 361 | "number_of_files": 2, 362 | "average_file_size_in_bytes": 660} 363 | ``` 364 | 365 | ## Show Delta File Sizes 366 | 367 | The `show_delta_file_sizes` function prints the number of files, the size of the table, and the average file size for a Delta table. 368 | 369 | Suppose you have the following Delta Table, partitioned by `col1`: 370 | 371 | ``` 372 | +----+----+----+ 373 | |col1|col2|col3| 374 | +----+----+----+ 375 | | 1| A| A| 376 | | 2| A| B| 377 | +----+----+----+ 378 | ``` 379 | 380 | Running `mack.show_delta_file_sizes(delta_table)` on that table will print: 381 | 382 | `The delta table contains 2 files with a size of 1.32 kB. The average file size is 660.0 B` 383 | 384 | ## Humanize Bytes 385 | 386 | The `humanize_bytes` function formats an integer representing a number of bytes in an easily human readable format. 387 | 388 | ```python 389 | mack.humanize_bytes(1234567890) # "1.23 GB" 390 | mack.humanize_bytes(1234567890000) # "1.23 TB" 391 | ``` 392 | 393 | It's a lot easier for a human to understand 1.23 GB compared to 1234567890 bytes. 394 | 395 | ## Is Composite Key Candidate 396 | 397 | The `is_composite_key_candidate` function returns a boolean that indicates whether a set of columns is unique and could form a composite key. 398 | 399 | Suppose you have the following Delta Table: 400 | 401 | ``` 402 | +----+----+----+ 403 | |col1|col2|col3| 404 | +----+----+----+ 405 | | 1| A| A| 406 | | 2| B| B| 407 | | 2| C| B| 408 | +----+----+----+ 409 | ``` 410 | 411 | Running `mack.is_composite_key_candidate(delta_table, ["col1"])` on that table will return `False`. 412 | Running `mack.is_composite_key_candidate(delta_table, ["col1", "col2"])` on that table will return `True`. 413 | 414 | ## Find Composite Key Candidates in the Delta table 415 | 416 | The `find_composite_key_candidates` function helps you find a composite key that uniquely identifies the rows in your Delta table. It returns a list of columns that can be used as a composite key. 417 | 418 | Suppose you have the following Delta table: 419 | 420 | ``` 421 | +----+----+----+ 422 | |col1|col2|col3| 423 | +----+----+----+ 424 | | 1| a| z| 425 | | 1| a| b| 426 | | 3| c| b| 427 | +----+----+----+ 428 | ``` 429 | 430 | Running `mack.find_composite_key_candidates(delta_table)` on that table will return `["col1", "col3"]`. 431 | 432 | ## Append md5 column 433 | 434 | The `with_md5_cols` function appends an `md5` hash of the specified columns to the DataFrame. This can be used as a unique key if the selected columns form a composite key. 435 | 436 | You can use this function with the columns identified in `find_composite_key_candidates` to append a unique key to the DataFrame.
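For example, here's a minimal sketch that chains the two functions together (it assumes `delta_table` is an existing `DeltaTable` object):

```python
import mack

# find a set of columns that uniquely identifies each row...
key_cols = mack.find_composite_key_candidates(delta_table)
# ...and append an md5 hash of those columns as a surrogate unique key
df_with_key = mack.with_md5_cols(delta_table, key_cols)
```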
437 | 438 | Suppose you have the following Delta table: 439 | 440 | ``` 441 | +----+----+----+ 442 | |col1|col2|col3| 443 | +----+----+----+ 444 | | 1| a|null| 445 | | 2| b| b| 446 | | 3| c| c| 447 | +----+----+----+ 448 | ``` 449 | 450 | Running `mack.with_md5_cols(delta_table, ["col2", "col3"])` on that table will append a `md5_col2_col3` as follows: 451 | 452 | ``` 453 | +----+----+----+--------------------------------+ 454 | |col1|col2|col3|md5_col2_col3 | 455 | +----+----+----+--------------------------------+ 456 | |1 |a |null|0cc175b9c0f1b6a831c399e269772661| 457 | |2 |b |b |1eeaac3814eb80cc40efb005cf0b9141| 458 | |3 |c |c |4e202f8309e7b00349c70845ab02fce9| 459 | +----+----+----+--------------------------------+ 460 | ``` 461 | 462 | ## Get Latest Delta Table Version 463 | 464 | The `latest_version` function gets the most current Delta 465 | Table version number and returns it. 466 | 467 | ```python 468 | delta_table = DeltaTable.forPath(spark, path) 469 | mack.latest_version(delta_table) 470 | >> 2 471 | ``` 472 | 473 | ## Append data with constraints 474 | 475 | The `constraint_append` function helps to append records to an existing Delta table even if there are records in the append dataframe that violate table constraints (both check and not null constraints), these records are appended to an existing quarantine Delta table instead of the target table. If the quarantine Delta table is set to `None`, those records that violate table constraints are simply thrown out. 476 | 477 | Suppose you have the following target Delta table with the following schema and constraints: 478 | 479 | ``` 480 | schema: 481 | col1 int not null 482 | col2 string null 483 | col3 string null 484 | 485 | check constraints: 486 | col1_constraint: (col1 > 0) 487 | col2_constraint: (col2 != 'Z') 488 | 489 | +----+----+----+ 490 | |col1|col2|col3| 491 | +----+----+----+ 492 | | 1| A| B| 493 | | 2| C| D| 494 | | 3| E| F| 495 | +----+----+----+ 496 | ``` 497 | 498 | Suppose you have a quarantine Delta table with the same schema but without the constraints. 499 | 500 | Here is data to be appended: 501 | 502 | ``` 503 | +----+----+----+ 504 | |col1|col2|col3| 505 | +----+----+----+ 506 | | | H| H| # violates col1 not null constraint 507 | | 0| Z| Z| # violates both col1_constraint and col2_constraint 508 | | 4| A| B| 509 | | 5| C| D| 510 | | 6| E| F| 511 | | 9| G| G| 512 | | 11| Z| Z| # violates col2_constraint 513 | +----+----+----+ 514 | ``` 515 | 516 | Run the `constraint_append` function: 517 | 518 | ```python 519 | mack.constraint_append(delta_table, append_df, quarantine_table) 520 | ``` 521 | 522 | Here's the ending result in delta_table: 523 | 524 | ``` 525 | 526 | +----+----+----+ 527 | |col1|col2|col3| 528 | +----+----+----+ 529 | | 1| A| B| 530 | | 2| C| D| 531 | | 3| E| F| 532 | | 4| A| B| 533 | | 5| C| D| 534 | | 6| E| F| 535 | | 9| G| G| 536 | +----+----+----+ 537 | ``` 538 | 539 | Here's the ending result in quarantine_table: 540 | 541 | ``` 542 | 543 | +----+----+----+ 544 | |col1|col2|col3| 545 | +----+----+----+ 546 | | | H| H| 547 | | 0| Z| Z| 548 | | 11| Z| Z| 549 | +----+----+----+ 550 | ``` 551 | 552 | Notice that the records that violated either of the constraints are appended to the quarantine table all other records are appended to the target table and the append has not failed. If a normal append operation was run, then it would have failed on the constraint violation. If `quarantine_table` is set to `None`, records that violated either of the constraints are simply thrown out. 
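`constraint_append` derives its filters from the check and not-null constraints that are already defined on the target table. As a reminder, here's a sketch of how the check constraints from the example above could be added with Delta Lake SQL (it assumes a `SparkSession` named `spark` and a target table stored at `path`):

```python
# add the check constraints used in the example to the target table
spark.sql(f"ALTER TABLE delta.`{path}` ADD CONSTRAINT col1_constraint CHECK (col1 > 0)")
spark.sql(f"ALTER TABLE delta.`{path}` ADD CONSTRAINT col2_constraint CHECK (col2 != 'Z')")
```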
553 | 554 | 555 | ## Rename a Delta Table 556 | 557 | This function is designed to rename a Delta table. It can operate either within a Databricks environment or with a standalone Spark session. 558 | 559 | ## Parameters: 560 | 561 | - `delta_table` (`DeltaTable`): An object representing the Delta table to be renamed. 562 | - `new_table_name` (`str`): The new name for the table. 563 | - `table_location` (`str`, optional): The file path where the table is stored. If not provided, the function attempts to deduce the location from the `DeltaTable` object. Defaults to `None`. 564 | - `databricks` (`bool`, optional): A flag indicating the function's operational environment. Set to `True` if running within Databricks, otherwise, `False`. Defaults to `False`. 565 | - `spark_session` (`pyspark.sql.SparkSession`, optional): The Spark session. This is required when `databricks` is set to `True`. Defaults to `None`. 566 | 567 | ## Returns: 568 | 569 | - `None` 570 | 571 | ## Raises: 572 | 573 | - `TypeError`: If the provided `delta_table` is not a DeltaTable object, or if `databricks` is set to `True` and `spark_session` is `None`. 574 | 575 | ## Example Usage: 576 | 577 | ```python 578 | rename_delta_table(existing_delta_table, "new_table_name") 579 | ``` 580 | 581 | 582 | ## Dictionary 583 | 584 | We're leveraging the following terminology defined [here](https://www.databasestar.com/database-keys/#:~:text=Natural%20key%3A%20an%20attribute%20that,can%20uniquely%20identify%20a%20row). 585 | 586 | * **Natural key:** an attribute that can uniquely identify a row, and exists in the real world. 587 | * **Surrogate key:** an attribute that can uniquely identify a row, and does not exist in the real world.
588 | * **Composite key:** more than one attribute that when combined can uniquely identify a row. 589 | * **Primary key:** the single unique identifier for the row. 590 | * **Candidate key:** an attribute that could be the primary key. 591 | * **Alternate key:** a candidate key that is not the primary key. 592 | * **Unique key:** an attribute that can be unique on the table. Can also be called an alternate key. 593 | * **Foreign key:** an attribute that is used to refer to another record in another table. 594 | 595 | ## Project maintainers 596 | 597 | * Matthew Powers aka [MrPowers](https://github.com/MrPowers) 598 | * Robert Kossendey aka [robertkossendey](https://github.com/robertkossendey) 599 | * Souvik Pratiher aka [souvik-databricks](https://github.com/souvik-databricks) 600 | 601 | ## Project philosophy 602 | 603 | The mack library is designed to make common Delta Lake data tasks easier. 604 | 605 | You don't need to use mack, of course. You can write the logic yourself. 606 | 607 | If you don't want to add a dependency to your project, you can also easily copy / paste the functions from mack. The functions in this library are intentionally designed to be easy to copy and paste. 608 | 609 | Let's look at some of the reasons you may want to add mack as a dependency. 610 | 611 | ### Exposing nice public interfaces 612 | 613 | The public interface (and only the public interface) is available via the `mack` namespace. 614 | 615 | When you run `import mack`, you can access the entirety of the public interface. No private implementation details are exposed in the `mack` namespace. 616 | 617 | ### Minimal dependencies 618 | 619 | Mack only depends on Spark & Delta Lake. No other dependencies will be added to Mack. 620 | 621 | Spark users leverage a variety of runtimes and it's not always easy to add a dependency. You can run `pip install mack` and won't have to worry about resolving a lot of dependency conflicts. You can also just attach a mack wheel file to a cluster to leverage the project. 622 | 623 | ### Provide best practices examples for the community 624 | 625 | Mack strives to be a good example codebase for the PySpark / Delta Lake community. 626 | 627 | There aren't a lot of open source Delta Lake projects. There are even fewer that use good software engineering practices like CI and unit testing. You can use mack to help guide your design decisions in proprietary code repos. 628 | 629 | ### Stable public interfaces and long-term support after 1.0 release 630 | 631 | Mack reserves the right to make breaking public interface changes before the 1.0 release. We'll always minimize breaking changes whenever possible. 632 | 633 | After the 1.0 release, Mack will strictly follow Semantic Versioning 2.0 and will only make breaking public interface changes in major releases. Hopefully 1.0 will be the only major release and there won't have to be any breaking changes. 634 | 635 | ### Code design 636 | 637 | Here are some of the code design principles used in Mack: 638 | 639 | * We avoid classes whenever possible. Classes make it harder to copy / paste little chunks of code into notebooks. It's good to [Stop Writing Classes](https://www.youtube.com/watch?v=o9pEzgHorH0). 640 | * We try to make functions that are easy to copy. We do this by limiting functions that depend on other functions or classes. We'd rather nest a single-use function in a public interface method than make it separate. 641 | * Develop and then abstract.
All code goes in a single file till the right abstractions become apparent. We'd rather have a large file than the wrong abstractions. 642 | 643 | ### Docker Environment 644 | The `Dockerfile` and `docker-compose` files provide a containerized way to run and develop 645 | with `mack`. 646 | 647 | - The first time run `docker build --tag=mack .` to build the image. 648 | - To execute the unit tests inside the `Docker` container, run `docker-compose up test` 649 | - To drop into the running `Docker` container to develop, run `docker run -it mack /bin/bash` 650 | 651 | ## Community 652 | 653 | ### Blogs 654 | 655 | - [Daniel Beach (Confessions of a Data Guy): Simplify Delta Lake Complexity with mack.](https://www.confessionsofadataguy.com/simplify-delta-lake-complexity-with-mack/) 656 | - [Bartosz Konieczny (waitingforcode): Simplified Delta Lake operations with Mack](https://www.waitingforcode.com/delta-lake/simplified-delta-lake-operations-mack/read) 657 | 658 | ### Videos 659 | 660 | - [GeekCoders on YouTube: How I use MACK Library in Delta Lake using Databricks/PySpark](https://www.youtube.com/watch?v=qRR5n6T2N_8) 661 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | test: 4 | image: "mack" 5 | volumes: 6 | - .:/app 7 | command: poetry run pytest 8 | -------------------------------------------------------------------------------- /docs/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages and navigation. 2 | 3 | Script was taken from 4 | https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages 5 | """ 6 | 7 | from pathlib import Path 8 | 9 | import mkdocs_gen_files 10 | 11 | nav = mkdocs_gen_files.Nav() 12 | 13 | for path in sorted(Path(".").rglob("mack/**/*.py")): 14 | module_path = path.relative_to(".").with_suffix("") 15 | doc_path = path.relative_to(".").with_suffix(".md") 16 | full_doc_path = Path("reference", doc_path) 17 | 18 | parts = tuple(module_path.parts) 19 | 20 | if parts[-1] == "__init__": 21 | parts = parts[:-1] 22 | doc_path = doc_path.with_name("index.md") 23 | full_doc_path = full_doc_path.with_name("index.md") 24 | elif parts[-1] == "__main__": 25 | continue 26 | 27 | nav[parts] = doc_path.as_posix() # 28 | 29 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 30 | ident = ".".join(parts) 31 | fd.write(f"::: {ident}") 32 | 33 | mkdocs_gen_files.set_edit_path(full_doc_path, path) 34 | 35 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: 36 | nav_file.writelines(nav.build_literate_nav()) 37 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | {!README.md!} 2 | -------------------------------------------------------------------------------- /images/mack.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MrPowers/mack/396f297d7e4db7feb1d1b1825c27d0076aa8e3e0/images/mack.jpg -------------------------------------------------------------------------------- /mack/__init__.py: -------------------------------------------------------------------------------- 1 | from itertools import combinations 2 | from typing import List, Union, Dict, Optional 3 | 4 | from delta import DeltaTable 5 | 
import pyspark 6 | from pyspark.sql.dataframe import DataFrame 7 | from pyspark.sql.functions import col, concat_ws, count, md5, row_number, max 8 | from pyspark.sql.window import Window 9 | 10 | 11 | def type_2_scd_upsert( 12 | delta_table: DeltaTable, 13 | updates_df: DataFrame, 14 | primary_key: str, 15 | attr_col_names: List[str], 16 | ) -> None: 17 | """ 18 | 19 | 20 | :param delta_table: 21 | :type delta_table: DeltaTable 22 | :param updates_df: 23 | :type updates_df: DataFrame 24 | :param primary_key: 25 | :type primary_key: str 26 | :param attr_col_names: 27 | :type attr_col_names: List[str] 28 | 29 | :returns: 30 | :rtype: None 31 | """ 32 | return type_2_scd_generic_upsert( 33 | delta_table, 34 | updates_df, 35 | primary_key, 36 | attr_col_names, 37 | "is_current", 38 | "effective_time", 39 | "end_time", 40 | ) 41 | 42 | 43 | def type_2_scd_generic_upsert( 44 | delta_table: DeltaTable, 45 | updates_df: DataFrame, 46 | primary_key: str, 47 | attr_col_names: List[str], 48 | is_current_col_name: str, 49 | effective_time_col_name: str, 50 | end_time_col_name: str, 51 | ) -> None: 52 | """ 53 | 54 | 55 | :param delta_table: 56 | :type delta_table: DeltaTable 57 | :param updates_df: 58 | :type updates_df: DataFrame 59 | :param primary_key: 60 | :type primary_key: str 61 | :param attr_col_names: 62 | :type attr_col_names: List[str] 63 | :param is_current_col_name: 64 | :type is_current_col_name: str 65 | :param effective_time_col_name: 66 | :type effective_time_col_name: str 67 | :param end_time_col_name: 68 | :type end_time_col_name: str 69 | 70 | :raises TypeError: Raises type error when required column names are not in the base table. 71 | :raises TypeError: Raises type error when required column names are not in the updates DataFrame.
72 | 73 | :returns: 74 | :rtype: None 75 | """ 76 | 77 | # validate the existing Delta table 78 | base_col_names = delta_table.toDF().columns 79 | required_base_col_names = ( 80 | [primary_key] 81 | + attr_col_names 82 | + [is_current_col_name, effective_time_col_name, end_time_col_name] 83 | ) 84 | if sorted(base_col_names) != sorted(required_base_col_names): 85 | raise TypeError( 86 | f"The base table has these columns {base_col_names!r}, but these columns are required {required_base_col_names!r}" 87 | ) 88 | # validate the updates DataFrame 89 | updates_col_names = updates_df.columns 90 | required_updates_col_names = ( 91 | [primary_key] + attr_col_names + [effective_time_col_name] 92 | ) 93 | if sorted(updates_col_names) != sorted(required_updates_col_names): 94 | raise TypeError( 95 | f"The updates DataFrame has these columns {updates_col_names!r}, but these columns are required {required_updates_col_names!r}" 96 | ) 97 | 98 | # perform the upsert 99 | updates_attrs = list( 100 | map(lambda attr: f"updates.{attr} <> base.{attr}", attr_col_names) 101 | ) 102 | updates_attrs = " OR ".join(updates_attrs) 103 | staged_updates_attrs = list( 104 | map(lambda attr: f"staged_updates.{attr} <> base.{attr}", attr_col_names) 105 | ) 106 | staged_updates_attrs = " OR ".join(staged_updates_attrs) 107 | staged_part_1 = ( 108 | updates_df.alias("updates") 109 | .join(delta_table.toDF().alias("base"), primary_key) 110 | .where(f"base.{is_current_col_name} = true AND ({updates_attrs})") 111 | .selectExpr("NULL as mergeKey", "updates.*") 112 | ) 113 | staged_part_2 = updates_df.selectExpr(f"{primary_key} as mergeKey", "*") 114 | staged_updates = staged_part_1.union(staged_part_2) 115 | thing = {} 116 | for attr in attr_col_names: 117 | thing[attr] = f"staged_updates.{attr}" 118 | thing2 = { 119 | primary_key: f"staged_updates.{primary_key}", 120 | is_current_col_name: "true", 121 | effective_time_col_name: f"staged_updates.{effective_time_col_name}", 122 | end_time_col_name: "null", 123 | } 124 | res_thing = {**thing, **thing2} 125 | res = ( 126 | delta_table.alias("base") 127 | .merge( 128 | source=staged_updates.alias("staged_updates"), 129 | condition=pyspark.sql.functions.expr(f"base.{primary_key} = mergeKey"), 130 | ) 131 | .whenMatchedUpdate( 132 | condition=f"base.{is_current_col_name} = true AND ({staged_updates_attrs})", 133 | set={ 134 | is_current_col_name: "false", 135 | end_time_col_name: f"staged_updates.{effective_time_col_name}", 136 | }, 137 | ) 138 | .whenNotMatchedInsert(values=res_thing) 139 | .execute() 140 | ) 141 | return res 142 | 143 | 144 | def kill_duplicates(delta_table: DeltaTable, duplication_columns: List[str]) -> None: 145 | """ 146 | 147 | 148 | :param delta_table: 149 | :type delta_table: DeltaTable 150 | :param duplication_columns: 151 | :type duplication_columns: List[str] 152 | 153 | :raises TypeError: Raises type error when input arguments have a invalid type or are empty. 154 | :raises TypeError: Raises type error when required columns are missing in the provided delta table. 
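
    Example — removes every row that is duplicated on the given columns (assumes an active SparkSession ``spark``; the path and column names are illustrative)::

        delta_table = DeltaTable.forPath(spark, "/tmp/some_delta_table")
        mack.kill_duplicates(delta_table, ["col2", "col3"])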
155 | """ 156 | if not isinstance(delta_table, DeltaTable): 157 | raise TypeError("An existing delta table must be specified.") 158 | 159 | if not duplication_columns or len(duplication_columns) == 0: 160 | raise TypeError("Duplication columns must be specified") 161 | 162 | data_frame = delta_table.toDF() 163 | 164 | # Make sure that all the required columns are present in the provided delta table 165 | append_data_columns = data_frame.columns 166 | for required_column in duplication_columns: 167 | if required_column not in append_data_columns: 168 | raise TypeError( 169 | f"The base table has these columns {append_data_columns!r}, but these columns are required {duplication_columns!r}" 170 | ) 171 | 172 | q = [] 173 | 174 | duplicate_records = ( 175 | data_frame.withColumn( 176 | "amount_of_records", 177 | count("*").over(Window.partitionBy(duplication_columns)), 178 | ) 179 | .filter(col("amount_of_records") > 1) 180 | .drop("amount_of_records") 181 | .distinct() 182 | ) 183 | 184 | for column in duplication_columns: 185 | q.append(f"old.{column} = new.{column}") 186 | 187 | q = " AND ".join(q) 188 | 189 | # Remove all the duplicate records 190 | delta_table.alias("old").merge( 191 | duplicate_records.alias("new"), q 192 | ).whenMatchedDelete().execute() 193 | 194 | 195 | def drop_duplicates_pkey( 196 | delta_table: DeltaTable, primary_key: str, duplication_columns: List[str] 197 | ) -> None: 198 | """ 199 | 200 | 201 | :param delta_table: 202 | :type delta_table: DeltaTable 203 | :param primary_key: 204 | :type primary_key: str 205 | :param duplication_columns: 206 | :type duplication_columns: List[str] 207 | 208 | :raises TypeError: Raises type error when input arguments have a invalid type, are missing or are empty. 209 | :raises TypeError: Raises type error when required columns are missing in the provided delta table. 
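
    Example — keeps the row with the lowest primary key in each duplicate group and deletes the rest (the column names are illustrative)::

        mack.drop_duplicates_pkey(delta_table, primary_key="col1", duplication_columns=["col2", "col3"])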
210 | """ 211 | if not isinstance(delta_table, DeltaTable): 212 | raise TypeError("An existing delta table must be specified.") 213 | 214 | if not primary_key: 215 | raise TypeError("A unique primary key must be specified.") 216 | 217 | if not duplication_columns or len(duplication_columns) == 0: 218 | raise TypeError("A duplication column must be specified.") 219 | 220 | if primary_key in duplication_columns: 221 | raise TypeError("Primary key must not be part of the duplication columns.") 222 | 223 | data_frame = delta_table.toDF() 224 | 225 | # Make sure that all the required columns are present in the provided delta table 226 | append_data_columns = data_frame.columns 227 | required_columns = [primary_key] + duplication_columns 228 | for required_column in required_columns: 229 | if required_column not in append_data_columns: 230 | raise TypeError( 231 | f"The base table has these columns {append_data_columns!r}, but these columns are required {required_columns!r}" 232 | ) 233 | 234 | q = [] 235 | 236 | duplicate_records = ( 237 | data_frame.withColumn( 238 | "row_number", 239 | row_number().over( 240 | Window().partitionBy(duplication_columns).orderBy(primary_key) 241 | ), 242 | ) 243 | .filter(col("row_number") > 1) 244 | .drop("row_number") 245 | .distinct() 246 | ) 247 | for column in required_columns: 248 | q.append(f"old.{column} = new.{column}") 249 | 250 | q = " AND ".join(q) 251 | 252 | # Remove all the duplicate records 253 | delta_table.alias("old").merge( 254 | duplicate_records.alias("new"), q 255 | ).whenMatchedDelete().execute() 256 | 257 | 258 | def drop_duplicates(delta_table: DeltaTable, duplication_columns: List[str]) -> None: 259 | """ 260 | 261 | 262 | :param delta_table: 263 | :type delta_table: DeltaTable 264 | :param duplication_columns: 265 | :type duplication_columns: List[str] 266 | 267 | :raises TypeError: Raises type error when input arguments have a invalid type, are missing or are empty. 268 | """ 269 | if not isinstance(delta_table, DeltaTable): 270 | raise TypeError("An existing delta table must be specified.") 271 | 272 | if not duplication_columns or len(duplication_columns) == 0: 273 | raise TypeError("A duplication column must be specified.") 274 | 275 | data_frame = delta_table.toDF() 276 | 277 | details = delta_table.detail().select("location").collect()[0] 278 | 279 | ( 280 | data_frame.drop_duplicates(duplication_columns) 281 | .write.format("delta") 282 | .mode("overwrite") 283 | .save(details["location"]) 284 | ) 285 | 286 | 287 | def copy_table( 288 | delta_table: DeltaTable, target_path: str = "", target_table: str = "" 289 | ) -> None: 290 | """ 291 | 292 | 293 | :param delta_table: 294 | :type delta_table: DeltaTable 295 | :param target_path: , defaults to empty string. 296 | :type target_path: str 297 | :param target_table: , defaults to empty string. 298 | :type target_table: str 299 | 300 | :raises TypeError: Raises type error when input arguments have a invalid type, are missing or are empty. 
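
    Example — copies the table's data, partitioning and table properties to a new location (assumes an active SparkSession ``spark``; the paths are illustrative)::

        delta_table = DeltaTable.forPath(spark, "/tmp/source_delta_table")
        mack.copy_table(delta_table, target_path="/tmp/copied_delta_table")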
301 | """ 302 | if not isinstance(delta_table, DeltaTable): 303 | raise TypeError("An existing delta table must be specified.") 304 | 305 | if not target_path and not target_table: 306 | raise TypeError("Either target_path or target_table must be specified.") 307 | 308 | origin_table = delta_table.toDF() 309 | 310 | details = delta_table.detail().select("partitionColumns", "properties").collect()[0] 311 | 312 | if target_table: 313 | ( 314 | origin_table.write.format("delta") 315 | .partitionBy(details["partitionColumns"]) 316 | .options(**details["properties"]) 317 | .saveAsTable(target_table) 318 | ) 319 | else: 320 | ( 321 | origin_table.write.format("delta") 322 | .partitionBy(details["partitionColumns"]) 323 | .options(**details["properties"]) 324 | .save(target_path) 325 | ) 326 | 327 | 328 | def validate_append( 329 | delta_table: DeltaTable, 330 | append_df: DataFrame, 331 | required_cols: List[str], 332 | optional_cols: List[str], 333 | ) -> None: 334 | """ 335 | 336 | 337 | :param delta_table: 338 | :type delta_table: DeltaTable 339 | :param append_df: 340 | :type append_df: DataFrame 341 | :param required_cols: 342 | :type required_cols: List[str] 343 | :param optional_cols: 344 | :type optional_cols: List[str] 345 | 346 | :raises TypeError: Raises type error when input arguments have a invalid type, are missing or are empty. 347 | :raises TypeError: Raises type error when required columns are missing in the provided delta table. 348 | :raises TypeError: Raises type error when column in append dataframe is not part of the original delta table.. 349 | """ 350 | if not isinstance(delta_table, DeltaTable): 351 | raise TypeError("An existing delta table must be specified.") 352 | 353 | if not isinstance(append_df, DataFrame): 354 | raise TypeError("You must provide a DataFrame that is to be appended.") 355 | 356 | append_data_columns = append_df.columns 357 | 358 | for required_column in required_cols: 359 | if required_column not in append_data_columns: 360 | raise TypeError( 361 | f"The base Delta table has these columns {append_data_columns!r}, but these columns are required {required_cols!r}" 362 | ) 363 | 364 | table_columns = delta_table.toDF().columns 365 | 366 | for column in append_data_columns: 367 | if column not in table_columns and column not in optional_cols: 368 | raise TypeError( 369 | f"The column {column!r} is not part of the current Delta table." 370 | + " If you want to add the column to the table you must set the optional_cols parameter." 371 | ) 372 | 373 | details = delta_table.detail().select("location").collect()[0] 374 | 375 | ( 376 | append_df.write.format("delta") 377 | .mode("append") 378 | .option("mergeSchema", "true") 379 | .save(details["location"]) 380 | ) 381 | 382 | 383 | def append_without_duplicates( 384 | delta_table: DeltaTable, append_df: DataFrame, p_keys: List[str] 385 | ) -> None: 386 | """ 387 | 388 | 389 | :param delta_table: 390 | :type delta_table: DeltaTable 391 | :param append_df: 392 | :type append_df: DataFrame 393 | :param p_keys: 394 | :type p_keys: List[str] 395 | 396 | :raises TypeError: Raises type error when input arguments have a invalid type. 
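
    Example — appends only rows whose key values are not already present in the table (``delta_table``, ``append_df`` and the column name are illustrative)::

        mack.append_without_duplicates(delta_table, append_df, ["col1"])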
397 | """ 398 | if not isinstance(delta_table, DeltaTable): 399 | raise TypeError("An existing delta table must be specified.") 400 | 401 | condition_columns = [] 402 | for column in p_keys: 403 | condition_columns.append(f"old.{column} = new.{column}") 404 | 405 | condition_columns = " AND ".join(condition_columns) 406 | 407 | deduplicated_append_df = append_df.drop_duplicates(p_keys) 408 | 409 | # Insert records without duplicates 410 | delta_table.alias("old").merge( 411 | deduplicated_append_df.alias("new"), condition_columns 412 | ).whenNotMatchedInsertAll().execute() 413 | 414 | 415 | def is_composite_key_candidate(delta_table: DeltaTable, cols: List[str]) -> bool: 416 | """ 417 | 418 | 419 | :param delta_table: 420 | :type delta_table: DeltaTable 421 | :param cols: 422 | :type cols: List[str] 423 | 424 | :raises TypeError: Raises type error when input arguments have a invalid type or are missing. 425 | :raises TypeError: Raises type error when required columns are not in dataframe columns. 426 | 427 | :returns: 428 | :rtype: bool 429 | """ 430 | if not isinstance(delta_table, DeltaTable): 431 | raise TypeError("An existing delta table must be specified.") 432 | 433 | if not cols or len(cols) == 0: 434 | raise TypeError("At least one column must be specified.") 435 | 436 | data_frame = delta_table.toDF() 437 | 438 | for required_column in cols: 439 | if required_column not in data_frame.columns: 440 | raise TypeError( 441 | f"The base table has these columns {data_frame.columns!r}, but these columns are required {cols!r}" 442 | ) 443 | 444 | duplicate_records = ( 445 | data_frame.withColumn( 446 | "amount_of_records", 447 | count("*").over(Window.partitionBy(cols)), 448 | ) 449 | .filter(col("amount_of_records") > 1) 450 | .drop("amount_of_records") 451 | ) 452 | 453 | if len(duplicate_records.take(1)) == 0: 454 | return True 455 | 456 | return False 457 | 458 | 459 | def delta_file_sizes(delta_table: DeltaTable) -> Dict[str, int]: 460 | """ 461 | 462 | 463 | :param delta_table: 464 | :type delta_table: DeltaTable 465 | 466 | :returns: 467 | :rtype: Dict[str, int] 468 | """ 469 | details = delta_table.detail().select("numFiles", "sizeInBytes").collect()[0] 470 | size_in_bytes, number_of_files = details["sizeInBytes"], details["numFiles"] 471 | average_file_size_in_bytes = round(size_in_bytes / number_of_files, 0) 472 | 473 | return { 474 | "size_in_bytes": size_in_bytes, 475 | "number_of_files": number_of_files, 476 | "average_file_size_in_bytes": average_file_size_in_bytes, 477 | } 478 | 479 | 480 | def show_delta_file_sizes( 481 | delta_table: DeltaTable, humanize_binary: bool = False 482 | ) -> None: 483 | """ 484 | 485 | 486 | :param delta_table: 487 | :type delta_table: DeltaTable 488 | :param humanize_binary: 489 | :type humanize_binary: bool 490 | 491 | :returns: 492 | :rtype: None 493 | """ 494 | details = delta_table.detail().select("numFiles", "sizeInBytes").collect()[0] 495 | size_in_bytes, number_of_files = details["sizeInBytes"], details["numFiles"] 496 | average_file_size_in_bytes = round(size_in_bytes / number_of_files, 0) 497 | 498 | if humanize_binary: 499 | humanized_size_in_bytes = humanize_bytes_binary(size_in_bytes) 500 | humanized_average_file_size = humanize_bytes_binary(average_file_size_in_bytes) 501 | else: 502 | humanized_size_in_bytes = humanize_bytes(size_in_bytes) 503 | humanized_average_file_size = humanize_bytes(average_file_size_in_bytes) 504 | humanized_number_of_files = f"{number_of_files:,}" 505 | 506 | print( 507 | f"The delta table contains 
{humanized_number_of_files} files with a size of {humanized_size_in_bytes}." 508 | + f" The average file size is {humanized_average_file_size}" 509 | ) 510 | 511 | 512 | def humanize_bytes(n: int) -> str: 513 | """ 514 | 515 | 516 | :param n: 517 | :type n: int 518 | 519 | :returns: 520 | :rtype: str 521 | """ 522 | kilobyte = 1000 523 | for prefix, k in ( 524 | ("PB", kilobyte**5), 525 | ("TB", kilobyte**4), 526 | ("GB", kilobyte**3), 527 | ("MB", kilobyte**2), 528 | ("kB", kilobyte**1), 529 | ): 530 | if n >= k * 0.9: 531 | return f"{n / k:.2f} {prefix}" 532 | return f"{n} B" 533 | 534 | 535 | def humanize_bytes_binary(n: int) -> str: 536 | """ 537 | 538 | 539 | :param n: 540 | :type n: int 541 | 542 | :returns: 543 | :rtype: str 544 | """ 545 | kibibyte = 1024 546 | for prefix, k in ( 547 | ("PB", kibibyte**5), 548 | ("TB", kibibyte**4), 549 | ("GB", kibibyte**3), 550 | ("MB", kibibyte**2), 551 | ("kB", kibibyte**1), 552 | ): 553 | if n >= k * 0.9: 554 | return f"{n / k:.2f} {prefix}" 555 | return f"{n} B" 556 | 557 | 558 | def find_composite_key_candidates( 559 | df: Union[DeltaTable, DataFrame], exclude_cols: List[str] = None 560 | ) -> List: 561 | """ 562 | 563 | 564 | :param df: 565 | :type df: DeltaTable or DataFrame 566 | :param exclude_cols: 567 | :type exclude_cols: List[str], defaults to None. 568 | 569 | :raises TypeError: Raises type error when no composite key can be found. 570 | 571 | :returns: 572 | :rtype: List 573 | """ 574 | if type(df) == DeltaTable: 575 | df = df.toDF() 576 | if exclude_cols is None: 577 | exclude_cols = [] 578 | df_col_excluded = df.drop(*exclude_cols) 579 | total_cols = len(df_col_excluded.columns) 580 | total_row_count = df_col_excluded.distinct().count() 581 | for n in range(1, len(df_col_excluded.columns) + 1): 582 | for c in combinations(df_col_excluded.columns, n): 583 | if df_col_excluded.select(*c).distinct().count() == total_row_count: 584 | if len(df_col_excluded.select(*c).columns) == total_cols: 585 | raise ValueError("No composite key candidates could be identified.") 586 | return list(df_col_excluded.select(*c).columns) 587 | 588 | 589 | def with_md5_cols( 590 | df: Union[DeltaTable, DataFrame], 591 | cols: List[str], 592 | output_col_name: Optional[str] = None, 593 | ) -> DataFrame: 594 | """ 595 | 596 | 597 | :param df: 598 | :type df: DeltaTable or DataFrame 599 | :param cols: 600 | :type cols: List[str] 601 | :param output_col_name: 602 | :type output_col_name: str, defaults to empty string. 603 | 604 | :raises TypeError: Raises type error when no composite key can be found. 
605 | 606 | :returns: 607 | :rtype: DataFrame 608 | """ 609 | if output_col_name is None: 610 | output_col_name = "_".join(["md5"] + cols) 611 | if type(df) == DeltaTable: 612 | df = df.toDF() 613 | return df.withColumn(output_col_name, md5(concat_ws("||", *cols))) 614 | 615 | 616 | def latest_version(delta_table: DeltaTable) -> float: 617 | """ 618 | 619 | 620 | :param delta_table: 621 | :type delta_table: DeltaTable 622 | 623 | :returns: 624 | :rtype: float 625 | """ 626 | version = delta_table.history().agg(max("version")).collect()[0][0] 627 | return version 628 | 629 | 630 | def constraint_append( 631 | delta_table: DeltaTable, append_df: DataFrame, quarantine_table: DeltaTable 632 | ): 633 | """ 634 | 635 | 636 | :param delta_table: 637 | :type delta_table: DeltaTable 638 | :param append_df: 639 | :type append_df: DataFrame 640 | :param quarantine_table: 641 | :type quarantine_table: DeltaTable 642 | 643 | :raises TypeError: Raises type error when input arguments have an invalid type. 644 | :raises TypeError: Raises type error when delta_table has no constraints. 645 | """ 646 | 647 | if not isinstance(delta_table, DeltaTable): 648 | raise TypeError("An existing delta table must be specified for delta_table.") 649 | 650 | if not isinstance(append_df, DataFrame): 651 | raise TypeError("You must provide a DataFrame that is to be appended.") 652 | 653 | if quarantine_table is not None and not isinstance(quarantine_table, DeltaTable): 654 | raise TypeError( 655 | "An existing delta table must be specified for quarantine_table." 656 | ) 657 | 658 | properties = delta_table.detail().select("properties").collect()[0]["properties"] 659 | check_constraints = [ 660 | v for k, v in properties.items() if k.startswith("delta.constraints") 661 | ] 662 | 663 | # add null checks 664 | fields = delta_table.toDF().schema.fields 665 | null_constraints = [ 666 | f"{field.name} is not null" for field in fields if not field.nullable 667 | ] 668 | 669 | constraints = check_constraints + null_constraints 670 | 671 | if not constraints: 672 | raise TypeError("There are no constraints present in the target delta table") 673 | 674 | target_details = delta_table.detail().select("location").collect()[0] 675 | if quarantine_table: 676 | quarantine_details = quarantine_table.detail().select("location").collect()[0] 677 | quarantine_df = append_df.filter( 678 | "not (" + " and ".join([c for c in constraints]) + ")" 679 | ) 680 | ( 681 | quarantine_df.write.format("delta") 682 | .mode("append") 683 | .option("mergeSchema", "true") 684 | .save(quarantine_details["location"]) 685 | ) 686 | 687 | filtered_df = append_df.filter(" and ".join([c for c in constraints])) 688 | ( 689 | filtered_df.write.format("delta") 690 | .mode("append") 691 | .option("mergeSchema", "true") 692 | .save(target_details["location"]) 693 | ) 694 | 695 | 696 | def rename_delta_table( 697 | delta_table: DeltaTable, 698 | new_table_name: str, 699 | table_location: str = None, 700 | databricks: bool = False, 701 | spark_session: pyspark.sql.SparkSession = None, 702 | ) -> None: 703 | """ 704 | Renames a Delta table to a new name. This function can be used in a Databricks environment or with a 705 | standalone Spark session. 706 | 707 | Parameters: 708 | delta_table (DeltaTable): The DeltaTable object representing the table to be renamed. 709 | new_table_name (str): The new name for the table. 710 | table_location (str, optional): The file path where the table is stored. Defaults to None. 
711 | If None, the function will attempt to determine the location from the DeltaTable object. 712 | databricks (bool, optional): A flag indicating whether the function is being run in a Databricks 713 | environment. Defaults to False. If True, a SparkSession must be provided. 714 | spark_session (pyspark.sql.SparkSession, optional): The Spark session. Defaults to None. 715 | Required if `databricks` is set to True. 716 | 717 | Returns: 718 | None 719 | 720 | Raises: 721 | TypeError: If the provided `delta_table` is not a DeltaTable object, or if `databricks` is True 722 | and `spark_session` is None. 723 | 724 | Example Usage: 725 | >>> rename_delta_table(existing_delta_table, "new_table_name") 726 | """ 727 | if not isinstance(delta_table, DeltaTable): 728 | raise TypeError("An existing delta table must be specified for delta_table.") 729 | if databricks and spark_session is None: 730 | raise TypeError("A spark session must be specified for databricks.") 731 | 732 | if databricks: 733 | spark_session.sql(f"ALTER TABLE {delta_table.name} RENAME TO {new_table_name}") 734 | else: 735 | delta_table.toDF().write.format("delta").mode("overwrite").saveAsTable( 736 | new_table_name 737 | ) 738 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Mack 2 | 3 | theme: 4 | name: "readthedocs" 5 | 6 | plugins: 7 | - search 8 | - gen-files: 9 | scripts: 10 | - docs/gen_ref_pages.py 11 | - section-index 12 | - mkdocstrings: 13 | default_handler: python 14 | handlers: 15 | python: 16 | options: 17 | docstring_style: sphinx 18 | docstring_options: 19 | show_if_no_docstring: true 20 | show_source: true 21 | 22 | nav: 23 | - Mack: index.md 24 | - API Docs: reference/SUMMARY.md 25 | 26 | markdown_extensions: 27 | - markdown_include.include: 28 | base_path: . 29 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
2 | 3 | [[package]] 4 | name = "attrs" 5 | version = "24.2.0" 6 | description = "Classes Without Boilerplate" 7 | optional = false 8 | python-versions = ">=3.7" 9 | files = [ 10 | {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, 11 | {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, 12 | ] 13 | 14 | [package.extras] 15 | benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] 16 | cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] 17 | dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] 18 | docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] 19 | tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] 20 | tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] 21 | 22 | [[package]] 23 | name = "cfgv" 24 | version = "3.4.0" 25 | description = "Validate configuration and produce human readable error messages." 26 | optional = false 27 | python-versions = ">=3.8" 28 | files = [ 29 | {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, 30 | {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, 31 | ] 32 | 33 | [[package]] 34 | name = "chispa" 35 | version = "0.9.2" 36 | description = "Pyspark test helper library" 37 | optional = false 38 | python-versions = ">=3.5" 39 | files = [ 40 | {file = "chispa-0.9.2-py3-none-any.whl", hash = "sha256:c6eae922f5c3ccd08f4dc3707202291bb249e68e319d0641795d92d80cfb1cad"}, 41 | {file = "chispa-0.9.2.tar.gz", hash = "sha256:621ad2e64fd27e7372c7b90ab2d5ad1f8dd69b737a3421ba5b6f84b113a18b84"}, 42 | ] 43 | 44 | [[package]] 45 | name = "click" 46 | version = "8.1.7" 47 | description = "Composable command line interface toolkit" 48 | optional = false 49 | python-versions = ">=3.7" 50 | files = [ 51 | {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, 52 | {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, 53 | ] 54 | 55 | [package.dependencies] 56 | colorama = {version = "*", markers = "platform_system == \"Windows\""} 57 | 58 | [[package]] 59 | name = "colorama" 60 | version = "0.4.6" 61 | description = "Cross-platform colored terminal text." 
62 | optional = false 63 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 64 | files = [ 65 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 66 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 67 | ] 68 | 69 | [[package]] 70 | name = "delta-spark" 71 | version = "3.2.0" 72 | description = "Python APIs for using Delta Lake with Apache Spark" 73 | optional = false 74 | python-versions = ">=3.6" 75 | files = [ 76 | {file = "delta-spark-3.2.0.tar.gz", hash = "sha256:641967828e47c64805f8c746513da80bea24b5f19b069cdcf64561cd3692e11d"}, 77 | {file = "delta_spark-3.2.0-py3-none-any.whl", hash = "sha256:c4ff3fa7218e58a702cb71eb64384b0005c4d6f0bbdd0fe0b38a53564d946e09"}, 78 | ] 79 | 80 | [package.dependencies] 81 | importlib-metadata = ">=1.0.0" 82 | pyspark = ">=3.5.0,<3.6.0" 83 | 84 | [[package]] 85 | name = "distlib" 86 | version = "0.3.8" 87 | description = "Distribution utilities" 88 | optional = false 89 | python-versions = "*" 90 | files = [ 91 | {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"}, 92 | {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, 93 | ] 94 | 95 | [[package]] 96 | name = "filelock" 97 | version = "3.15.4" 98 | description = "A platform independent file lock." 99 | optional = false 100 | python-versions = ">=3.8" 101 | files = [ 102 | {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"}, 103 | {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"}, 104 | ] 105 | 106 | [package.extras] 107 | docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] 108 | testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] 109 | typing = ["typing-extensions (>=4.8)"] 110 | 111 | [[package]] 112 | name = "ghp-import" 113 | version = "2.1.0" 114 | description = "Copy your docs directly to the gh-pages branch." 115 | optional = false 116 | python-versions = "*" 117 | files = [ 118 | {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, 119 | {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, 120 | ] 121 | 122 | [package.dependencies] 123 | python-dateutil = ">=2.8.1" 124 | 125 | [package.extras] 126 | dev = ["flake8", "markdown", "twine", "wheel"] 127 | 128 | [[package]] 129 | name = "griffe" 130 | version = "1.2.0" 131 | description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." 
132 | optional = false 133 | python-versions = ">=3.8" 134 | files = [ 135 | {file = "griffe-1.2.0-py3-none-any.whl", hash = "sha256:a8b2fcb1ecdc5a412e646b0b4375eb20a5d2eac3a11dd8c10c56967a4097663c"}, 136 | {file = "griffe-1.2.0.tar.gz", hash = "sha256:1c9f6ef7455930f3f9b0c4145a961c90385d1e2cbc496f7796fbff560ec60d31"}, 137 | ] 138 | 139 | [package.dependencies] 140 | colorama = ">=0.4" 141 | 142 | [[package]] 143 | name = "identify" 144 | version = "2.6.0" 145 | description = "File identification library for Python" 146 | optional = false 147 | python-versions = ">=3.8" 148 | files = [ 149 | {file = "identify-2.6.0-py2.py3-none-any.whl", hash = "sha256:e79ae4406387a9d300332b5fd366d8994f1525e8414984e1a59e058b2eda2dd0"}, 150 | {file = "identify-2.6.0.tar.gz", hash = "sha256:cb171c685bdc31bcc4c1734698736a7d5b6c8bf2e0c15117f4d469c8640ae5cf"}, 151 | ] 152 | 153 | [package.extras] 154 | license = ["ukkonen"] 155 | 156 | [[package]] 157 | name = "importlib-metadata" 158 | version = "8.4.0" 159 | description = "Read metadata from Python packages" 160 | optional = false 161 | python-versions = ">=3.8" 162 | files = [ 163 | {file = "importlib_metadata-8.4.0-py3-none-any.whl", hash = "sha256:66f342cc6ac9818fc6ff340576acd24d65ba0b3efabb2b4ac08b598965a4a2f1"}, 164 | {file = "importlib_metadata-8.4.0.tar.gz", hash = "sha256:9a547d3bc3608b025f93d403fdd1aae741c24fbb8314df4b155675742ce303c5"}, 165 | ] 166 | 167 | [package.dependencies] 168 | zipp = ">=0.5" 169 | 170 | [package.extras] 171 | doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 172 | perf = ["ipython"] 173 | test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] 174 | 175 | [[package]] 176 | name = "iniconfig" 177 | version = "2.0.0" 178 | description = "brain-dead simple config-ini parsing" 179 | optional = false 180 | python-versions = ">=3.7" 181 | files = [ 182 | {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, 183 | {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, 184 | ] 185 | 186 | [[package]] 187 | name = "jinja2" 188 | version = "3.1.4" 189 | description = "A very fast and expressive template engine." 190 | optional = false 191 | python-versions = ">=3.7" 192 | files = [ 193 | {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, 194 | {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, 195 | ] 196 | 197 | [package.dependencies] 198 | MarkupSafe = ">=2.0" 199 | 200 | [package.extras] 201 | i18n = ["Babel (>=2.7)"] 202 | 203 | [[package]] 204 | name = "markdown" 205 | version = "3.7" 206 | description = "Python implementation of John Gruber's Markdown." 
207 | optional = false 208 | python-versions = ">=3.8" 209 | files = [ 210 | {file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"}, 211 | {file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"}, 212 | ] 213 | 214 | [package.extras] 215 | docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] 216 | testing = ["coverage", "pyyaml"] 217 | 218 | [[package]] 219 | name = "markdown-include" 220 | version = "0.8.1" 221 | description = "A Python-Markdown extension which provides an 'include' function" 222 | optional = false 223 | python-versions = ">=3.7" 224 | files = [ 225 | {file = "markdown-include-0.8.1.tar.gz", hash = "sha256:1d0623e0fc2757c38d35df53752768356162284259d259c486b4ab6285cdbbe3"}, 226 | {file = "markdown_include-0.8.1-py3-none-any.whl", hash = "sha256:32f0635b9cfef46997b307e2430022852529f7a5b87c0075c504283e7cc7db53"}, 227 | ] 228 | 229 | [package.dependencies] 230 | markdown = ">=3.0" 231 | 232 | [package.extras] 233 | tests = ["pytest"] 234 | 235 | [[package]] 236 | name = "markupsafe" 237 | version = "2.1.5" 238 | description = "Safely add untrusted strings to HTML/XML markup." 239 | optional = false 240 | python-versions = ">=3.7" 241 | files = [ 242 | {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, 243 | {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, 244 | {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, 245 | {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, 246 | {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, 247 | {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, 248 | {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, 249 | {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, 250 | {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, 251 | {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, 252 | {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, 253 | {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, 254 | {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, 255 | {file = 
"MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, 256 | {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, 257 | {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, 258 | {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, 259 | {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, 260 | {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, 261 | {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, 262 | {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, 263 | {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, 264 | {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, 265 | {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, 266 | {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, 267 | {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, 268 | {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, 269 | {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, 270 | {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, 271 | {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, 272 | {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, 273 | {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, 274 | {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, 275 | {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, 276 | {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, 277 | {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, 278 | {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, 279 | {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, 280 | {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, 281 | {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, 282 | {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, 283 | {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, 284 | {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, 285 | {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, 286 | {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, 287 | {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, 288 | {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, 289 | {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, 290 | {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, 291 | {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, 292 | {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, 293 | {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, 294 | {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, 295 | {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, 296 | {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, 297 | {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, 298 | {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, 299 | {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, 300 | {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, 301 | {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, 302 | ] 303 | 304 | [[package]] 305 | name = "mergedeep" 306 | version = "1.3.4" 307 | description = "A deep merge function for 🐍." 308 | optional = false 309 | python-versions = ">=3.6" 310 | files = [ 311 | {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"}, 312 | {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"}, 313 | ] 314 | 315 | [[package]] 316 | name = "mkdocs" 317 | version = "1.6.1" 318 | description = "Project documentation with Markdown." 319 | optional = false 320 | python-versions = ">=3.8" 321 | files = [ 322 | {file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"}, 323 | {file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"}, 324 | ] 325 | 326 | [package.dependencies] 327 | click = ">=7.0" 328 | colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} 329 | ghp-import = ">=1.0" 330 | jinja2 = ">=2.11.1" 331 | markdown = ">=3.3.6" 332 | markupsafe = ">=2.0.1" 333 | mergedeep = ">=1.3.4" 334 | mkdocs-get-deps = ">=0.2.0" 335 | packaging = ">=20.5" 336 | pathspec = ">=0.11.1" 337 | pyyaml = ">=5.1" 338 | pyyaml-env-tag = ">=0.1" 339 | watchdog = ">=2.0" 340 | 341 | [package.extras] 342 | i18n = ["babel (>=2.9.0)"] 343 | min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"] 344 | 345 | [[package]] 346 | name = "mkdocs-autorefs" 347 | version = "1.2.0" 348 | description = "Automatically link across pages in MkDocs." 
349 | optional = false 350 | python-versions = ">=3.8" 351 | files = [ 352 | {file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"}, 353 | {file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"}, 354 | ] 355 | 356 | [package.dependencies] 357 | Markdown = ">=3.3" 358 | markupsafe = ">=2.0.1" 359 | mkdocs = ">=1.1" 360 | 361 | [[package]] 362 | name = "mkdocs-gen-files" 363 | version = "0.4.0" 364 | description = "MkDocs plugin to programmatically generate documentation pages during the build" 365 | optional = false 366 | python-versions = ">=3.7,<4.0" 367 | files = [ 368 | {file = "mkdocs-gen-files-0.4.0.tar.gz", hash = "sha256:377bff8ee8e93515916689f483d971643f83a94eed7e92318854da8f344f0163"}, 369 | {file = "mkdocs_gen_files-0.4.0-py3-none-any.whl", hash = "sha256:3241a4c947ecd11763ca77cc645015305bf71a0e1b9b886801c114fcf9971e71"}, 370 | ] 371 | 372 | [package.dependencies] 373 | mkdocs = ">=1.0.3,<2.0.0" 374 | 375 | [[package]] 376 | name = "mkdocs-get-deps" 377 | version = "0.2.0" 378 | description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file" 379 | optional = false 380 | python-versions = ">=3.8" 381 | files = [ 382 | {file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"}, 383 | {file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"}, 384 | ] 385 | 386 | [package.dependencies] 387 | mergedeep = ">=1.3.4" 388 | platformdirs = ">=2.2.0" 389 | pyyaml = ">=5.1" 390 | 391 | [[package]] 392 | name = "mkdocs-literate-nav" 393 | version = "0.6.1" 394 | description = "MkDocs plugin to specify the navigation in Markdown instead of YAML" 395 | optional = false 396 | python-versions = ">=3.7" 397 | files = [ 398 | {file = "mkdocs_literate_nav-0.6.1-py3-none-any.whl", hash = "sha256:e70bdc4a07050d32da79c0b697bd88e9a104cf3294282e9cb20eec94c6b0f401"}, 399 | {file = "mkdocs_literate_nav-0.6.1.tar.gz", hash = "sha256:78a7ab6d878371728acb0cdc6235c9b0ffc6e83c997b037f4a5c6ff7cef7d759"}, 400 | ] 401 | 402 | [package.dependencies] 403 | mkdocs = ">=1.0.3" 404 | 405 | [[package]] 406 | name = "mkdocs-section-index" 407 | version = "0.3.9" 408 | description = "MkDocs plugin to allow clickable sections that lead to an index page" 409 | optional = false 410 | python-versions = ">=3.8" 411 | files = [ 412 | {file = "mkdocs_section_index-0.3.9-py3-none-any.whl", hash = "sha256:5e5eb288e8d7984d36c11ead5533f376fdf23498f44e903929d72845b24dfe34"}, 413 | {file = "mkdocs_section_index-0.3.9.tar.gz", hash = "sha256:b66128d19108beceb08b226ee1ba0981840d14baf8a652b6c59e650f3f92e4f8"}, 414 | ] 415 | 416 | [package.dependencies] 417 | mkdocs = ">=1.2" 418 | 419 | [[package]] 420 | name = "mkdocstrings" 421 | version = "0.26.0" 422 | description = "Automatic documentation from sources, for MkDocs." 
423 | optional = false 424 | python-versions = ">=3.8" 425 | files = [ 426 | {file = "mkdocstrings-0.26.0-py3-none-any.whl", hash = "sha256:1aa227fe94f88e80737d37514523aacd473fc4b50a7f6852ce41447ab23f2654"}, 427 | {file = "mkdocstrings-0.26.0.tar.gz", hash = "sha256:ff9d0de28c8fa877ed9b29a42fe407cfe6736d70a1c48177aa84fcc3dc8518cd"}, 428 | ] 429 | 430 | [package.dependencies] 431 | click = ">=7.0" 432 | Jinja2 = ">=2.11.1" 433 | Markdown = ">=3.6" 434 | MarkupSafe = ">=1.1" 435 | mkdocs = ">=1.4" 436 | mkdocs-autorefs = ">=1.2" 437 | platformdirs = ">=2.2" 438 | pymdown-extensions = ">=6.3" 439 | 440 | [package.extras] 441 | crystal = ["mkdocstrings-crystal (>=0.3.4)"] 442 | python = ["mkdocstrings-python (>=0.5.2)"] 443 | python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] 444 | 445 | [[package]] 446 | name = "mkdocstrings-python" 447 | version = "0.8.3" 448 | description = "A Python handler for mkdocstrings." 449 | optional = false 450 | python-versions = ">=3.7" 451 | files = [ 452 | {file = "mkdocstrings-python-0.8.3.tar.gz", hash = "sha256:9ae473f6dc599339b09eee17e4d2b05d6ac0ec29860f3fc9b7512d940fc61adf"}, 453 | {file = "mkdocstrings_python-0.8.3-py3-none-any.whl", hash = "sha256:4e6e1cd6f37a785de0946ced6eb846eb2f5d891ac1cc2c7b832943d3529087a7"}, 454 | ] 455 | 456 | [package.dependencies] 457 | griffe = ">=0.24" 458 | mkdocstrings = ">=0.19" 459 | 460 | [[package]] 461 | name = "nodeenv" 462 | version = "1.9.1" 463 | description = "Node.js virtual environment builder" 464 | optional = false 465 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 466 | files = [ 467 | {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, 468 | {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, 469 | ] 470 | 471 | [[package]] 472 | name = "packaging" 473 | version = "24.1" 474 | description = "Core utilities for Python packages" 475 | optional = false 476 | python-versions = ">=3.8" 477 | files = [ 478 | {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, 479 | {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, 480 | ] 481 | 482 | [[package]] 483 | name = "pathspec" 484 | version = "0.12.1" 485 | description = "Utility library for gitignore style pattern matching of file paths." 486 | optional = false 487 | python-versions = ">=3.8" 488 | files = [ 489 | {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, 490 | {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, 491 | ] 492 | 493 | [[package]] 494 | name = "platformdirs" 495 | version = "4.2.2" 496 | description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
497 | optional = false 498 | python-versions = ">=3.8" 499 | files = [ 500 | {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"}, 501 | {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"}, 502 | ] 503 | 504 | [package.extras] 505 | docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] 506 | test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] 507 | type = ["mypy (>=1.8)"] 508 | 509 | [[package]] 510 | name = "pluggy" 511 | version = "1.5.0" 512 | description = "plugin and hook calling mechanisms for python" 513 | optional = false 514 | python-versions = ">=3.8" 515 | files = [ 516 | {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, 517 | {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, 518 | ] 519 | 520 | [package.extras] 521 | dev = ["pre-commit", "tox"] 522 | testing = ["pytest", "pytest-benchmark"] 523 | 524 | [[package]] 525 | name = "pre-commit" 526 | version = "2.21.0" 527 | description = "A framework for managing and maintaining multi-language pre-commit hooks." 528 | optional = false 529 | python-versions = ">=3.7" 530 | files = [ 531 | {file = "pre_commit-2.21.0-py2.py3-none-any.whl", hash = "sha256:e2f91727039fc39a92f58a588a25b87f936de6567eed4f0e673e0507edc75bad"}, 532 | {file = "pre_commit-2.21.0.tar.gz", hash = "sha256:31ef31af7e474a8d8995027fefdfcf509b5c913ff31f2015b4ec4beb26a6f658"}, 533 | ] 534 | 535 | [package.dependencies] 536 | cfgv = ">=2.0.0" 537 | identify = ">=1.0.0" 538 | nodeenv = ">=0.11.1" 539 | pyyaml = ">=5.1" 540 | virtualenv = ">=20.10.0" 541 | 542 | [[package]] 543 | name = "py4j" 544 | version = "0.10.9.7" 545 | description = "Enables Python programs to dynamically access arbitrary Java objects" 546 | optional = false 547 | python-versions = "*" 548 | files = [ 549 | {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, 550 | {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, 551 | ] 552 | 553 | [[package]] 554 | name = "pymdown-extensions" 555 | version = "10.9" 556 | description = "Extension pack for Python Markdown." 
557 | optional = false 558 | python-versions = ">=3.8" 559 | files = [ 560 | {file = "pymdown_extensions-10.9-py3-none-any.whl", hash = "sha256:d323f7e90d83c86113ee78f3fe62fc9dee5f56b54d912660703ea1816fed5626"}, 561 | {file = "pymdown_extensions-10.9.tar.gz", hash = "sha256:6ff740bcd99ec4172a938970d42b96128bdc9d4b9bcad72494f29921dc69b753"}, 562 | ] 563 | 564 | [package.dependencies] 565 | markdown = ">=3.6" 566 | pyyaml = "*" 567 | 568 | [package.extras] 569 | extra = ["pygments (>=2.12)"] 570 | 571 | [[package]] 572 | name = "pyspark" 573 | version = "3.5.0" 574 | description = "Apache Spark Python API" 575 | optional = false 576 | python-versions = ">=3.8" 577 | files = [ 578 | {file = "pyspark-3.5.0.tar.gz", hash = "sha256:d41a9b76bd2aca370a6100d075c029e22ba44c5940927877e9435a3a9c566558"}, 579 | ] 580 | 581 | [package.dependencies] 582 | py4j = "0.10.9.7" 583 | 584 | [package.extras] 585 | connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] 586 | ml = ["numpy (>=1.15)"] 587 | mllib = ["numpy (>=1.15)"] 588 | pandas-on-spark = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] 589 | sql = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] 590 | 591 | [[package]] 592 | name = "pytest" 593 | version = "7.2.0" 594 | description = "pytest: simple powerful testing with Python" 595 | optional = false 596 | python-versions = ">=3.7" 597 | files = [ 598 | {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, 599 | {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, 600 | ] 601 | 602 | [package.dependencies] 603 | attrs = ">=19.2.0" 604 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 605 | iniconfig = "*" 606 | packaging = "*" 607 | pluggy = ">=0.12,<2.0" 608 | 609 | [package.extras] 610 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 611 | 612 | [[package]] 613 | name = "pytest-describe" 614 | version = "1.0.0" 615 | description = "Describe-style plugin for pytest" 616 | optional = false 617 | python-versions = "*" 618 | files = [ 619 | {file = "pytest-describe-1.0.0.tar.gz", hash = "sha256:3e2ea0e77efa09edb98cf90423bf1da21a462ed90bd3120f8f98fe7519a167d5"}, 620 | {file = "pytest_describe-1.0.0-py2-none-any.whl", hash = "sha256:cc3862662faa5a6fb721927aaef46b46cf787e4a8163e5459fc8778e650fabad"}, 621 | {file = "pytest_describe-1.0.0-py3-none-any.whl", hash = "sha256:95fe78639d4d16c4a1e7d62c70f63030b217c08d2ee6dca49559fe6e730c6696"}, 622 | ] 623 | 624 | [package.dependencies] 625 | pytest = ">=2.6.0" 626 | 627 | [[package]] 628 | name = "python-dateutil" 629 | version = "2.9.0.post0" 630 | description = "Extensions to the standard Python datetime module" 631 | optional = false 632 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" 633 | files = [ 634 | {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, 635 | {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, 636 | ] 637 | 638 | [package.dependencies] 639 | six = ">=1.5" 640 | 641 | [[package]] 642 | name = "pyyaml" 643 | version = "6.0.2" 644 | description = "YAML parser and emitter for Python" 645 | optional = false 646 | python-versions = 
">=3.8" 647 | files = [ 648 | {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, 649 | {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, 650 | {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, 651 | {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, 652 | {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, 653 | {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, 654 | {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, 655 | {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, 656 | {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, 657 | {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, 658 | {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, 659 | {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, 660 | {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, 661 | {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, 662 | {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, 663 | {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, 664 | {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, 665 | {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, 666 | {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, 667 | {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, 668 | {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, 669 | {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, 670 | {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, 671 | {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, 672 | {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, 673 | {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, 674 | {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, 675 | {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, 676 | {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, 677 | {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, 678 | {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, 679 | {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, 680 | {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, 681 | {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, 682 | {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, 683 | {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, 684 | {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, 685 | {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, 686 | {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, 687 | {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, 688 | {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, 689 | {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, 690 | {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, 691 | {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, 692 | {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, 693 | {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, 694 | {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, 695 | {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, 696 | {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, 697 | {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, 698 | {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, 699 | {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, 700 | {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, 701 | ] 702 | 703 | [[package]] 704 | name = "pyyaml-env-tag" 705 | version = "0.1" 706 | description = "A custom YAML tag for referencing environment variables in YAML files. " 707 | optional = false 708 | python-versions = ">=3.6" 709 | files = [ 710 | {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, 711 | {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, 712 | ] 713 | 714 | [package.dependencies] 715 | pyyaml = "*" 716 | 717 | [[package]] 718 | name = "ruff" 719 | version = "0.0.254" 720 | description = "An extremely fast Python linter, written in Rust." 
721 | optional = false 722 | python-versions = ">=3.7" 723 | files = [ 724 | {file = "ruff-0.0.254-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:dd58c500d039fb381af8d861ef456c3e94fd6855c3d267d6c6718c9a9fe07be0"}, 725 | {file = "ruff-0.0.254-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:688379050ae05394a6f9f9c8471587fd5dcf22149bd4304a4ede233cc4ef89a1"}, 726 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac1429be6d8bd3db0bf5becac3a38bd56f8421447790c50599cd90fd53417ec4"}, 727 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:059a380c08e849b6f312479b18cc63bba2808cff749ad71555f61dd930e3c9a2"}, 728 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3f15d5d033fd3dcb85d982d6828ddab94134686fac2c02c13a8822aa03e1321"}, 729 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:8deba44fd563361c488dedec90dc330763ee0c01ba54e17df54ef5820079e7e0"}, 730 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ef20bf798ffe634090ad3dc2e8aa6a055f08c448810a2f800ab716cc18b80107"}, 731 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0deb1d7226ea9da9b18881736d2d96accfa7f328c67b7410478cc064ad1fa6aa"}, 732 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27d39d697fdd7df1f2a32c1063756ee269ad8d5345c471ee3ca450636d56e8c6"}, 733 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2fc21d060a3197ac463596a97d9b5db2d429395938b270ded61dd60f0e57eb21"}, 734 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f70dc93bc9db15cccf2ed2a831938919e3e630993eeea6aba5c84bc274237885"}, 735 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_i686.whl", hash = "sha256:09c764bc2bd80c974f7ce1f73a46092c286085355a5711126af351b9ae4bea0c"}, 736 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d4385cdd30153b7aa1d8f75dfd1ae30d49c918ead7de07e69b7eadf0d5538a1f"}, 737 | {file = "ruff-0.0.254-py3-none-win32.whl", hash = "sha256:c38291bda4c7b40b659e8952167f386e86ec29053ad2f733968ff1d78b4c7e15"}, 738 | {file = "ruff-0.0.254-py3-none-win_amd64.whl", hash = "sha256:e15742df0f9a3615fbdc1ee9a243467e97e75bf88f86d363eee1ed42cedab1ec"}, 739 | {file = "ruff-0.0.254-py3-none-win_arm64.whl", hash = "sha256:b435afc4d65591399eaf4b2af86e441a71563a2091c386cadf33eaa11064dc09"}, 740 | {file = "ruff-0.0.254.tar.gz", hash = "sha256:0eb66c9520151d3bd950ea43b3a088618a8e4e10a5014a72687881e6f3606312"}, 741 | ] 742 | 743 | [[package]] 744 | name = "six" 745 | version = "1.16.0" 746 | description = "Python 2 and 3 compatibility utilities" 747 | optional = false 748 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 749 | files = [ 750 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 751 | {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 752 | ] 753 | 754 | [[package]] 755 | name = "virtualenv" 756 | version = "20.26.3" 757 | description = "Virtual Python Environment builder" 758 | optional = false 759 | python-versions = ">=3.7" 760 | files = [ 761 | {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"}, 762 | {file = 
"virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"}, 763 | ] 764 | 765 | [package.dependencies] 766 | distlib = ">=0.3.7,<1" 767 | filelock = ">=3.12.2,<4" 768 | platformdirs = ">=3.9.1,<5" 769 | 770 | [package.extras] 771 | docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] 772 | test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] 773 | 774 | [[package]] 775 | name = "watchdog" 776 | version = "5.0.2" 777 | description = "Filesystem events monitoring" 778 | optional = false 779 | python-versions = ">=3.9" 780 | files = [ 781 | {file = "watchdog-5.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d961f4123bb3c447d9fcdcb67e1530c366f10ab3a0c7d1c0c9943050936d4877"}, 782 | {file = "watchdog-5.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72990192cb63872c47d5e5fefe230a401b87fd59d257ee577d61c9e5564c62e5"}, 783 | {file = "watchdog-5.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6bec703ad90b35a848e05e1b40bf0050da7ca28ead7ac4be724ae5ac2653a1a0"}, 784 | {file = "watchdog-5.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:dae7a1879918f6544201d33666909b040a46421054a50e0f773e0d870ed7438d"}, 785 | {file = "watchdog-5.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c4a440f725f3b99133de610bfec93d570b13826f89616377715b9cd60424db6e"}, 786 | {file = "watchdog-5.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8b2918c19e0d48f5f20df458c84692e2a054f02d9df25e6c3c930063eca64c1"}, 787 | {file = "watchdog-5.0.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:aa9cd6e24126d4afb3752a3e70fce39f92d0e1a58a236ddf6ee823ff7dba28ee"}, 788 | {file = "watchdog-5.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f627c5bf5759fdd90195b0c0431f99cff4867d212a67b384442c51136a098ed7"}, 789 | {file = "watchdog-5.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d7594a6d32cda2b49df3fd9abf9b37c8d2f3eab5df45c24056b4a671ac661619"}, 790 | {file = "watchdog-5.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba32efcccfe2c58f4d01115440d1672b4eb26cdd6fc5b5818f1fb41f7c3e1889"}, 791 | {file = "watchdog-5.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:963f7c4c91e3f51c998eeff1b3fb24a52a8a34da4f956e470f4b068bb47b78ee"}, 792 | {file = "watchdog-5.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8c47150aa12f775e22efff1eee9f0f6beee542a7aa1a985c271b1997d340184f"}, 793 | {file = "watchdog-5.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:14dd4ed023d79d1f670aa659f449bcd2733c33a35c8ffd88689d9d243885198b"}, 794 | {file = "watchdog-5.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b84bff0391ad4abe25c2740c7aec0e3de316fdf7764007f41e248422a7760a7f"}, 795 | {file = "watchdog-5.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e8d5ff39f0a9968952cce548e8e08f849141a4fcc1290b1c17c032ba697b9d7"}, 796 | {file = "watchdog-5.0.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fb223456db6e5f7bd9bbd5cd969f05aae82ae21acc00643b60d81c770abd402b"}, 797 | {file = "watchdog-5.0.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9814adb768c23727a27792c77812cf4e2fd9853cd280eafa2bcfa62a99e8bd6e"}, 798 | 
{file = "watchdog-5.0.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:901ee48c23f70193d1a7bc2d9ee297df66081dd5f46f0ca011be4f70dec80dab"}, 799 | {file = "watchdog-5.0.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:638bcca3d5b1885c6ec47be67bf712b00a9ab3d4b22ec0881f4889ad870bc7e8"}, 800 | {file = "watchdog-5.0.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5597c051587f8757798216f2485e85eac583c3b343e9aa09127a3a6f82c65ee8"}, 801 | {file = "watchdog-5.0.2-py3-none-manylinux2014_armv7l.whl", hash = "sha256:53ed1bf71fcb8475dd0ef4912ab139c294c87b903724b6f4a8bd98e026862e6d"}, 802 | {file = "watchdog-5.0.2-py3-none-manylinux2014_i686.whl", hash = "sha256:29e4a2607bd407d9552c502d38b45a05ec26a8e40cc7e94db9bb48f861fa5abc"}, 803 | {file = "watchdog-5.0.2-py3-none-manylinux2014_ppc64.whl", hash = "sha256:b6dc8f1d770a8280997e4beae7b9a75a33b268c59e033e72c8a10990097e5fde"}, 804 | {file = "watchdog-5.0.2-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:d2ab34adc9bf1489452965cdb16a924e97d4452fcf88a50b21859068b50b5c3b"}, 805 | {file = "watchdog-5.0.2-py3-none-manylinux2014_s390x.whl", hash = "sha256:7d1aa7e4bb0f0c65a1a91ba37c10e19dabf7eaaa282c5787e51371f090748f4b"}, 806 | {file = "watchdog-5.0.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:726eef8f8c634ac6584f86c9c53353a010d9f311f6c15a034f3800a7a891d941"}, 807 | {file = "watchdog-5.0.2-py3-none-win32.whl", hash = "sha256:bda40c57115684d0216556671875e008279dea2dc00fcd3dde126ac8e0d7a2fb"}, 808 | {file = "watchdog-5.0.2-py3-none-win_amd64.whl", hash = "sha256:d010be060c996db725fbce7e3ef14687cdcc76f4ca0e4339a68cc4532c382a73"}, 809 | {file = "watchdog-5.0.2-py3-none-win_ia64.whl", hash = "sha256:3960136b2b619510569b90f0cd96408591d6c251a75c97690f4553ca88889769"}, 810 | {file = "watchdog-5.0.2.tar.gz", hash = "sha256:dcebf7e475001d2cdeb020be630dc5b687e9acdd60d16fea6bb4508e7b94cf76"}, 811 | ] 812 | 813 | [package.extras] 814 | watchmedo = ["PyYAML (>=3.10)"] 815 | 816 | [[package]] 817 | name = "zipp" 818 | version = "3.20.1" 819 | description = "Backport of pathlib-compatible object wrapper for zip files" 820 | optional = false 821 | python-versions = ">=3.8" 822 | files = [ 823 | {file = "zipp-3.20.1-py3-none-any.whl", hash = "sha256:9960cd8967c8f85a56f920d5d507274e74f9ff813a0ab8889a5b5be2daf44064"}, 824 | {file = "zipp-3.20.1.tar.gz", hash = "sha256:c22b14cc4763c5a5b04134207736c107db42e9d3ef2d9779d465f5f1bcba572b"}, 825 | ] 826 | 827 | [package.extras] 828 | check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] 829 | cover = ["pytest-cov"] 830 | doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 831 | enabler = ["pytest-enabler (>=2.2)"] 832 | test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] 833 | type = ["pytest-mypy"] 834 | 835 | [metadata] 836 | lock-version = "2.0" 837 | python-versions = "^3.11" 838 | content-hash = "b2bbb36ae36bc6bb07e5b595bbd95c8e0c2a980e2628710bfb908e946c2a0dbc" 839 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "mack" 3 | version = "0.5.0" 4 | description = "" 5 | authors = ["Matthew Powers "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | 11 | [tool.poetry.dev-dependencies] 12 | pre-commit = "^2.20.0" 13 | pyspark 
= "3.5.0" 14 | delta-spark = "3.2.0" 15 | pytest = "7.2.0" 16 | chispa = "0.9.2" 17 | pytest-describe = "^1.0.0" 18 | ruff = "^0.0.254" 19 | 20 | [tool.poetry.group.mkdocs] 21 | optional = true 22 | 23 | [tool.poetry.group.mkdocs.dependencies] 24 | mkdocstrings-python = "^0.8.3" 25 | mkdocs-gen-files = "^0.4.0" 26 | mkdocs-literate-nav = "^0.6.0" 27 | mkdocs-section-index = "^0.3.5" 28 | markdown-include = "^0.8.1" 29 | mkdocs = "^1.4.2" 30 | 31 | [build-system] 32 | requires = ["poetry-core"] 33 | build-backend = "poetry.core.masonry.api" 34 | 35 | [tool.black] 36 | include = '\.pyi?$' 37 | exclude = ''' 38 | /( 39 | \.git 40 | | \.hg 41 | | \.mypy_cache 42 | | \.tox 43 | | \.venv 44 | | _build 45 | | buck-out 46 | | build 47 | | dist 48 | )/ 49 | ''' 50 | 51 | [tool.ruff] 52 | line-length = 150 53 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MrPowers/mack/396f297d7e4db7feb1d1b1825c27d0076aa8e3e0/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_public_interface.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import chispa 3 | import pyspark 4 | from delta import DeltaTable, configure_spark_with_delta_pip 5 | from datetime import datetime as dt 6 | from pyspark.sql.types import ( 7 | StructType, 8 | StructField, 9 | StringType, 10 | IntegerType, 11 | BooleanType, 12 | DateType, 13 | TimestampType, 14 | ) 15 | import mack 16 | 17 | builder = ( 18 | pyspark.sql.SparkSession.builder.appName("MyApp") 19 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 20 | .config( 21 | "spark.sql.catalog.spark_catalog", 22 | "org.apache.spark.sql.delta.catalog.DeltaCatalog", 23 | ) 24 | .config("spark.sql.shuffle.partitions", "2") 25 | ) 26 | 27 | spark = configure_spark_with_delta_pip(builder).getOrCreate() 28 | 29 | 30 | # upsert 31 | def test_upserts_with_single_attribute(tmp_path): 32 | path = f"{tmp_path}/tmp/delta-upsert-single-attr" 33 | data2 = [ 34 | (1, "A", True, dt(2019, 1, 1), None), 35 | (2, "B", True, dt(2019, 1, 1), None), 36 | (4, "D", True, dt(2019, 1, 1), None), 37 | ] 38 | schema = StructType( 39 | [ 40 | StructField("pkey", IntegerType(), True), 41 | StructField("attr", StringType(), True), 42 | StructField("is_current", BooleanType(), True), 43 | StructField("effective_time", TimestampType(), True), 44 | StructField("end_time", TimestampType(), True), 45 | ] 46 | ) 47 | df = spark.createDataFrame(data=data2, schema=schema) 48 | df.write.format("delta").save(path) 49 | 50 | updates_data = [ 51 | (2, "Z", dt(2020, 1, 1)), # value to upsert 52 | (3, "C", dt(2020, 9, 15)), # new value 53 | ] 54 | updates_schema = StructType( 55 | [ 56 | StructField("pkey", IntegerType(), True), 57 | StructField("attr", StringType(), True), 58 | StructField("effective_time", TimestampType(), True), 59 | ] 60 | ) 61 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema) 62 | 63 | delta_table = DeltaTable.forPath(spark, path) 64 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr"]) 65 | 66 | actual_df = spark.read.format("delta").load(path) 67 | 68 | expected_df = spark.createDataFrame( 69 | [ 70 | (2, "B", False, dt(2019, 1, 1), dt(2020, 1, 1)), 71 | (3, "C", True, dt(2020, 9, 15), None), 72 | (2, "Z", True, dt(2020, 1, 1), None), 73 | (4, "D", True, 
dt(2019, 1, 1), None), 74 | (1, "A", True, dt(2019, 1, 1), None), 75 | ], 76 | schema, 77 | ) 78 | 79 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True) 80 | 81 | 82 | def test_errors_out_if_base_df_does_not_have_all_required_columns(tmp_path): 83 | path = f"{tmp_path}/tmp/delta-incomplete" 84 | data2 = [ 85 | ("A", True, dt(2019, 1, 1), None), 86 | ("B", True, dt(2019, 1, 1), None), 87 | ("D", True, dt(2019, 1, 1), None), 88 | ] 89 | schema = StructType( 90 | [ 91 | # pkey is missing from base! 92 | StructField("attr", StringType(), True), 93 | StructField("is_current", BooleanType(), True), 94 | StructField("effective_time", TimestampType(), True), 95 | StructField("end_time", TimestampType(), True), 96 | ] 97 | ) 98 | df = spark.createDataFrame(data=data2, schema=schema) 99 | df.write.format("delta").save(path) 100 | 101 | updates_data = [ 102 | (2, "Z", dt(2020, 1, 1)), # value to upsert 103 | (3, "C", dt(2020, 9, 15)), # new value 104 | ] 105 | updates_schema = StructType( 106 | [ 107 | StructField("pkey", IntegerType(), True), 108 | StructField("attr", StringType(), True), 109 | StructField("effective_time", TimestampType(), True), 110 | ] 111 | ) 112 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema) 113 | 114 | delta_table = DeltaTable.forPath(spark, path) 115 | with pytest.raises(TypeError): 116 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr"]) 117 | 118 | 119 | def test_errors_out_if_updates_table_does_not_contain_all_required_columns(tmp_path): 120 | path = f"{tmp_path}/tmp/delta-error-udpate-missing-col" 121 | data2 = [ 122 | (1, "A", True, dt(2019, 1, 1), None), 123 | (2, "B", True, dt(2019, 1, 1), None), 124 | (4, "D", True, dt(2019, 1, 1), None), 125 | ] 126 | schema = StructType( 127 | [ 128 | StructField("pkey", IntegerType(), True), 129 | StructField("attr", StringType(), True), 130 | StructField("is_current", BooleanType(), True), 131 | StructField("effective_time", TimestampType(), True), 132 | StructField("end_time", TimestampType(), True), 133 | ] 134 | ) 135 | df = spark.createDataFrame(data=data2, schema=schema) 136 | df.write.format("delta").save(path) 137 | 138 | updates_data = [ 139 | ("Z", dt(2020, 1, 1)), # value to upsert 140 | ("C", dt(2020, 9, 15)), # new value 141 | ] 142 | updates_schema = StructType( 143 | [ 144 | # pkey is missing from updates DataFrame 145 | StructField("attr", StringType(), True), 146 | StructField("effective_time", TimestampType(), True), 147 | ] 148 | ) 149 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema) 150 | 151 | delta_table = DeltaTable.forPath(spark, path) 152 | with pytest.raises(TypeError): 153 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr"]) 154 | 155 | 156 | def test_upserts_based_on_multiple_attributes(tmp_path): 157 | path = f"{tmp_path}/tmp/delta-upsert-multiple-attr" 158 | data2 = [ 159 | (1, "A", "A", True, dt(2019, 1, 1), None), 160 | (2, "B", "B", True, dt(2019, 1, 1), None), 161 | (4, "D", "D", True, dt(2019, 1, 1), None), 162 | ] 163 | schema = StructType( 164 | [ 165 | StructField("pkey", IntegerType(), True), 166 | StructField("attr1", StringType(), True), 167 | StructField("attr2", StringType(), True), 168 | StructField("is_current", BooleanType(), True), 169 | StructField("effective_time", TimestampType(), True), 170 | StructField("end_time", TimestampType(), True), 171 | ] 172 | ) 173 | df = spark.createDataFrame(data=data2, schema=schema) 174 | df.write.format("delta").save(path) 175 | 176 | 
updates_data = [ 177 | (2, "Z", None, dt(2020, 1, 1)), # value to upsert 178 | (3, "C", "C", dt(2020, 9, 15)), # new value 179 | ] 180 | updates_schema = StructType( 181 | [ 182 | StructField("pkey", IntegerType(), True), 183 | StructField("attr1", StringType(), True), 184 | StructField("attr2", StringType(), True), 185 | StructField("effective_time", TimestampType(), True), 186 | ] 187 | ) 188 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema) 189 | 190 | delta_table = DeltaTable.forPath(spark, path) 191 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr1", "attr2"]) 192 | 193 | actual_df = spark.read.format("delta").load(path) 194 | 195 | expected_df = spark.createDataFrame( 196 | [ 197 | (2, "B", "B", False, dt(2019, 1, 1), dt(2020, 1, 1)), 198 | (3, "C", "C", True, dt(2020, 9, 15), None), 199 | (2, "Z", None, True, dt(2020, 1, 1), None), 200 | (4, "D", "D", True, dt(2019, 1, 1), None), 201 | (1, "A", "A", True, dt(2019, 1, 1), None), 202 | ], 203 | schema, 204 | ) 205 | 206 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True) 207 | 208 | 209 | # def describe_type_2_scd_generic_upsert(): 210 | # type_2_scd_generic_upsert 211 | def test_upserts_based_on_date_columns(tmp_path): 212 | path = f"{tmp_path}/tmp/delta-upsert-date" 213 | # // create Delta Lake 214 | data2 = [ 215 | (1, "A", True, dt(2019, 1, 1), None), 216 | (2, "B", True, dt(2019, 1, 1), None), 217 | (4, "D", True, dt(2019, 1, 1), None), 218 | ] 219 | 220 | schema = StructType( 221 | [ 222 | StructField("pkey", IntegerType(), True), 223 | StructField("attr", StringType(), True), 224 | StructField("cur", BooleanType(), True), 225 | StructField("effective_date", DateType(), True), 226 | StructField("end_date", DateType(), True), 227 | ] 228 | ) 229 | 230 | df = spark.createDataFrame(data=data2, schema=schema) 231 | df.write.format("delta").save(path) 232 | 233 | # create updates DF 234 | updates_df = spark.createDataFrame( 235 | [ 236 | (3, "C", dt(2020, 9, 15)), # new value 237 | (2, "Z", dt(2020, 1, 1)), # value to upsert 238 | ] 239 | ).toDF("pkey", "attr", "effective_date") 240 | 241 | # perform upsert 242 | delta_table = DeltaTable.forPath(spark, path) 243 | mack.type_2_scd_generic_upsert( 244 | delta_table, updates_df, "pkey", ["attr"], "cur", "effective_date", "end_date" 245 | ) 246 | 247 | actual_df = spark.read.format("delta").load(path) 248 | 249 | expected_df = spark.createDataFrame( 250 | [ 251 | (2, "B", False, dt(2019, 1, 1), dt(2020, 1, 1)), 252 | (3, "C", True, dt(2020, 9, 15), None), 253 | (2, "Z", True, dt(2020, 1, 1), None), 254 | (4, "D", True, dt(2019, 1, 1), None), 255 | (1, "A", True, dt(2019, 1, 1), None), 256 | ], 257 | schema, 258 | ) 259 | 260 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True) 261 | 262 | 263 | def test_upserts_based_on_version_number(tmp_path): 264 | path = f"{tmp_path}/tmp/delta-upsert-version" 265 | # create Delta Lake 266 | data2 = [ 267 | (1, "A", True, 1, None), 268 | (2, "B", True, 1, None), 269 | (4, "D", True, 1, None), 270 | ] 271 | 272 | schema = StructType( 273 | [ 274 | StructField("pkey", IntegerType(), True), 275 | StructField("attr", StringType(), True), 276 | StructField("is_current", BooleanType(), True), 277 | StructField("effective_ver", IntegerType(), True), 278 | StructField("end_ver", IntegerType(), True), 279 | ] 280 | ) 281 | 282 | df = spark.createDataFrame(data=data2, schema=schema) 283 | 284 | df.write.format("delta").save(path) 285 | 286 | # create updates DF 287 | 
updates_df = spark.createDataFrame( 288 | [ 289 | (2, "Z", 2), # value to upsert 290 | (3, "C", 3), # new value 291 | ] 292 | ).toDF("pkey", "attr", "effective_ver") 293 | 294 | # perform upsert 295 | delta_table = DeltaTable.forPath(spark, path) 296 | mack.type_2_scd_generic_upsert( 297 | delta_table, 298 | updates_df, 299 | "pkey", 300 | ["attr"], 301 | "is_current", 302 | "effective_ver", 303 | "end_ver", 304 | ) 305 | 306 | # show result 307 | res = spark.read.format("delta").load(path) 308 | 309 | expected_data = [ 310 | (2, "B", False, 1, 2), 311 | (3, "C", True, 3, None), 312 | (2, "Z", True, 2, None), 313 | (4, "D", True, 1, None), 314 | (1, "A", True, 1, None), 315 | ] 316 | 317 | expected = spark.createDataFrame(expected_data, schema) 318 | 319 | chispa.assert_df_equality(res, expected, ignore_row_order=True) 320 | 321 | 322 | def test_upserts_does_not_insert_duplicate(tmp_path): 323 | path = f"{tmp_path}/tmp/delta-no-duplicate" 324 | # create Delta Lake 325 | data2 = [ 326 | (1, "A", True, dt(2019, 1, 1), None), 327 | (2, "B", True, dt(2019, 1, 1), None), 328 | (4, "D", True, dt(2019, 1, 1), None), 329 | ] 330 | 331 | schema = StructType( 332 | [ 333 | StructField("pkey", IntegerType(), True), 334 | StructField("attr", StringType(), True), 335 | StructField("cur", BooleanType(), True), 336 | StructField("effective_date", DateType(), True), 337 | StructField("end_date", DateType(), True), 338 | ] 339 | ) 340 | 341 | df = spark.createDataFrame(data=data2, schema=schema) 342 | df.write.format("delta").save(path) 343 | 344 | # create updates DF 345 | updates_df = spark.createDataFrame( 346 | [ 347 | (1, "A", dt(2019, 1, 1)), # duplicate row 348 | ] 349 | ).toDF("pkey", "attr", "effective_date") 350 | 351 | # perform upsert 352 | delta_table = DeltaTable.forPath(spark, path) 353 | mack.type_2_scd_generic_upsert( 354 | delta_table, updates_df, "pkey", ["attr"], "cur", "effective_date", "end_date" 355 | ) 356 | 357 | actual_df = spark.read.format("delta").load(path) 358 | 359 | expected_df = spark.createDataFrame( 360 | [ 361 | (1, "A", True, dt(2019, 1, 1), None), 362 | (2, "B", True, dt(2019, 1, 1), None), 363 | (4, "D", True, dt(2019, 1, 1), None), 364 | ], 365 | schema, 366 | ) 367 | 368 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True) 369 | 370 | 371 | # def describe_kill_duplicates(): 372 | def test_kills_duplicates_in_a_delta_table(tmp_path): 373 | path = f"{tmp_path}/deduplicate1" 374 | data = [ 375 | (1, "A", "A"), # duplicate 376 | (2, "A", "B"), 377 | (3, "A", "A"), # duplicate 378 | (4, "A", "A"), # duplicate 379 | (5, "B", "B"), # duplicate 380 | (6, "D", "D"), 381 | (9, "B", "B"), # duplicate 382 | ] 383 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 384 | df.write.format("delta").save(path) 385 | 386 | delta_table = DeltaTable.forPath(spark, path) 387 | 388 | mack.kill_duplicates(delta_table, ["col3", "col2"]) 389 | 390 | res = spark.read.format("delta").load(path) 391 | 392 | expected_data = [ 393 | (2, "A", "B"), 394 | (6, "D", "D"), 395 | ] 396 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3"]) 397 | 398 | chispa.assert_df_equality(res, expected, ignore_row_order=True) 399 | 400 | 401 | def test_drop_duplicates_pkey_in_a_delta_table(tmp_path): 402 | path = f"{tmp_path}/drop_duplicates_pkey" 403 | data = [ 404 | (1, "A", "A", "C"), # duplicate 405 | (2, "A", "B", "C"), 406 | (3, "A", "A", "D"), # duplicate 407 | (4, "A", "A", "E"), # duplicate 408 | (5, "B", "B", "C"), # duplicate 409 | (6, "D", "D", 
"C"), 410 | (9, "B", "B", "E"), # duplicate 411 | ] 412 | df = spark.createDataFrame(data, ["col1", "col2", "col3", "col4"]) 413 | df.write.format("delta").save(path) 414 | 415 | delta_table = DeltaTable.forPath(spark, path) 416 | 417 | mack.drop_duplicates_pkey(delta_table, "col1", ["col2", "col3"]) 418 | 419 | res = spark.read.format("delta").load(path) 420 | 421 | expected_data = [ 422 | (1, "A", "A", "C"), 423 | (2, "A", "B", "C"), 424 | (5, "B", "B", "C"), 425 | (6, "D", "D", "C"), 426 | ] 427 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3", "col4"]) 428 | 429 | chispa.assert_df_equality(res, expected, ignore_row_order=True) 430 | 431 | 432 | def test_drop_duplicates_pkey_in_a_delta_table_no_duplication_cols(tmp_path): 433 | path = f"{tmp_path}/drop_duplicates_pkey_no_duplication_cols" 434 | data = [ 435 | (1, "A", "A", "C"), # duplicate 436 | (1, "A", "A", "C"), # duplicate 437 | (1, "A", "A", "C"), # duplicate 438 | (1, "A", "A", "C"), # duplicate 439 | ] 440 | df = spark.createDataFrame(data, ["col1", "col2", "col3", "col4"]) 441 | df.write.format("delta").save(path) 442 | 443 | delta_table = DeltaTable.forPath(spark, path) 444 | 445 | with pytest.raises(TypeError): 446 | mack.drop_duplicates_pkey(delta_table, "col1", []) 447 | 448 | 449 | def test_drop_duplicates_in_a_delta_table(tmp_path): 450 | path = f"{tmp_path}/drop_duplicates" 451 | data = [ 452 | (1, "A", "A", "C"), # duplicate 453 | (1, "A", "A", "C"), # duplicate 454 | (1, "A", "A", "C"), # duplicate 455 | (1, "A", "A", "C"), # duplicate 456 | ] 457 | df = spark.createDataFrame(data, ["col1", "col2", "col3", "col4"]) 458 | df.write.format("delta").save(path) 459 | 460 | delta_table = DeltaTable.forPath(spark, path) 461 | 462 | mack.drop_duplicates(delta_table, ["col1"]), 463 | 464 | res = spark.read.format("delta").load(path) 465 | 466 | expected_data = [ 467 | (1, "A", "A", "C"), 468 | ] 469 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3", "col4"]) 470 | 471 | chispa.assert_df_equality(res, expected, ignore_row_order=True) 472 | 473 | 474 | def test_copy_delta_table(tmp_path): 475 | path = f"{tmp_path}/copy_test_1" 476 | data = [ 477 | (1, "A", "A"), 478 | (2, "A", "B"), 479 | ] 480 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 481 | 482 | ( 483 | df.write.format("delta") 484 | .partitionBy(["col1"]) 485 | .option("delta.logRetentionDuration", "interval 30 days") 486 | .save(path) 487 | ) 488 | 489 | origin_table = DeltaTable.forPath(spark, path) 490 | origin_details = origin_table.detail().select("partitionColumns", "properties") 491 | 492 | mack.copy_table(origin_table, f"{tmp_path}/copy_test_2") 493 | 494 | copied_table = DeltaTable.forPath(spark, f"{tmp_path}/copy_test_2") 495 | copied_details = copied_table.detail().select("partitionColumns", "properties") 496 | 497 | chispa.assert_df_equality(origin_details, copied_details) 498 | chispa.assert_df_equality( 499 | origin_table.toDF(), copied_table.toDF(), ignore_row_order=True 500 | ) 501 | 502 | 503 | # append without duplicates 504 | def test_append_without_duplicates_single_column(tmp_path): 505 | path = f"{tmp_path}/append_without_duplicates" 506 | data = [ 507 | (1, "A", "B"), 508 | (2, "C", "D"), 509 | (3, "E", "F"), 510 | ] 511 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 512 | df.write.format("delta").save(path) 513 | 514 | delta_table = DeltaTable.forPath(spark, path) 515 | 516 | append_df = spark.createDataFrame( 517 | [ 518 | (2, "R", "T"), # duplicate 519 | (8, "A", "B"), 
520 | (8, "B", "C"), # duplicate 521 | (10, "X", "Y"), 522 | ], 523 | ["col1", "col2", "col3"], 524 | ) 525 | 526 | mack.append_without_duplicates(delta_table, append_df, ["col1"]) 527 | 528 | appended_data = spark.read.format("delta").load(path) 529 | 530 | expected_data = [ 531 | (1, "A", "B"), 532 | (2, "C", "D"), 533 | (3, "E", "F"), 534 | (8, "A", "B"), 535 | (10, "X", "Y"), 536 | ] 537 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3"]) 538 | chispa.assert_df_equality(appended_data, expected, ignore_row_order=True) 539 | 540 | 541 | def test_validate_append(tmp_path): 542 | path = f"{tmp_path}/validate_append" 543 | 544 | def append_fun(delta_table, append_df): 545 | mack.validate_append( 546 | delta_table, 547 | append_df, 548 | required_cols=["col1", "col2"], 549 | optional_cols=["col4"], 550 | ) 551 | 552 | # Create Delta table 553 | data = [ 554 | (1, "a", "A"), 555 | (2, "b", "B"), 556 | ] 557 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 558 | df.write.format("delta").save(path) 559 | 560 | # Demonstrate that certain DataFrames with optional columns can be appended 561 | delta_table = DeltaTable.forPath(spark, path) 562 | append_df = spark.createDataFrame( 563 | [ 564 | (3, "c", "cat"), 565 | (4, "d", "dog"), 566 | ], 567 | ["col1", "col2", "col4"], 568 | ) 569 | append_fun(delta_table, append_df) 570 | 571 | expected_data = [ 572 | (1, "a", "A", None), 573 | (2, "b", "B", None), 574 | (3, "c", None, "cat"), 575 | (4, "d", None, "dog"), 576 | ] 577 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3", "col4"]) 578 | chispa.assert_df_equality( 579 | spark.read.format("delta").load(path), expected, ignore_row_order=True 580 | ) 581 | 582 | # demonstrate that DataFrames with columns that are not on the accept list cannot be appended 583 | append_df = spark.createDataFrame( 584 | [ 585 | (4, "b", "A"), 586 | (5, "y", "C"), 587 | (6, "z", "D"), 588 | ], 589 | ["col1", "col2", "col5"], 590 | ) 591 | with pytest.raises(TypeError): 592 | mack.validate_append( 593 | delta_table, 594 | append_df, 595 | required_cols=["col1", "col2"], 596 | optional_cols=["col4"], 597 | ) 598 | 599 | # demonstrate that DataFrames with missing required columns cannot be appended 600 | append_df = spark.createDataFrame( 601 | [ 602 | (4, "A"), 603 | (5, "C"), 604 | (6, "D"), 605 | ], 606 | ["col1", "col4"], 607 | ) 608 | with pytest.raises(TypeError): 609 | mack.validate_append( 610 | delta_table, 611 | append_df, 612 | required_cols=["col1", "col2"], 613 | optional_cols=["col4"], 614 | ) 615 | 616 | 617 | def test_append_without_duplicates_multi_column(tmp_path): 618 | path = f"{tmp_path}/append_without_duplicates" 619 | data = [ 620 | (1, "a", "A"), 621 | (2, "b", "R"), 622 | (3, "c", "X"), 623 | ] 624 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 625 | df.write.format("delta").save(path) 626 | 627 | delta_table = DeltaTable.forPath(spark, path) 628 | 629 | append_data = spark.createDataFrame( 630 | [ 631 | (2, "b", "R"), # duplicate col1, col2 632 | (2, "x", "R"), # NOT duplicate col1, col2 633 | (8, "y", "F"), 634 | (10, "z", "U"), 635 | ], 636 | ["col1", "col2", "col3"], 637 | ) 638 | 639 | mack.append_without_duplicates(delta_table, append_data, ["col1", "col2"]) 640 | 641 | appended_data = spark.read.format("delta").load(path) 642 | 643 | expected_data = [ 644 | (1, "a", "A"), 645 | (2, "b", "R"), 646 | (2, "x", "R"), 647 | (3, "c", "X"), 648 | (8, "y", "F"), 649 | (10, "z", "U"), 650 | ] 651 | expected = 
spark.createDataFrame(expected_data, ["col1", "col2", "col3"]) 652 | chispa.assert_df_equality(appended_data, expected, ignore_row_order=True) 653 | 654 | 655 | def test_is_composite_key_candidate(tmp_path): 656 | path = f"{tmp_path}/is_composite_key_candidate" 657 | data = [ 658 | (1, "a", "A"), 659 | (2, "b", "R"), 660 | (2, "c", "D"), 661 | (3, "e", "F"), 662 | ] 663 | 664 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 665 | df.write.format("delta").save(path) 666 | 667 | delta_table = DeltaTable.forPath(spark, path) 668 | 669 | assert not mack.is_composite_key_candidate(delta_table, ["col1"]) 670 | assert mack.is_composite_key_candidate(delta_table, ["col1", "col2"]) 671 | 672 | 673 | def test_delta_file_sizes(tmp_path): 674 | path = f"{tmp_path}/delta_file_sizes" 675 | data = [ 676 | (1, "A", "A"), 677 | (2, "A", "B"), 678 | ] 679 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 680 | 681 | ( 682 | df.write.format("delta") 683 | .partitionBy(["col1"]) 684 | .option("delta.logRetentionDuration", "interval 30 days") 685 | .save(path) 686 | ) 687 | 688 | delta_table = DeltaTable.forPath(spark, path) 689 | 690 | result = mack.delta_file_sizes(delta_table) 691 | 692 | expected_result = { 693 | "size_in_bytes": 1320, 694 | "number_of_files": 2, 695 | "average_file_size_in_bytes": 660, 696 | } 697 | 698 | assert result == expected_result 699 | 700 | 701 | def test_show_delta_file_sizes(capfd, tmp_path): 702 | path = f"{tmp_path}/show_delta_file_sizes" 703 | data = [ 704 | (1, "A", "A"), 705 | (2, "A", "B"), 706 | ] 707 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 708 | 709 | (df.write.format("delta").partitionBy(["col1"]).save(path)) 710 | 711 | delta_table = DeltaTable.forPath(spark, path) 712 | 713 | mack.show_delta_file_sizes(delta_table) 714 | 715 | out, _ = capfd.readouterr() 716 | 717 | assert ( 718 | out 719 | == "The delta table contains 2 files with a size of 1.32 kB. 
The average file size is 660.0 B\n" 720 | ) 721 | 722 | 723 | def test_humanize_bytes_formats_nicely(): 724 | assert mack.humanize_bytes(12345678) == "12.35 MB" 725 | assert mack.humanize_bytes(1234567890) == "1.23 GB" 726 | assert mack.humanize_bytes(1234567890000) == "1.23 TB" 727 | assert mack.humanize_bytes(1234567890000000) == "1.23 PB" 728 | 729 | 730 | def test_humanize_bytes_binary_formats_nicely(): 731 | assert mack.humanize_bytes_binary(12345678) == "11.77 MB" 732 | assert mack.humanize_bytes_binary(1234567890) == "1.15 GB" 733 | assert mack.humanize_bytes_binary(1234567890000) == "1.12 TB" 734 | assert mack.humanize_bytes_binary(1234567890000000) == "1.10 PB" 735 | 736 | 737 | def test_find_composite_key(tmp_path): 738 | path = f"{tmp_path}/find_composite_key" 739 | data = [ 740 | (1, "a", "z"), 741 | (1, "a", "b"), 742 | (3, "c", "b"), 743 | ] 744 | df = spark.createDataFrame( 745 | data, 746 | [ 747 | "col1", 748 | "col2", 749 | "col3", 750 | ], 751 | ) 752 | df.write.format("delta").save(path) 753 | 754 | delta_table = DeltaTable.forPath(spark, path) 755 | 756 | composite_keys = mack.find_composite_key_candidates(delta_table) 757 | 758 | expected_keys = ["col1", "col3"] 759 | 760 | assert composite_keys == expected_keys 761 | 762 | 763 | def test_find_composite_key_with_value_error(tmp_path): 764 | path = f"{tmp_path}/find_composite_key" 765 | data = [ 766 | (1, "a", "A"), 767 | (2, "b", "R"), 768 | (2, "c", "D"), 769 | (3, "e", "F"), 770 | ] 771 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 772 | df.write.format("delta").save(path) 773 | 774 | delta_table = DeltaTable.forPath(spark, path) 775 | with pytest.raises( 776 | ValueError, match="No composite key candidates could be identified." 777 | ): 778 | mack.find_composite_key_candidates(delta_table, ["col2", "col3"]) 779 | 780 | 781 | def test_with_md5_cols(tmp_path): 782 | path = f"{tmp_path}/find_composite_key" 783 | data = [ 784 | (1, "a", None), 785 | (2, "b", "b"), 786 | (3, "c", "c"), 787 | ] 788 | df = spark.createDataFrame( 789 | data, 790 | [ 791 | "col1", 792 | "col2", 793 | "col3", 794 | ], 795 | ) 796 | df.write.format("delta").save(path) 797 | 798 | delta_table = DeltaTable.forPath(spark, path) 799 | with_md5 = mack.with_md5_cols(delta_table, ["col2", "col3"]) 800 | 801 | expected_data = [ 802 | (1, "a", None, "0cc175b9c0f1b6a831c399e269772661"), 803 | (2, "b", "b", "1eeaac3814eb80cc40efb005cf0b9141"), 804 | (3, "c", "c", "4e202f8309e7b00349c70845ab02fce9"), 805 | ] 806 | expected_df = spark.createDataFrame( 807 | expected_data, 808 | ["col1", "col2", "col3", "md5_col2_col3"], 809 | ) 810 | chispa.assert_df_equality( 811 | with_md5, expected_df, ignore_row_order=True, ignore_nullable=True 812 | ) 813 | 814 | 815 | def test_latest_version(tmp_path): 816 | path = f"{tmp_path}/latestversion" 817 | 818 | data = [ 819 | (1, "a", None), 820 | (2, "b", "b"), 821 | (3, "c", "c"), 822 | ] 823 | df = spark.createDataFrame( 824 | data, 825 | [ 826 | "col1", 827 | "col2", 828 | "col3", 829 | ], 830 | ) 831 | df.write.format("delta").save(path) 832 | 833 | # append the same dataframe twice more 834 | df.write.format("delta").mode("append").save(path) 835 | df.write.format("delta").mode("append").save(path) 836 | 837 | delta_table = DeltaTable.forPath(spark, path) 838 | latest_version = mack.latest_version(delta_table) 839 | assert latest_version == 2 840 | 841 | 842 | def test_constraint_append_no_constraint(tmp_path): 843 | 844 | target_path = f"{tmp_path}/constraint_append/target_table" 845 | quarantine_path =
f"{tmp_path}/constraint_append/quarantine_table" 846 | 847 | data = [ 848 | (1, "A", "B"), 849 | (2, "C", "D"), 850 | (3, "E", "F"), 851 | ] 852 | 853 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 854 | df.write.format("delta").save(target_path) 855 | 856 | df2 = spark.createDataFrame([], df.schema) 857 | df2.write.format("delta").save(quarantine_path) 858 | 859 | target_table = DeltaTable.forPath(spark, target_path) 860 | append_df = spark.createDataFrame([], df.schema) 861 | quarantine_table = DeltaTable.forPath(spark, quarantine_path) 862 | 863 | # demonstrate that the function cannot be run with target table not having constraints 864 | with pytest.raises( 865 | TypeError, match="There are no constraints present in the target delta table" 866 | ): 867 | mack.constraint_append(target_table, append_df, quarantine_table) 868 | 869 | 870 | def test_constraint_append_multi_constraint(tmp_path): 871 | 872 | target_path = f"{tmp_path}/constraint_append/target_table" 873 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table" 874 | 875 | data = [ 876 | (1, "A", "B"), 877 | (2, "C", "D"), 878 | (3, "E", "F"), 879 | ] 880 | df = spark.createDataFrame(data, ["col1", "col2", "col3"]) 881 | 882 | df.write.format("delta").save(target_path) 883 | 884 | df2 = spark.createDataFrame([], df.schema) 885 | df2.write.format("delta").save(quarantine_path) 886 | 887 | target_table = DeltaTable.forPath(spark, target_path) 888 | 889 | # adding two constraints 890 | spark.sql( 891 | f"ALTER TABLE delta.`{target_path}` ADD CONSTRAINT col1_constraint CHECK (col1 > 0) " 892 | ) 893 | spark.sql( 894 | f"ALTER TABLE delta.`{target_path}` ADD CONSTRAINT col2_constraint CHECK (col2 != 'Z') " 895 | ) 896 | 897 | # adding other table properties 898 | spark.sql( 899 | f"ALTER TABLE delta.`{target_path}` SET TBLPROPERTIES('this.is.my.key' = 12, this.is.my.key2 = true)" 900 | ) 901 | 902 | append_data = [ 903 | (0, "Z", "Z"), 904 | (4, "A", "B"), 905 | (5, "C", "D"), 906 | (6, "E", "F"), 907 | (9, "G", "G"), 908 | (11, "Z", "Z"), 909 | ] 910 | append_df = spark.createDataFrame(append_data, ["col1", "col2", "col3"]) 911 | 912 | # testing with two constraints 913 | target_table = DeltaTable.forPath(spark, target_path) 914 | quarantine_table = DeltaTable.forPath(spark, quarantine_path) 915 | mack.constraint_append(target_table, append_df, quarantine_table) 916 | 917 | expected_data = [ 918 | (1, "A", "B"), 919 | (2, "C", "D"), 920 | (3, "E", "F"), 921 | (4, "A", "B"), 922 | (5, "C", "D"), 923 | (6, "E", "F"), 924 | (9, "G", "G"), 925 | ] 926 | expected_df = spark.createDataFrame(expected_data, ["col1", "col2", "col3"]) 927 | 928 | appended_data = spark.read.format("delta").load(target_path) 929 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True) 930 | 931 | expected_quarantined_data = [(0, "Z", "Z"), (11, "Z", "Z")] 932 | expected_quarantined_df = spark.createDataFrame( 933 | expected_quarantined_data, ["col1", "col2", "col3"] 934 | ) 935 | 936 | quarantined_data = spark.read.format("delta").load(quarantine_path) 937 | chispa.assert_df_equality( 938 | quarantined_data, expected_quarantined_df, ignore_row_order=True 939 | ) 940 | 941 | 942 | def test_constraint_append_single_constraint(tmp_path): 943 | 944 | target_path = f"{tmp_path}/constraint_append/target_table" 945 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table" 946 | 947 | data = [ 948 | (1, "A", "B"), 949 | (2, "C", "D"), 950 | (3, "E", "F"), 951 | ] 952 | df = spark.createDataFrame(data, 
["col1", "col2", "col3"]) 953 | 954 | df.write.format("delta").save(target_path) 955 | 956 | df2 = spark.createDataFrame([], df.schema) 957 | df2.write.format("delta").save(quarantine_path) 958 | 959 | target_table = DeltaTable.forPath(spark, target_path) 960 | 961 | # adding two constraints 962 | spark.sql( 963 | f"ALTER TABLE delta.`{target_path}` ADD CONSTRAINT col1_constraint CHECK (col1 > 0) " 964 | ) 965 | 966 | append_data = [ 967 | (0, "Z", "Z"), 968 | (4, "A", "B"), 969 | (5, "C", "D"), 970 | (6, "E", "F"), 971 | (9, "G", "G"), 972 | (11, "Z", "Z"), 973 | ] 974 | append_df = spark.createDataFrame(append_data, ["col1", "col2", "col3"]) 975 | 976 | # testing with two constraints 977 | target_table = DeltaTable.forPath(spark, target_path) 978 | quarantine_table = DeltaTable.forPath(spark, quarantine_path) 979 | mack.constraint_append(target_table, append_df, quarantine_table) 980 | 981 | expected_data = [ 982 | (1, "A", "B"), 983 | (2, "C", "D"), 984 | (3, "E", "F"), 985 | (4, "A", "B"), 986 | (5, "C", "D"), 987 | (6, "E", "F"), 988 | (9, "G", "G"), 989 | (11, "Z", "Z"), 990 | ] 991 | expected_df = spark.createDataFrame(expected_data, ["col1", "col2", "col3"]) 992 | 993 | appended_data = spark.read.format("delta").load(target_path) 994 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True) 995 | 996 | expected_quarantined_data = [(0, "Z", "Z")] 997 | expected_quarantined_df = spark.createDataFrame( 998 | expected_quarantined_data, ["col1", "col2", "col3"] 999 | ) 1000 | 1001 | quarantined_data = spark.read.format("delta").load(quarantine_path) 1002 | chispa.assert_df_equality( 1003 | quarantined_data, expected_quarantined_df, ignore_row_order=True 1004 | ) 1005 | 1006 | 1007 | def test_constraint_append_notnull_constraint(tmp_path): 1008 | 1009 | target_path = f"{tmp_path}/constraint_append/target_table" 1010 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table" 1011 | 1012 | target_schema = StructType( 1013 | [ 1014 | StructField("col1", IntegerType(), False), 1015 | StructField("col2", StringType(), True), 1016 | StructField("col3", StringType(), False), 1017 | ] 1018 | ) 1019 | 1020 | df = spark.createDataFrame([], target_schema) 1021 | 1022 | target_table = ( 1023 | DeltaTable.create(spark).location(target_path).addColumns(df.schema).execute() 1024 | ) 1025 | 1026 | quarantine_schema = StructType( 1027 | [ 1028 | StructField("col1", IntegerType(), True), 1029 | StructField("col2", StringType(), True), 1030 | StructField("col3", StringType(), True), 1031 | ] 1032 | ) 1033 | 1034 | qdf = spark.createDataFrame([], quarantine_schema) 1035 | 1036 | quarantine_table = ( 1037 | DeltaTable.create(spark) 1038 | .location(quarantine_path) 1039 | .addColumns(qdf.schema) 1040 | .execute() 1041 | ) 1042 | 1043 | data = [(None, "A", "B"), (2, "C", None), (3, "E", "F"), (4, "G", "H")] 1044 | append_df = spark.createDataFrame(data, quarantine_schema) 1045 | 1046 | mack.constraint_append(target_table, append_df, quarantine_table) 1047 | 1048 | # target data equality check 1049 | expected_data = [(3, "E", "F"), (4, "G", "H")] 1050 | expected_df = spark.createDataFrame(expected_data, target_schema) 1051 | 1052 | appended_data = spark.read.format("delta").load(target_path) 1053 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True) 1054 | 1055 | # quarantined data equality check 1056 | expected_quarantined_data = [ 1057 | (None, "A", "B"), 1058 | (2, "C", None), 1059 | ] 1060 | expected_quarantined_df = spark.createDataFrame( 1061 | 
expected_quarantined_data, quarantine_schema 1062 | ) 1063 | 1064 | quarantined_data = spark.read.format("delta").load(quarantine_path) 1065 | chispa.assert_df_equality( 1066 | quarantined_data, expected_quarantined_df, ignore_row_order=True 1067 | ) 1068 | 1069 | 1070 | def test_constraint_append_notnull_and_check_constraint(tmp_path): 1071 | 1072 | target_path = f"{tmp_path}/constraint_append/target_table" 1073 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table" 1074 | 1075 | target_schema = StructType( 1076 | [ 1077 | StructField("col1", IntegerType(), False), 1078 | StructField("col2", StringType(), True), 1079 | StructField("col3", StringType(), False), 1080 | ] 1081 | ) 1082 | 1083 | df = spark.createDataFrame([], target_schema) 1084 | 1085 | target_table = ( 1086 | DeltaTable.create(spark) 1087 | .location(target_path) 1088 | .addColumns(df.schema) 1089 | .property("delta.constraints.col1_constraint", "col1 > 0") 1090 | .execute() 1091 | ) 1092 | 1093 | quarantine_schema = StructType( 1094 | [ 1095 | StructField("col1", IntegerType(), True), 1096 | StructField("col2", StringType(), True), 1097 | StructField("col3", StringType(), True), 1098 | ] 1099 | ) 1100 | 1101 | qdf = spark.createDataFrame([], quarantine_schema) 1102 | 1103 | quarantine_table = ( 1104 | DeltaTable.create(spark) 1105 | .location(quarantine_path) 1106 | .addColumns(qdf.schema) 1107 | .execute() 1108 | ) 1109 | 1110 | data = [ 1111 | (0, "A", "B"), 1112 | (0, "A", None), 1113 | (None, "A", "B"), 1114 | (2, "C", None), 1115 | (3, "E", "F"), 1116 | (4, "G", "H"), 1117 | ] 1118 | append_df = spark.createDataFrame(data, quarantine_schema) 1119 | 1120 | mack.constraint_append(target_table, append_df, quarantine_table) 1121 | 1122 | # target data equality check 1123 | expected_data = [(3, "E", "F"), (4, "G", "H")] 1124 | expected_df = spark.createDataFrame(expected_data, target_schema) 1125 | 1126 | appended_data = spark.read.format("delta").load(target_path) 1127 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True) 1128 | 1129 | # quarantined data equality check 1130 | expected_quarantined_data = [ 1131 | (0, "A", "B"), 1132 | (0, "A", None), 1133 | (None, "A", "B"), 1134 | (2, "C", None), 1135 | ] 1136 | expected_quarantined_df = spark.createDataFrame( 1137 | expected_quarantined_data, quarantine_schema 1138 | ) 1139 | 1140 | quarantined_data = spark.read.format("delta").load(quarantine_path) 1141 | chispa.assert_df_equality( 1142 | quarantined_data, expected_quarantined_df, ignore_row_order=True 1143 | ) 1144 | 1145 | 1146 | def test_rename_delta_table(tmp_path): 1147 | # Create a temporary directory to hold the Delta table 1148 | # Create a sample DataFrame 1149 | data = [("Alice", 1), ("Bob", 2)] 1150 | df = spark.createDataFrame(data, ["Name", "Age"]) 1151 | 1152 | # Write the DataFrame to a Delta table 1153 | old_table_path = f"{tmp_path}/old_table" 1154 | df.write.format("delta").save(old_table_path) 1155 | 1156 | # Load the Delta table 1157 | old_table = DeltaTable.forPath(spark, old_table_path) 1158 | 1159 | # Call the function to rename the Delta table 1160 | new_table_name = "new_table" 1161 | mack.rename_delta_table( 1162 | old_table, new_table_name, databricks=False, spark_session=spark 1163 | ) 1164 | 1165 | # Verify the table has been renamed 1166 | assert spark._jsparkSession.catalog().tableExists(new_table_name) 1167 | 1168 | # Clean up: Drop the new table 1169 | spark.sql(f"DROP TABLE IF EXISTS {new_table_name}") 1170 | 1171 | 
--------------------------------------------------------------------------------