├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       ├── black.yml
│       ├── ci.yml
│       ├── mkdocs.yml
│       └── ruff.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── docker-compose.yaml
├── docs
│   ├── gen_ref_pages.py
│   └── index.md
├── images
│   └── mack.jpg
├── mack
│   └── __init__.py
├── mkdocs.yml
├── poetry.lock
├── pyproject.toml
└── tests
    ├── __init__.py
    └── test_public_interface.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 |
20 |
21 | **Expected behavior**
22 | A clear and concise description of what you expected to happen.
23 |
24 | **Additional context**
25 | Add any other context about the problem here.
26 |
27 | **Environment information**
28 |
29 | * Delta Lake version:
30 | * Spark version:
31 | * Python version:
32 |
33 | **Willingness to contribute**
34 |
35 | Would you be willing to contribute a fix for this bug to the mack code base?
36 |
37 | - [ ] Yes. I can contribute a fix for this bug independently.
38 | - [ ] Yes. I would be willing to contribute a fix for this bug with guidance from the mack community.
39 | - [ ] No. I cannot contribute a bug fix at this time.
40 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[FEATURE REQUEST]"
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
22 | **Willingness to contribute**
23 |
24 | Would you be willing to contribute an implementation of this feature?
25 |
26 | - [ ] Yes. I can contribute this feature independently.
27 | - [ ] Yes. I would be willing to contribute this feature with guidance from the mack community.
28 | - [ ] No. I cannot contribute this feature at this time.
29 |
--------------------------------------------------------------------------------
/.github/workflows/black.yml:
--------------------------------------------------------------------------------
1 | name: Black format check
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 | jobs:
12 | lint:
13 | name: lint
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Black Code Formatter Check
18 | uses: psf/black@stable
19 | with:
20 | options: "--verbose --check"
21 | src: "mack"
22 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Unit tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 | jobs:
12 |
13 | test:
14 | runs-on: ubuntu-latest
15 |
16 | steps:
17 | - uses: actions/checkout@v1
18 | with:
19 | fetch-depth: 1
20 |
21 | - name: Set up Python 3.9
22 | uses: actions/setup-python@v2
23 | with:
24 | python-version: 3.9
25 |
26 | - name: Install Poetry
27 | uses: snok/install-poetry@v1
28 |
29 | - name: Cache Poetry virtualenv
30 | uses: actions/cache@v1
31 | id: cache
32 | with:
33 | path: ~/.virtualenvs
34 | key: poetry-${{ hashFiles('**/poetry.lock') }}
35 | restore-keys: |
36 | poetry-${{ hashFiles('**/poetry.lock') }}
37 | - name: Install dependencies
38 | run: poetry install
39 | if: steps.cache.outputs.cache-hit != 'true'
40 |
41 | - name: Run mack tests with pytest
42 | run: poetry run pytest
43 |
--------------------------------------------------------------------------------
/.github/workflows/mkdocs.yml:
--------------------------------------------------------------------------------
1 | name: MKDocs deploy
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | workflow_dispatch:
8 |
9 | jobs:
10 | build:
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python 3.9
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: 3.9
19 | - name: Set up Poetry
20 | uses: abatilo/actions-poetry@v2
21 | with:
22 | poetry-version: 1.4.0
23 | - name: Cache Poetry virtualenv
24 | uses: actions/cache@v1
25 | id: cache
26 | with:
27 | path: ~/.virtualenvs
28 | key: poetry-${{ hashFiles('**/poetry.lock') }}
29 | restore-keys: |
30 | poetry-${{ hashFiles('**/poetry.lock') }}
31 | - name: Install dependencies
32 | run:
33 | poetry install --with mkdocs
34 | if: steps.cache.outputs.cache-hit != 'true'
35 | - name: Setup GH
36 | run: |
37 | sudo apt update && sudo apt install -y git
38 | git config user.name 'github-actions[bot]'
39 | git config user.email 'github-actions[bot]@users.noreply.github.com'
40 | - name: Build and Deploy
41 | run:
42 | poetry run mkdocs gh-deploy --force
43 |
--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
1 | name: ruff check
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python 3.9
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: 3.9
23 | - name: ruff Lint
24 | uses: jpetrucciani/ruff-check@main
25 | with:
26 | line-length: "150"
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | dist/
3 | chispa.egg-info/
4 | .cache/
5 | tmp/
6 | .idea/
7 | .DS_Store
8 |
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 |
13 | .pytest_cache/
14 |
15 | # MKDocs
16 | site
17 |
18 | # Virtualenvs
19 | .venv
20 | .env
21 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v3.2.0
4 | hooks:
5 | - id: check-added-large-files
6 | - id: check-ast
7 | - id: check-byte-order-marker
8 | - id: check-case-conflict
9 | - id: check-docstring-first
10 | - id: check-executables-have-shebangs
11 | - id: check-json
12 | - id: check-merge-conflict
13 | - id: check-toml
14 | - id: check-yaml
15 | - id: debug-statements
16 | - id: detect-private-key
17 | - id: end-of-file-fixer
18 | - id: requirements-txt-fixer
19 | - id: trailing-whitespace
20 | - repo: https://github.com/charliermarsh/ruff-pre-commit
21 | rev: 'v0.0.245'
22 | hooks:
23 | - id: ruff
24 | args: ["--fix", "--line-length", "150", "--exit-non-zero-on-fix"]
25 | - repo: https://github.com/psf/black
26 | rev: 22.12.0
27 | hooks:
28 | - id: black
29 | language_version: python3.9
30 | - repo: local
31 | hooks:
32 | - id: pytest
33 | name: pytest
34 | entry: pytest tests/
35 | pass_filenames: false
36 | language: system
37 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Welcome to the mack contributing guide
2 |
3 | ### Issues
4 |
5 | #### Create a new issue
6 |
7 | If you spot a problem with the docs, search to see if an issue already exists. If a related issue doesn't exist, you can open a [new issue](https://github.com/MrPowers/mack/issues/new).
8 |
9 | #### Solve an issue
10 |
11 | Scan through our [existing issues](https://github.com/MrPowers/mack/issues) to find one that interests you. If you find an issue to work on, make sure that no one else is already working on it, so you can get assigned. After that, you are welcome to open a PR with a fix.
12 |
13 | ### Pull Request
14 |
15 | When you're finished with the changes, create a pull request, also known as a PR.
16 | - Before you commit, install the pre-commit hooks with `pre-commit install`.
17 | - Make sure that the pre-commit hooks pass on your local machine.
18 | - Don't forget to link your PR to the issue if you are solving one.
19 | - As you update your PR and apply changes, mark each conversation as resolved.
20 | - If you run into any merge issues, check out this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues.
21 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 |
3 | ENV TZ=America/Chicago
4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
5 |
6 | RUN apt-get update && \
7 | apt-get -y install --no-install-recommends default-jdk software-properties-common python3-pip python3.9 python3.9-dev libpq-dev build-essential wget libssl-dev libffi-dev vim && \
8 | apt-get clean
9 |
10 | RUN wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz && \
11 | tar xvf spark-3.3.2-bin-hadoop3.tgz && \
12 | mv spark-3.3.2-bin-hadoop3/ /usr/local/spark && \
13 | ln -s /usr/local/spark spark
14 |
15 | WORKDIR app
16 | COPY . /app
17 |
18 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2
19 | RUN update-alternatives --config python3
20 |
21 | RUN pip3 install poetry delta-spark
22 | RUN poetry install
23 |
24 | ENV PYSPARK_PYTHON=python3
25 | ENV PYSPARK_SUBMIT_ARGS='--packages io.delta:delta-core_2.12:2.2.0 pyspark-shell'
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Matthew Powers
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mack
2 |
3 | ![CI](https://github.com/MrPowers/mack/actions/workflows/ci.yml/badge.svg)
4 | ![Black](https://github.com/MrPowers/mack/actions/workflows/black.yml/badge.svg)
5 | ![Ruff](https://github.com/MrPowers/mack/actions/workflows/ruff.yml/badge.svg)
6 | 
7 | [![PyPI version](https://badge.fury.io/py/mack.svg)](https://badge.fury.io/py/mack)
8 |
9 | mack provides a variety of helper methods that make it easy for you to perform common Delta Lake operations.
10 |
11 | 
12 |
13 | ## Setup
14 |
15 | Install mack with `pip install mack`.
16 |
17 | Here's an example of how you can perform a Type 2 SCD upsert with a single line of code using Mack:
18 |
19 | ```python
20 | import mack
21 |
22 | mack.type_2_scd_upsert(path, updatesDF, "pkey", ["attr1", "attr2"])
23 | ```
24 |
25 | ## Type 2 SCD Upserts
26 |
27 | This library provides an opinionated, conventions-over-configuration approach to Type 2 SCD management. Let's look at an example before
28 | covering the conventions required to take advantage of the functionality.
29 |
30 | Suppose you have the following SCD table with the `pkey` primary key:
31 |
32 | ```
33 | +----+-----+-----+----------+-------------------+--------+
34 | |pkey|attr1|attr2|is_current| effective_time|end_time|
35 | +----+-----+-----+----------+-------------------+--------+
36 | | 1| A| A| true|2019-01-01 00:00:00| null|
37 | | 2| B| B| true|2019-01-01 00:00:00| null|
38 | | 4| D| D| true|2019-01-01 00:00:00| null|
39 | +----+-----+-----+----------+-------------------+--------+
40 | ```
41 |
42 | You'd like to perform an upsert with this data:
43 |
44 | ```
45 | +----+-----+-----+-------------------+
46 | |pkey|attr1|attr2| effective_time|
47 | +----+-----+-----+-------------------+
48 | | 2| Z| null|2020-01-01 00:00:00| // upsert data
49 | | 3| C| C|2020-09-15 00:00:00| // new pkey
50 | +----+-----+-----+-------------------+
51 | ```
52 |
53 | Here's how to perform the upsert:
54 |
55 | ```python
56 | mack.type_2_scd_upsert(delta_table, updatesDF, "pkey", ["attr1", "attr2"])
57 | ```
58 |
59 | Here's the table after the upsert:
60 |
61 | ```
62 | +----+-----+-----+----------+-------------------+-------------------+
63 | |pkey|attr1|attr2|is_current| effective_time| end_time|
64 | +----+-----+-----+----------+-------------------+-------------------+
65 | | 2| B| B| false|2019-01-01 00:00:00|2020-01-01 00:00:00|
66 | | 4| D| D| true|2019-01-01 00:00:00| null|
67 | | 1| A| A| true|2019-01-01 00:00:00| null|
68 | | 3| C| C| true|2020-09-15 00:00:00| null|
69 | | 2| Z| null| true|2020-01-01 00:00:00| null|
70 | +----+-----+-----+----------+-------------------+-------------------+
71 | ```
72 |
73 | You can leverage the upsert code if your SCD table meets these requirements:
74 |
75 | * Contains a unique primary key column
76 | * Any change in an attribute column triggers an upsert
77 | * SCD logic is exposed via the `effective_time`, `end_time` and `is_current` columns (you can also use date or version columns for SCD upserts, as shown in the sketch below)
78 |
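If your table uses different column names for the SCD bookkeeping (for example date- or version-based columns), the underlying `type_2_scd_generic_upsert` function lets you pass them explicitly. A minimal sketch; the `effective_date` and `end_date` names below are illustrative:

```python
import mack

mack.type_2_scd_generic_upsert(
    delta_table,
    updatesDF,
    "pkey",              # primary key column
    ["attr1", "attr2"],  # attribute columns that trigger a new version
    "is_current",        # is-current flag column
    "effective_date",    # effective time/date column (illustrative name)
    "end_date",          # end time/date column (illustrative name)
)
```
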
79 | ## Kill duplicates
80 |
81 | The `kill_duplicates` function completely removes all duplicate rows from a Delta table.
82 |
83 | Suppose you have the following table:
84 |
85 | ```
86 | +----+----+----+
87 | |col1|col2|col3|
88 | +----+----+----+
89 | | 1| A| A| # duplicate
90 | | 2| A| B|
91 | | 3| A| A| # duplicate
92 | | 4| A| A| # duplicate
93 | | 5| B| B| # duplicate
94 | | 6| D| D|
95 | | 9| B| B| # duplicate
96 | +----+----+----+
97 | ```
98 |
99 | Run the `kill_duplicates` function:
100 |
101 | ```python
102 | mack.kill_duplicates(deltaTable, ["col2", "col3"])
103 | ```
104 |
105 | Here's the ending state of the table:
106 |
107 | ```
108 | +----+----+----+
109 | |col1|col2|col3|
110 | +----+----+----+
111 | | 2| A| B|
112 | | 6| D| D|
113 | +----+----+----+
114 | ```
115 |
116 | ## Drop duplicates with Primary Key
117 |
118 | The `drop_duplicates_pkey` function removes all but one duplicate row from a Delta table.
119 | **Warning:** You have to provide a primary key column that **must contain unique values**; otherwise the method behaves like `kill_duplicates` and removes every duplicate row.
120 | If you cannot provide a unique primary key, you can use the `drop_duplicates` method.
121 |
122 | Suppose you have the following table:
123 |
124 | ```
125 | +----+----+----+----+
126 | |col1|col2|col3|col4|
127 | +----+----+----+----+
128 | | 1| A| A| C| # duplicate1
129 | | 2| A| B| C|
130 | | 3| A| A| D| # duplicate1
131 | | 4| A| A| E| # duplicate1
132 | | 5| B| B| C| # duplicate2
133 | | 6| D| D| C|
134 | | 9| B| B| E| # duplicate2
135 | +----+----+----+----+
136 | ```
137 |
138 | Run the `drop_duplicates_pkey` function:
139 |
140 | ```python
141 | mack.drop_duplicates_pkey(delta_table=deltaTable, primary_key="col1", duplication_columns=["col2", "col3"])
142 | ```
143 |
144 | Here's the ending state of the table:
145 |
146 | ```
147 | +----+----+----+----+
148 | |col1|col2|col3|col4|
149 | +----+----+----+----+
150 | | 1| A| A| C|
151 | | 2| A| B| C|
152 | | 5| B| B| C|
153 | | 6| D| D| C|
154 | +----+----+----+----+
155 | ```
156 |
157 | ## Drop duplicates
158 |
159 | The `drop_duplicates` function removes all but one duplicate row from a Delta table. It behaves exactly like the `drop_duplicates` DataFrame API.
160 | **Warning:** This method overwrites the whole table and is therefore very inefficient. If you can, use the `drop_duplicates_pkey` method instead.
161 |
162 | Suppose you have the following table:
163 |
164 | ```
165 | +----+----+----+----+
166 | |col1|col2|col3|col4|
167 | +----+----+----+----+
168 | | 1| A| A| C| # duplicate
169 | | 1| A| A| C| # duplicate
170 | | 2| A| A| C|
171 | +----+----+----+----+
172 | ```
173 |
174 | Run the `drop_duplicates` function:
175 |
176 | ```python
177 | mack.drop_duplicates(delta_table=deltaTable, duplication_columns=["col1"])
178 | ```
179 |
180 | Here's the ending state of the table:
181 |
182 | ```
183 | +----+----+----+----+
184 | |col1|col2|col3|col4|
185 | +----+----+----+----+
186 | | 1| A| A| C|
187 | | 2| A| A| C|
188 | +----+----+----+----+
189 | ```
190 |
191 | ## Copy table
192 |
193 | The `copy_table` function copies an existing Delta table.
194 | When you copy a table, it gets recreated at a specified target. This target could be a path or a table in a metastore.
195 | Copying includes:
196 |
197 | * Data
198 | * Partitioning
199 | * Table properties
200 |
201 | Copying **does not** include the delta log, which means that you will not be able to restore the new table to an old version of the original
202 | table.
203 |
204 | Here's how to perform the copy:
205 |
206 | ```python
207 | mack.copy_table(delta_table=deltaTable, target_path=path)
208 | ```
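The target can also be a table registered in a metastore via the `target_table` parameter; the table name below is illustrative:

```python
mack.copy_table(delta_table=deltaTable, target_table="my_database.my_table_copy")
```
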
209 |
210 | ## Validate append
211 |
212 | The `validate_append` function provides a mechanism for allowing some columns for schema evolution, but rejecting appends with columns that aren't specifically allowlisted.
213 |
214 | Suppose you have the following Delta table:
215 |
216 | ```
217 | +----+----+----+
218 | |col1|col2|col3|
219 | +----+----+----+
220 | | 2| b| B|
221 | | 1| a| A|
222 | +----+----+----+
223 | ```
224 |
225 | Here's an appender function that wraps `validate_append`:
226 |
227 | ```python
228 | def append_fun(delta_table, append_df):
229 | mack.validate_append(
230 | delta_table,
231 | append_df,
232 | required_cols=["col1", "col2"],
233 | optional_cols=["col4"],
234 | )
235 | ```
236 |
237 | You can append the following DataFrame that contains the required columns and the optional columns:
238 |
239 | ```
240 | +----+----+----+
241 | |col1|col2|col4|
242 | +----+----+----+
243 | | 3| c| cat|
244 | | 4| d| dog|
245 | +----+----+----+
246 | ```
247 |
248 | Here's what the Delta table will contain after that data is appended:
249 |
250 | ```
251 | +----+----+----+----+
252 | |col1|col2|col3|col4|
253 | +----+----+----+----+
254 | | 3| c|null| cat|
255 | | 4| d|null| dog|
256 | | 2| b| B|null|
257 | | 1| a| A|null|
258 | +----+----+----+----+
259 | ```
260 |
261 | You cannot append the following DataFrame which contains the required columns, but also contains another column (`col5`) that's not specified as an optional column.
262 |
263 | ```
264 | +----+----+----+
265 | |col1|col2|col5|
266 | +----+----+----+
267 | | 4| b| A|
268 | | 5| y| C|
269 | | 6| z| D|
270 | +----+----+----+
271 | ```
272 |
273 | Here's the error you'll get when you attempt this write: "TypeError: The column 'col5' is not part of the current Delta table. If you want to add the column to the table you must set the optional_cols parameter."
274 |
275 | You also cannot append the following DataFrame which is missing one of the required columns.
276 |
277 | ```
278 | +----+----+
279 | |col1|col4|
280 | +----+----+
281 | | 4| A|
282 | | 5| C|
283 | | 6| D|
284 | +----+----+
285 | ```
286 |
287 | Here's the error you'll get: "TypeError: The base Delta table has these columns '['col1', 'col4']', but these columns are required '['col1', 'col2']'."
288 |
289 | ## Append data without duplicates
290 |
291 | The `append_without_duplicates` function helps you append records to an existing Delta table without appending duplicate
292 | records.
293 |
294 | Suppose you have the following Delta table:
295 |
296 | ```
297 | +----+----+----+
298 | |col1|col2|col3|
299 | +----+----+----+
300 | | 1| A| B|
301 | | 2| C| D|
302 | | 3| E| F|
303 | +----+----+----+
304 | ```
305 |
306 | Here is data to be appended:
307 |
308 | ```
309 | +----+----+----+
310 | |col1|col2|col3|
311 | +----+----+----+
312 | | 2| R| T| # duplicate col1
313 | | 8| A| B|
314 | | 8| C| D| # duplicate col1
315 | | 10| X| Y|
316 | +----+----+----+
317 | ```
318 |
319 | Run the `append_without_duplicates` function:
320 |
321 | ```python
322 | mack.append_without_duplicates(deltaTable, append_df, ["col1"])
323 | ```
324 |
325 | Here's the ending result:
326 |
327 | ```
328 |
329 | +----+----+----+
330 | |col1|col2|col3|
331 | +----+----+----+
332 | | 1| A| B|
333 | | 2| C| D|
334 | | 3| E| F|
335 | | 8| A| B|
336 | | 10| X| Y|
337 | +----+----+----+
338 | ```
339 |
340 | Notice that the duplicate `col1` value was not appended. If a normal append operation was run, then the Delta table would contain two rows of data with `col1` equal to 2.
341 |
342 | ## Delta File Sizes
343 |
344 | The `delta_file_sizes` function returns a dictionary that contains the total size in bytes, the number of files, and the average file size for a given Delta table.
345 |
346 | Suppose you have the following Delta Table, partitioned by `col1`:
347 |
348 | ```
349 | +----+----+----+
350 | |col1|col2|col3|
351 | +----+----+----+
352 | | 1| A| A|
353 | | 2| A| B|
354 | +----+----+----+
355 | ```
356 |
357 | Running `mack.delta_file_sizes(delta_table)` on that table will return:
358 |
359 | ```
360 | {"size_in_bytes": 1320,
361 | "number_of_files": 2,
362 | "average_file_size_in_bytes": 660}
363 | ```
364 |
365 | ## Show Delta File Sizes
366 |
367 | The `show_delta_file_sizes` function prints the number of files, the size of the table, and the average file size for a Delta table.
368 |
369 | Suppose you have the following Delta Table, partitioned by `col1`:
370 |
371 | ```
372 | +----+----+----+
373 | |col1|col2|col3|
374 | +----+----+----+
375 | | 1| A| A|
376 | | 2| A| B|
377 | +----+----+----+
378 | ```
379 |
380 | Running `mack.show_delta_file_sizes(delta_table)` on that table will print:
381 |
382 | `The delta table contains 2 files with a size of 1.32 kB. The average file size is 660.0 B`
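`show_delta_file_sizes` also accepts a `humanize_binary` flag if you prefer 1024-based units:

```python
mack.show_delta_file_sizes(delta_table, humanize_binary=True)
```
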
383 |
384 | ## Humanize Bytes
385 |
386 | The `humanize_bytes` function formats an integer number of bytes as an easily human-readable string.
387 |
388 | ```python
389 | mack.humanize_bytes(1234567890) # "1.23 GB"
390 | mack.humanize_bytes(1234567890000) # "1.23 TB"
391 | ```
392 |
393 | It's a lot easier for a human to understand 1.23 GB compared to 1234567890 bytes.
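There is also a `humanize_bytes_binary` variant that divides by 1024-based units instead; the output shown follows from the divisors used in the source:

```python
mack.humanize_bytes_binary(1234567890)  # "1.15 GB" (1234567890 / 1024**3)
```
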
394 |
395 | ## Is Composite Key Candidate
396 |
397 | The `is_composite_key_candidate` function returns a boolean that indicates whether a set of columns is unique and could form a composite key.
398 |
399 | Suppose you have the following Delta Table:
400 |
401 | ```
402 | +----+----+----+
403 | |col1|col2|col3|
404 | +----+----+----+
405 | | 1| A| A|
406 | | 2| B| B|
407 | | 2| C| B|
408 | +----+----+----+
409 | ```
410 |
411 | Running `mack.is_composite_key_candidate(delta_table, ["col1"])` on that table will return `False`.
412 | Running `mack.is_composite_key_candidate(delta_table, ["col1", "col2"])` on that table will return `True`.
413 |
414 | ## Find Composite Key Candidates in the Delta table
415 |
416 | The `find_composite_key_candidates` function helps you find a composite key that uniquely identifies the rows in your Delta table. It returns a list of columns that can be used as a composite key.
417 |
418 | Suppose you have the following Delta table:
419 |
420 | ```
421 | +----+----+----+
422 | |col1|col2|col3|
423 | +----+----+----+
424 | | 1| a| z|
425 | | 1| a| b|
426 | | 3| c| b|
427 | +----+----+----+
428 | ```
429 |
430 | Running `mack.find_composite_key_candidates(delta_table)` on that table will return `["col1", "col3"]`.
431 |
432 | ## Append md5 column
433 |
434 | The `with_md5_cols` function appends an `md5` hash of the specified columns to the DataFrame. This can be used as a unique key if the selected columns form a composite key.
435 |
436 | You can use this function with the columns identified in `find_composite_key_candidates` to append a unique key to the DataFrame.
437 |
438 | Suppose you have the following Delta table:
439 |
440 | ```
441 | +----+----+----+
442 | |col1|col2|col3|
443 | +----+----+----+
444 | | 1| a|null|
445 | | 2| b| b|
446 | | 3| c| c|
447 | +----+----+----+
448 | ```
449 |
450 | Running `mack.with_md5_cols(delta_table, ["col2", "col3"])` on that table will append an `md5_col2_col3` column as follows:
451 |
452 | ```
453 | +----+----+----+--------------------------------+
454 | |col1|col2|col3|md5_col2_col3 |
455 | +----+----+----+--------------------------------+
456 | |1 |a |null|0cc175b9c0f1b6a831c399e269772661|
457 | |2 |b |b |1eeaac3814eb80cc40efb005cf0b9141|
458 | |3 |c |c |4e202f8309e7b00349c70845ab02fce9|
459 | +----+----+----+--------------------------------+
460 | ```
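If you'd rather control the name of the hash column, pass `output_col_name` (the name below is illustrative):

```python
mack.with_md5_cols(delta_table, ["col2", "col3"], output_col_name="row_hash")
```
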
461 |
462 | ## Get Latest Delta Table Version
463 |
464 | The `latest_version` function returns the most recent version number of a
465 | Delta table.
466 |
467 | ```python
468 | delta_table = DeltaTable.forPath(spark, path)
469 | mack.latest_version(delta_table)
470 | >> 2
471 | ```
472 |
473 | ## Append data with constraints
474 |
475 | The `constraint_append` function appends records to an existing Delta table even if the append DataFrame contains records that violate the table's constraints (both check and not-null constraints); those violating records are appended to an existing quarantine Delta table instead of the target table. If the quarantine Delta table is set to `None`, records that violate the constraints are simply thrown out.
476 |
477 | Suppose you have the following target Delta table with the following schema and constraints:
478 |
479 | ```
480 | schema:
481 | col1 int not null
482 | col2 string null
483 | col3 string null
484 |
485 | check constraints:
486 | col1_constraint: (col1 > 0)
487 | col2_constraint: (col2 != 'Z')
488 |
489 | +----+----+----+
490 | |col1|col2|col3|
491 | +----+----+----+
492 | | 1| A| B|
493 | | 2| C| D|
494 | | 3| E| F|
495 | +----+----+----+
496 | ```
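For reference, check constraints like the ones above can be added with Delta Lake SQL through the Spark session. A sketch, assuming an active `SparkSession` named `spark` and a table registered as `my_table` (both illustrative):

```python
# "my_table" and "spark" are illustrative; adapt to your table and session.
spark.sql("ALTER TABLE my_table ADD CONSTRAINT col1_constraint CHECK (col1 > 0)")
spark.sql("ALTER TABLE my_table ADD CONSTRAINT col2_constraint CHECK (col2 != 'Z')")
```
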
497 |
498 | Suppose you have a quarantine Delta table with the same schema but without the constraints.
499 |
500 | Here is data to be appended:
501 |
502 | ```
503 | +----+----+----+
504 | |col1|col2|col3|
505 | +----+----+----+
506 | | | H| H| # violates col1 not null constraint
507 | | 0| Z| Z| # violates both col1_constraint and col2_constraint
508 | | 4| A| B|
509 | | 5| C| D|
510 | | 6| E| F|
511 | | 9| G| G|
512 | | 11| Z| Z| # violates col2_constraint
513 | +----+----+----+
514 | ```
515 |
516 | Run the `constraint_append` function:
517 |
518 | ```python
519 | mack.constraint_append(delta_table, append_df, quarantine_table)
520 | ```
521 |
522 | Here's the ending result in delta_table:
523 |
524 | ```
525 |
526 | +----+----+----+
527 | |col1|col2|col3|
528 | +----+----+----+
529 | | 1| A| B|
530 | | 2| C| D|
531 | | 3| E| F|
532 | | 4| A| B|
533 | | 5| C| D|
534 | | 6| E| F|
535 | | 9| G| G|
536 | +----+----+----+
537 | ```
538 |
539 | Here's the ending result in quarantine_table:
540 |
541 | ```
542 |
543 | +----+----+----+
544 | |col1|col2|col3|
545 | +----+----+----+
546 | | | H| H|
547 | | 0| Z| Z|
548 | | 11| Z| Z|
549 | +----+----+----+
550 | ```
551 |
552 | Notice that the records that violated any of the constraints were appended to the quarantine table, all other records were appended to the target table, and the append did not fail. If a normal append operation had been run, it would have failed on the constraint violation. If `quarantine_table` is set to `None`, records that violate any of the constraints are simply thrown out.
553 |
554 |
555 | ## Rename a Delta Table
556 |
557 | The `rename_delta_table` function renames a Delta table. It can operate either within a Databricks environment or with a standalone Spark session.
558 |
559 | ### Parameters
560 |
561 | - `delta_table` (`DeltaTable`): An object representing the Delta table to be renamed.
562 | - `new_table_name` (`str`): The new name for the table.
563 | - `table_location` (`str`, optional): The file path where the table is stored. If not provided, the function attempts to deduce the location from the `DeltaTable` object. Defaults to `None`.
564 | - `databricks` (`bool`, optional): A flag indicating the function's operational environment. Set to `True` if running within Databricks, otherwise, `False`. Defaults to `False`.
565 | - `spark_session` (`pyspark.sql.SparkSession`, optional): The Spark session. This is required when `databricks` is set to `True`. Defaults to `None`.
566 |
567 | ### Returns
568 |
569 | - `None`
570 |
571 | ### Raises
572 |
573 | - `TypeError`: If the provided `delta_table` is not a DeltaTable object, or if `databricks` is set to `True` and `spark_session` is `None`.
574 |
575 | ### Example Usage
576 |
577 | ```python
578 | rename_delta_table(existing_delta_table, "new_table_name")
579 | ```
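When running inside Databricks, pass the active Spark session and set the `databricks` flag, as described in the parameters above:

```python
# "spark" is the active SparkSession provided by the Databricks runtime.
rename_delta_table(
    existing_delta_table,
    "new_table_name",
    databricks=True,
    spark_session=spark,
)
```
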
580 |
581 |
582 | ## Dictionary
583 |
584 | We're leveraging the following terminology defined [here](https://www.databasestar.com/database-keys/#:~:text=Natural%20key%3A%20an%20attribute%20that,can%20uniquely%20identify%20a%20row).
585 |
586 | * **Natural key:** an attribute that can uniquely identify a row, and exists in the real world.
587 | * **Surrogate key:** an attribute that can uniquely identify a row, and does not exist in the real world.
588 | * **Composite key:** more than one attribute that when combined can uniquely identify a row.
589 | * **Primary key:** the single unique identifier for the row.
590 | * **Candidate key:** an attribute that could be the primary key.
591 | * **Alternate key:** a candidate key that is not the primary key.
592 | * **Unique key:** an attribute that can be unique on the table. Can also be called an alternate key.
593 | * **Foreign key:** an attribute that is used to refer to another record in another table.
594 |
595 | ## Project maintainers
596 |
597 | * Matthew Powers aka [MrPowers](https://github.com/MrPowers)
598 | * Robert Kossendey aka [robertkossendey](https://github.com/robertkossendey)
599 | * Souvik Pratiher aka [souvik-databricks](https://github.com/souvik-databricks)
600 |
601 | ## Project philosophy
602 |
603 | The mack library is designed to make common Delta Lake data tasks easier.
604 |
605 | You don't need to use mack of course. You can write the logic yourself.
606 |
607 | If you don't want to add a dependency to your project, you can also easily copy / paste the functions from mack. The functions in this library are intentionally designed to be easy to copy and paste.
608 |
609 | Let's look at some of the reasons you may want to add mack as a dependency.
610 |
611 | ### Exposing nice public interfaces
612 |
613 | The public interface (and only the public interface) is available via the `mack` namespace.
614 |
615 | When you run `import mack`, you can access the entirety of the public interface. No private implementation details are exposed in the `mack` namespace.
616 |
617 | ### Minimal dependencies
618 |
619 | Mack only depends on Spark & Delta Lake. No other dependencies will be added to Mack.
620 |
621 | Spark users leverage a variety of runtimes and it's not always easy to add a dependency. You can run `pip install mack` and won't have to worry about resolving a lot of dependency conflicts. You can also just attach a mack wheel file to a cluster to leverage the project.
622 |
623 | ### Provide best practices examples for the community
624 |
625 | Mack strives to be a good example codebase for the PySpark / Delta Lake community.
626 |
627 | There aren't a lot of open source Delta Lake projects. There are even fewer that use good software engineering practices like CI and unit testing. You can use mack to help guide your design decisions in proprietary code repos.
628 |
629 | ### Stable public interfaces and long term support after 1.0 release
630 |
631 | Mack reserves the right to make breaking public interface changes before the 1.0 release. We'll always minimize breaking changes whenever possible.
632 |
633 | After the 1.0 release, Mack will strictly follow Semantic Versioning 2.0 and will only make breaking public interface changes in major releases. Hopefully 1.0 will be the only major release and there won't have to be any breaking changes.
634 |
635 | ### Code design
636 |
637 | Here are some of the code design principles used in Mack:
638 |
639 | * We avoid classes whenever possible. Classes make it harder to copy / paste little chunks of code into notebooks. It's good to [Stop Writing Classes](https://www.youtube.com/watch?v=o9pEzgHorH0).
640 | * We try to make functions that are easy to copy. We do this by limiting functions that depend on other functions or classes. We'd rather nest a single-use function in a public interface method than make it separate.
641 | * Develop and then abstract. All code goes in a single file till the right abstractions become apparent. We'd rather have a large file than the wrong abstractions.
642 |
643 | ### Docker Environment
644 | The `Dockerfile` and `docker-compose` files provide a containerized way to run and develop
645 | with `mack`.
646 |
647 | - The first time, run `docker build --tag=mack .` to build the image.
648 | - To execute the unit tests inside the `Docker` container, run `docker-compose up test`
649 | - To drop into the running `Docker` container to develop, run `docker run -it mack /bin/bash`
650 |
651 | ## Community
652 |
653 | ### Blogs
654 |
655 | - [Daniel Beach (Confessions of a Data Guy): Simplify Delta Lake Complexity with mack.](https://www.confessionsofadataguy.com/simplify-delta-lake-complexity-with-mack/)
656 | - [Bartosz Konieczny (waitingforcode): Simplified Delta Lake operations with Mack](https://www.waitingforcode.com/delta-lake/simplified-delta-lake-operations-mack/read)
657 |
658 | ### Videos
659 |
660 | - [GeekCoders on YouTube: How I use MACK Library in Delta Lake using Databricks/PySpark](https://www.youtube.com/watch?v=qRR5n6T2N_8)
661 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.9"
2 | services:
3 | test:
4 | image: "mack"
5 | volumes:
6 | - .:/app
7 | command: poetry run pytest
8 |
--------------------------------------------------------------------------------
/docs/gen_ref_pages.py:
--------------------------------------------------------------------------------
1 | """Generate the code reference pages and navigation.
2 |
3 | Script was taken from
4 | https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages
5 | """
6 |
7 | from pathlib import Path
8 |
9 | import mkdocs_gen_files
10 |
11 | nav = mkdocs_gen_files.Nav()
12 |
13 | for path in sorted(Path(".").rglob("mack/**/*.py")):
14 | module_path = path.relative_to(".").with_suffix("")
15 | doc_path = path.relative_to(".").with_suffix(".md")
16 | full_doc_path = Path("reference", doc_path)
17 |
18 | parts = tuple(module_path.parts)
19 |
20 | if parts[-1] == "__init__":
21 | parts = parts[:-1]
22 | doc_path = doc_path.with_name("index.md")
23 | full_doc_path = full_doc_path.with_name("index.md")
24 | elif parts[-1] == "__main__":
25 | continue
26 |
27 | nav[parts] = doc_path.as_posix()
28 |
29 | with mkdocs_gen_files.open(full_doc_path, "w") as fd:
30 | ident = ".".join(parts)
31 | fd.write(f"::: {ident}")
32 |
33 | mkdocs_gen_files.set_edit_path(full_doc_path, path)
34 |
35 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file:
36 | nav_file.writelines(nav.build_literate_nav())
37 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | {!README.md!}
2 |
--------------------------------------------------------------------------------
/images/mack.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MrPowers/mack/396f297d7e4db7feb1d1b1825c27d0076aa8e3e0/images/mack.jpg
--------------------------------------------------------------------------------
/mack/__init__.py:
--------------------------------------------------------------------------------
1 | from itertools import combinations
2 | from typing import List, Union, Dict, Optional
3 |
4 | from delta import DeltaTable
5 | import pyspark
6 | from pyspark.sql.dataframe import DataFrame
7 | from pyspark.sql.functions import col, concat_ws, count, md5, row_number, max
8 | from pyspark.sql.window import Window
9 |
10 |
11 | def type_2_scd_upsert(
12 | delta_table: DeltaTable,
13 | updates_df: DataFrame,
14 | primary_key: str,
15 | attr_col_names: List[str],
16 | ) -> None:
17 | """
18 | Performs a Type 2 SCD upsert on a Delta table using the default
19 | `is_current`, `effective_time` and `end_time` column names.
20 | :param delta_table:
21 | :type delta_table: DeltaTable
22 | :param updates_df:
23 | :type updates_df: DataFrame
24 | :param primary_key:
25 | :type primary_key: str
26 | :param attr_col_names:
27 | :type attr_col_names: List[str]
28 |
29 | :returns:
30 | :rtype: None
31 | """
32 | return type_2_scd_generic_upsert(
33 | delta_table,
34 | updates_df,
35 | primary_key,
36 | attr_col_names,
37 | "is_current",
38 | "effective_time",
39 | "end_time",
40 | )
41 |
42 |
43 | def type_2_scd_generic_upsert(
44 | delta_table: DeltaTable,
45 | updates_df: DataFrame,
46 | primary_key: str,
47 | attr_col_names: List[str],
48 | is_current_col_name: str,
49 | effective_time_col_name: str,
50 | end_time_col_name: str,
51 | ) -> None:
52 | """
53 | Performs a Type 2 SCD upsert on a Delta table with caller-supplied names
54 | for the is-current, effective-time and end-time columns.
55 | :param delta_table:
56 | :type delta_table: DeltaTable
57 | :param updates_df:
58 | :type updates_df: DataFrame
59 | :param primary_key:
60 | :type primary_key: str
61 | :param attr_col_names:
62 | :type attr_col_names: List[str]
63 | :param is_current_col_name:
64 | :type is_current_col_name: str
65 | :param effective_time_col_name:
66 | :type effective_time_col_name: str
67 | :param end_time_col_name:
68 | :type end_time_col_name: str
69 |
70 | :raises TypeError: Raises type error when required column names are not in the base table.
71 | :raises TypeError: Raises type error when required column names for updates are not in the attributes columns list.
72 |
73 | :returns:
74 | :rtype: None
75 | """
76 |
77 | # validate the existing Delta table
78 | base_col_names = delta_table.toDF().columns
79 | required_base_col_names = (
80 | [primary_key]
81 | + attr_col_names
82 | + [is_current_col_name, effective_time_col_name, end_time_col_name]
83 | )
84 | if sorted(base_col_names) != sorted(required_base_col_names):
85 | raise TypeError(
86 | f"The base table has these columns {base_col_names!r}, but these columns are required {required_base_col_names!r}"
87 | )
88 | # validate the updates DataFrame
89 | updates_col_names = updates_df.columns
90 | required_updates_col_names = (
91 | [primary_key] + attr_col_names + [effective_time_col_name]
92 | )
93 | if sorted(updates_col_names) != sorted(required_updates_col_names):
94 | raise TypeError(
95 | f"The updates DataFrame has these columns {updates_col_names!r}, but these columns are required {required_updates_col_names!r}"
96 | )
97 |
98 | # perform the upsert
99 | updates_attrs = list(
100 | map(lambda attr: f"updates.{attr} <> base.{attr}", attr_col_names)
101 | )
102 | updates_attrs = " OR ".join(updates_attrs)
103 | staged_updates_attrs = list(
104 | map(lambda attr: f"staged_updates.{attr} <> base.{attr}", attr_col_names)
105 | )
106 | staged_updates_attrs = " OR ".join(staged_updates_attrs)
107 | staged_part_1 = (
108 | updates_df.alias("updates")
109 | .join(delta_table.toDF().alias("base"), primary_key)
110 | .where(f"base.{is_current_col_name} = true AND ({updates_attrs})")
111 | .selectExpr("NULL as mergeKey", "updates.*")
112 | )
113 | staged_part_2 = updates_df.selectExpr(f"{primary_key} as mergeKey", "*")
114 | staged_updates = staged_part_1.union(staged_part_2)
115 | thing = {}
116 | for attr in attr_col_names:
117 | thing[attr] = f"staged_updates.{attr}"
118 | thing2 = {
119 | primary_key: f"staged_updates.{primary_key}",
120 | is_current_col_name: "true",
121 | effective_time_col_name: f"staged_updates.{effective_time_col_name}",
122 | end_time_col_name: "null",
123 | }
124 | res_thing = {**thing, **thing2}
125 | res = (
126 | delta_table.alias("base")
127 | .merge(
128 | source=staged_updates.alias("staged_updates"),
129 | condition=pyspark.sql.functions.expr(f"base.{primary_key} = mergeKey"),
130 | )
131 | .whenMatchedUpdate(
132 | condition=f"base.{is_current_col_name} = true AND ({staged_updates_attrs})",
133 | set={
134 | is_current_col_name: "false",
135 | end_time_col_name: f"staged_updates.{effective_time_col_name}",
136 | },
137 | )
138 | .whenNotMatchedInsert(values=res_thing)
139 | .execute()
140 | )
141 | return res
142 |
143 |
144 | def kill_duplicates(delta_table: DeltaTable, duplication_columns: List[str]) -> None:
145 | """
146 | Removes every row that is duplicated on the given columns from the Delta
147 | table (the original rows of each duplicate group are removed as well).
148 | :param delta_table:
149 | :type delta_table: DeltaTable
150 | :param duplication_columns:
151 | :type duplication_columns: List[str]
152 |
153 | :raises TypeError: Raises type error when input arguments have an invalid type or are empty.
154 | :raises TypeError: Raises type error when required columns are missing in the provided delta table.
155 | """
156 | if not isinstance(delta_table, DeltaTable):
157 | raise TypeError("An existing delta table must be specified.")
158 |
159 | if not duplication_columns or len(duplication_columns) == 0:
160 | raise TypeError("Duplication columns must be specified")
161 |
162 | data_frame = delta_table.toDF()
163 |
164 | # Make sure that all the required columns are present in the provided delta table
165 | append_data_columns = data_frame.columns
166 | for required_column in duplication_columns:
167 | if required_column not in append_data_columns:
168 | raise TypeError(
169 | f"The base table has these columns {append_data_columns!r}, but these columns are required {duplication_columns!r}"
170 | )
171 |
172 | q = []
173 |
174 | duplicate_records = (
175 | data_frame.withColumn(
176 | "amount_of_records",
177 | count("*").over(Window.partitionBy(duplication_columns)),
178 | )
179 | .filter(col("amount_of_records") > 1)
180 | .drop("amount_of_records")
181 | .distinct()
182 | )
183 |
184 | for column in duplication_columns:
185 | q.append(f"old.{column} = new.{column}")
186 |
187 | q = " AND ".join(q)
188 |
189 | # Remove all the duplicate records
190 | delta_table.alias("old").merge(
191 | duplicate_records.alias("new"), q
192 | ).whenMatchedDelete().execute()
193 |
194 |
195 | def drop_duplicates_pkey(
196 | delta_table: DeltaTable, primary_key: str, duplication_columns: List[str]
197 | ) -> None:
198 | """
199 | Removes all but one row from each group of rows that are duplicated on the
200 | duplication columns, keeping the row with the lowest primary key.
201 | :param delta_table:
202 | :type delta_table: DeltaTable
203 | :param primary_key:
204 | :type primary_key: str
205 | :param duplication_columns:
206 | :type duplication_columns: List[str]
207 |
208 | :raises TypeError: Raises type error when input arguments have an invalid type, are missing, or are empty.
209 | :raises TypeError: Raises type error when required columns are missing in the provided delta table.
210 | """
211 | if not isinstance(delta_table, DeltaTable):
212 | raise TypeError("An existing delta table must be specified.")
213 |
214 | if not primary_key:
215 | raise TypeError("A unique primary key must be specified.")
216 |
217 | if not duplication_columns or len(duplication_columns) == 0:
218 | raise TypeError("A duplication column must be specified.")
219 |
220 | if primary_key in duplication_columns:
221 | raise TypeError("Primary key must not be part of the duplication columns.")
222 |
223 | data_frame = delta_table.toDF()
224 |
225 | # Make sure that all the required columns are present in the provided delta table
226 | append_data_columns = data_frame.columns
227 | required_columns = [primary_key] + duplication_columns
228 | for required_column in required_columns:
229 | if required_column not in append_data_columns:
230 | raise TypeError(
231 | f"The base table has these columns {append_data_columns!r}, but these columns are required {required_columns!r}"
232 | )
233 |
234 | q = []
235 |
236 | duplicate_records = (
237 | data_frame.withColumn(
238 | "row_number",
239 | row_number().over(
240 | Window().partitionBy(duplication_columns).orderBy(primary_key)
241 | ),
242 | )
243 | .filter(col("row_number") > 1)
244 | .drop("row_number")
245 | .distinct()
246 | )
247 | for column in required_columns:
248 | q.append(f"old.{column} = new.{column}")
249 |
250 | q = " AND ".join(q)
251 |
252 | # Remove all the duplicate records
253 | delta_table.alias("old").merge(
254 | duplicate_records.alias("new"), q
255 | ).whenMatchedDelete().execute()
256 |
257 |
258 | def drop_duplicates(delta_table: DeltaTable, duplication_columns: List[str]) -> None:
259 | """
260 | Removes duplicate rows (based on the duplication columns) by overwriting
261 | the entire Delta table with the deduplicated DataFrame.
262 | :param delta_table:
263 | :type delta_table: DeltaTable
264 | :param duplication_columns:
265 | :type duplication_columns: List[str]
266 |
267 | :raises TypeError: Raises type error when input arguments have an invalid type, are missing, or are empty.
268 | """
269 | if not isinstance(delta_table, DeltaTable):
270 | raise TypeError("An existing delta table must be specified.")
271 |
272 | if not duplication_columns or len(duplication_columns) == 0:
273 | raise TypeError("A duplication column must be specified.")
274 |
275 | data_frame = delta_table.toDF()
276 |
277 | details = delta_table.detail().select("location").collect()[0]
278 |
279 | (
280 | data_frame.drop_duplicates(duplication_columns)
281 | .write.format("delta")
282 | .mode("overwrite")
283 | .save(details["location"])
284 | )
285 |
286 |
287 | def copy_table(
288 | delta_table: DeltaTable, target_path: str = "", target_table: str = ""
289 | ) -> None:
290 | """
291 | Copies the data, partitioning and table properties of a Delta table to a
292 | new path or metastore table. The transaction log is not copied.
293 | :param delta_table:
294 | :type delta_table: DeltaTable
295 | :param target_path: , defaults to empty string.
296 | :type target_path: str
297 | :param target_table: , defaults to empty string.
298 | :type target_table: str
299 |
300 | :raises TypeError: Raises type error when input arguments have an invalid type, are missing, or are empty.
301 | """
302 | if not isinstance(delta_table, DeltaTable):
303 | raise TypeError("An existing delta table must be specified.")
304 |
305 | if not target_path and not target_table:
306 | raise TypeError("Either target_path or target_table must be specified.")
307 |
308 | origin_table = delta_table.toDF()
309 |
310 | details = delta_table.detail().select("partitionColumns", "properties").collect()[0]
311 |
312 | if target_table:
313 | (
314 | origin_table.write.format("delta")
315 | .partitionBy(details["partitionColumns"])
316 | .options(**details["properties"])
317 | .saveAsTable(target_table)
318 | )
319 | else:
320 | (
321 | origin_table.write.format("delta")
322 | .partitionBy(details["partitionColumns"])
323 | .options(**details["properties"])
324 | .save(target_path)
325 | )
326 |
327 |
328 | def validate_append(
329 | delta_table: DeltaTable,
330 | append_df: DataFrame,
331 | required_cols: List[str],
332 | optional_cols: List[str],
333 | ) -> None:
334 | """
335 | Appends a DataFrame to a Delta table, allowing schema evolution only for
336 | the columns listed in optional_cols.
337 | :param delta_table:
338 | :type delta_table: DeltaTable
339 | :param append_df:
340 | :type append_df: DataFrame
341 | :param required_cols:
342 | :type required_cols: List[str]
343 | :param optional_cols:
344 | :type optional_cols: List[str]
345 |
346 | :raises TypeError: Raises type error when input arguments have an invalid type, are missing, or are empty.
347 | :raises TypeError: Raises type error when required columns are missing in the provided delta table.
348 | :raises TypeError: Raises type error when a column in the append DataFrame is not part of the original Delta table.
349 | """
350 | if not isinstance(delta_table, DeltaTable):
351 | raise TypeError("An existing delta table must be specified.")
352 |
353 | if not isinstance(append_df, DataFrame):
354 | raise TypeError("You must provide a DataFrame that is to be appended.")
355 |
356 | append_data_columns = append_df.columns
357 |
358 | for required_column in required_cols:
359 | if required_column not in append_data_columns:
360 | raise TypeError(
361 | f"The base Delta table has these columns {append_data_columns!r}, but these columns are required {required_cols!r}"
362 | )
363 |
364 | table_columns = delta_table.toDF().columns
365 |
366 | for column in append_data_columns:
367 | if column not in table_columns and column not in optional_cols:
368 | raise TypeError(
369 | f"The column {column!r} is not part of the current Delta table."
370 | + " If you want to add the column to the table you must set the optional_cols parameter."
371 | )
372 |
373 | details = delta_table.detail().select("location").collect()[0]
374 |
375 | (
376 | append_df.write.format("delta")
377 | .mode("append")
378 | .option("mergeSchema", "true")
379 | .save(details["location"])
380 | )
381 |
382 |
383 | def append_without_duplicates(
384 | delta_table: DeltaTable, append_df: DataFrame, p_keys: List[str]
385 | ) -> None:
386 | """
387 | Appends a DataFrame to a Delta table, skipping records whose keys already
388 | exist in the table (and deduplicating the append data itself).
389 | :param delta_table:
390 | :type delta_table: DeltaTable
391 | :param append_df:
392 | :type append_df: DataFrame
393 | :param p_keys:
394 | :type p_keys: List[str]
395 |
396 | :raises TypeError: Raises type error when input arguments have an invalid type.
397 | """
398 | if not isinstance(delta_table, DeltaTable):
399 | raise TypeError("An existing delta table must be specified.")
400 |
401 | condition_columns = []
402 | for column in p_keys:
403 | condition_columns.append(f"old.{column} = new.{column}")
404 |
405 | condition_columns = " AND ".join(condition_columns)
406 |
407 | deduplicated_append_df = append_df.drop_duplicates(p_keys)
408 |
409 | # Insert records without duplicates
410 | delta_table.alias("old").merge(
411 | deduplicated_append_df.alias("new"), condition_columns
412 | ).whenNotMatchedInsertAll().execute()
413 |
414 |
415 | def is_composite_key_candidate(delta_table: DeltaTable, cols: List[str]) -> bool:
416 | """
417 | Returns True if the given columns uniquely identify every row in the Delta
418 | table and could therefore form a composite key.
419 | :param delta_table:
420 | :type delta_table: DeltaTable
421 | :param cols:
422 | :type cols: List[str]
423 |
424 | :raises TypeError: Raises type error when input arguments have an invalid type or are missing.
425 | :raises TypeError: Raises type error when required columns are not in dataframe columns.
426 |
427 | :returns:
428 | :rtype: bool
429 | """
430 | if not isinstance(delta_table, DeltaTable):
431 | raise TypeError("An existing delta table must be specified.")
432 |
433 | if not cols or len(cols) == 0:
434 | raise TypeError("At least one column must be specified.")
435 |
436 | data_frame = delta_table.toDF()
437 |
438 | for required_column in cols:
439 | if required_column not in data_frame.columns:
440 | raise TypeError(
441 | f"The base table has these columns {data_frame.columns!r}, but these columns are required {cols!r}"
442 | )
443 |
444 | duplicate_records = (
445 | data_frame.withColumn(
446 | "amount_of_records",
447 | count("*").over(Window.partitionBy(cols)),
448 | )
449 | .filter(col("amount_of_records") > 1)
450 | .drop("amount_of_records")
451 | )
452 |
453 | if len(duplicate_records.take(1)) == 0:
454 | return True
455 |
456 | return False
457 |
458 |
459 | def delta_file_sizes(delta_table: DeltaTable) -> Dict[str, int]:
460 | """
461 | Returns the total size in bytes, the number of files and the average file
462 | size in bytes for a Delta table.
463 | :param delta_table:
464 | :type delta_table: DeltaTable
465 |
466 | :returns:
467 | :rtype: Dict[str, int]
468 | """
469 | details = delta_table.detail().select("numFiles", "sizeInBytes").collect()[0]
470 | size_in_bytes, number_of_files = details["sizeInBytes"], details["numFiles"]
471 | average_file_size_in_bytes = round(size_in_bytes / number_of_files, 0)
472 |
473 | return {
474 | "size_in_bytes": size_in_bytes,
475 | "number_of_files": number_of_files,
476 | "average_file_size_in_bytes": average_file_size_in_bytes,
477 | }
478 |
479 |
480 | def show_delta_file_sizes(
481 | delta_table: DeltaTable, humanize_binary: bool = False
482 | ) -> None:
483 | """
484 | Prints the number of files, the total size and the average file size of a
485 | Delta table in a human-readable format.
486 | :param delta_table:
487 | :type delta_table: DeltaTable
488 | :param humanize_binary:
489 | :type humanize_binary: bool
490 |
491 | :returns:
492 | :rtype: None
493 | """
494 | details = delta_table.detail().select("numFiles", "sizeInBytes").collect()[0]
495 | size_in_bytes, number_of_files = details["sizeInBytes"], details["numFiles"]
496 | average_file_size_in_bytes = round(size_in_bytes / number_of_files, 0)
497 |
498 | if humanize_binary:
499 | humanized_size_in_bytes = humanize_bytes_binary(size_in_bytes)
500 | humanized_average_file_size = humanize_bytes_binary(average_file_size_in_bytes)
501 | else:
502 | humanized_size_in_bytes = humanize_bytes(size_in_bytes)
503 | humanized_average_file_size = humanize_bytes(average_file_size_in_bytes)
504 | humanized_number_of_files = f"{number_of_files:,}"
505 |
506 | print(
507 | f"The delta table contains {humanized_number_of_files} files with a size of {humanized_size_in_bytes}."
508 | + f" The average file size is {humanized_average_file_size}"
509 | )
510 |
511 |
512 | def humanize_bytes(n: int) -> str:
513 | """
514 | Formats a number of bytes as a human-readable string using 1000-based (SI)
515 | units.
516 | :param n:
517 | :type n: int
518 |
519 | :returns:
520 | :rtype: str
521 | """
522 | kilobyte = 1000
523 | for prefix, k in (
524 | ("PB", kilobyte**5),
525 | ("TB", kilobyte**4),
526 | ("GB", kilobyte**3),
527 | ("MB", kilobyte**2),
528 | ("kB", kilobyte**1),
529 | ):
530 | if n >= k * 0.9:
531 | return f"{n / k:.2f} {prefix}"
532 | return f"{n} B"
533 |
534 |
535 | def humanize_bytes_binary(n: int) -> str:
536 | """
537 | Formats a number of bytes as a human-readable string using 1024-based
538 | (binary) units.
539 | :param n:
540 | :type n: int
541 |
542 | :returns:
543 | :rtype: str
544 | """
545 | kibibyte = 1024
546 | for prefix, k in (
547 | ("PB", kibibyte**5),
548 | ("TB", kibibyte**4),
549 | ("GB", kibibyte**3),
550 | ("MB", kibibyte**2),
551 | ("kB", kibibyte**1),
552 | ):
553 | if n >= k * 0.9:
554 | return f"{n / k:.2f} {prefix}"
555 | return f"{n} B"
556 |
557 |
558 | def find_composite_key_candidates(
559 | df: Union[DeltaTable, DataFrame], exclude_cols: List[str] = None
560 | ) -> List:
561 | """
562 | Returns the first (smallest) combination of columns that uniquely
563 | identifies every row and could therefore be used as a composite key.
564 | :param df:
565 | :type df: DeltaTable or DataFrame
566 | :param exclude_cols:
567 | :type exclude_cols: List[str], defaults to None.
568 |
569 | :raises ValueError: Raises value error when no composite key candidates can be identified.
570 |
571 | :returns:
572 | :rtype: List
573 | """
574 | if isinstance(df, DeltaTable):
575 | df = df.toDF()
576 | if exclude_cols is None:
577 | exclude_cols = []
578 | df_col_excluded = df.drop(*exclude_cols)
579 | total_cols = len(df_col_excluded.columns)
580 | total_row_count = df_col_excluded.distinct().count()
581 | for n in range(1, len(df_col_excluded.columns) + 1):
582 | for c in combinations(df_col_excluded.columns, n):
583 | if df_col_excluded.select(*c).distinct().count() == total_row_count:
584 | if len(df_col_excluded.select(*c).columns) == total_cols:
585 | raise ValueError("No composite key candidates could be identified.")
586 | return list(df_col_excluded.select(*c).columns)
587 |
588 |
589 | def with_md5_cols(
590 | df: Union[DeltaTable, DataFrame],
591 | cols: List[str],
592 | output_col_name: Optional[str] = None,
593 | ) -> DataFrame:
594 | """
595 | Appends an md5 hash of the given columns to the DataFrame, which can be
596 | used as a surrogate unique key.
597 | :param df:
598 | :type df: DeltaTable or DataFrame
599 | :param cols:
600 | :type cols: List[str]
601 | :param output_col_name:
602 | :type output_col_name: str, defaults to None.
603 |
604 |
605 |
606 | :returns:
607 | :rtype: DataFrame
608 | """
609 | if output_col_name is None:
610 | output_col_name = "_".join(["md5"] + cols)
611 | if isinstance(df, DeltaTable):
612 | df = df.toDF()
613 | return df.withColumn(output_col_name, md5(concat_ws("||", *cols)))
614 |
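# Illustrative usage (added note, not part of the original source; column names are hypothetical):
#   with_md5_cols(df, ["first_name", "last_name"])
# appends a column named "md5_first_name_last_name" containing
# md5(concat_ws("||", first_name, last_name)).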
615 |
616 | def latest_version(delta_table: DeltaTable) -> float:
617 | """
618 | Returns the most recent version number of a Delta table, based on its transaction history.
619 | 
620 | :param delta_table: The Delta table to inspect.
621 | :type delta_table: DeltaTable
622 | 
623 | :returns: The latest version number of the Delta table.
624 | :rtype: float
625 | """
626 | version = delta_table.history().agg(max("version")).collect()[0][0]
627 | return version
628 |
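# Illustrative usage (added note, not part of the original source; the path is hypothetical):
#   latest_version(DeltaTable.forPath(spark, "/tmp/some_delta_table"))  # e.g. 2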
629 |
630 | def constraint_append(
631 | delta_table: DeltaTable, append_df: DataFrame, quarantine_table: DeltaTable
632 | ):
633 | """
634 | Appends a DataFrame to a Delta table while enforcing the table's CHECK and NOT NULL constraints:
635 | rows that satisfy every constraint are appended to the target table, and rows that violate a constraint are appended to the quarantine table instead.
636 | :param delta_table: The target Delta table that valid rows are appended to.
637 | :type delta_table: DeltaTable
638 | :param append_df: The DataFrame with the rows to append.
639 | :type append_df: DataFrame
640 | :param quarantine_table: The Delta table that rows violating the constraints are appended to.
641 | :type quarantine_table: DeltaTable
642 | 
643 | :raises TypeError: Raises type error when input arguments have an invalid type.
644 | :raises TypeError: Raises type error when delta_table has no constraints.
645 | """
646 |
647 | if not isinstance(delta_table, DeltaTable):
648 | raise TypeError("An existing delta table must be specified for delta_table.")
649 |
650 | if not isinstance(append_df, DataFrame):
651 | raise TypeError("You must provide a DataFrame that is to be appended.")
652 |
653 | if quarantine_table is not None and not isinstance(quarantine_table, DeltaTable):
654 | raise TypeError(
655 | "An existing delta table must be specified for quarantine_table."
656 | )
657 |
658 | properties = delta_table.detail().select("properties").collect()[0]["properties"]
659 | check_constraints = [
660 | v for k, v in properties.items() if k.startswith("delta.constraints")
661 | ]
662 |
663 | # add null checks
664 | fields = delta_table.toDF().schema.fields
665 | null_constraints = [
666 | f"{field.name} is not null" for field in fields if not field.nullable
667 | ]
668 |
669 | constraints = check_constraints + null_constraints
670 |
671 | if not constraints:
672 | raise TypeError("There are no constraints present in the target delta table")
673 |
674 | target_details = delta_table.detail().select("location").collect()[0]
675 | if quarantine_table:
676 | quarantine_details = quarantine_table.detail().select("location").collect()[0]
677 | quarantine_df = append_df.filter(
678 | "not (" + " and ".join([c for c in constraints]) + ")"
679 | )
680 | (
681 | quarantine_df.write.format("delta")
682 | .mode("append")
683 | .option("mergeSchema", "true")
684 | .save(quarantine_details["location"])
685 | )
686 |
687 | filtered_df = append_df.filter(" and ".join([c for c in constraints]))
688 | (
689 | filtered_df.write.format("delta")
690 | .mode("append")
691 | .option("mergeSchema", "true")
692 | .save(target_details["location"])
693 | )
694 |
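# Illustrative usage (added note, not part of the original source; the table paths are hypothetical):
#   target = DeltaTable.forPath(spark, "/tmp/target")          # table with CHECK / NOT NULL constraints
#   quarantine = DeltaTable.forPath(spark, "/tmp/quarantine")
#   constraint_append(target, append_df, quarantine)
# Rows in append_df that satisfy every constraint are appended to target; the remaining rows are
# appended to quarantine.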
695 |
696 | def rename_delta_table(
697 | delta_table: DeltaTable,
698 | new_table_name: str,
699 | table_location: str = None,
700 | databricks: bool = False,
701 | spark_session: pyspark.sql.SparkSession = None,
702 | ) -> None:
703 | """
704 | Renames a Delta table to a new name. This function can be used in a Databricks environment or with a
705 | standalone Spark session.
706 |
707 | Parameters:
708 | delta_table (DeltaTable): The DeltaTable object representing the table to be renamed.
709 | new_table_name (str): The new name for the table.
710 | table_location (str, optional): The file path where the table is stored. Defaults to None.
711 | If None, the function will attempt to determine the location from the DeltaTable object.
712 | databricks (bool, optional): A flag indicating whether the function is being run in a Databricks
713 | environment. Defaults to False. If True, a SparkSession must be provided.
714 | spark_session (pyspark.sql.SparkSession, optional): The Spark session. Defaults to None.
715 | Required if `databricks` is set to True.
716 |
717 | Returns:
718 | None
719 |
720 | Raises:
721 | TypeError: If the provided `delta_table` is not a DeltaTable object, or if `databricks` is True
722 | and `spark_session` is None.
723 |
724 | Example Usage:
725 | >>> rename_delta_table(existing_delta_table, "new_table_name")
726 | """
727 | if not isinstance(delta_table, DeltaTable):
728 | raise TypeError("An existing delta table must be specified for delta_table.")
729 | if databricks and spark_session is None:
730 | raise TypeError("A spark session must be specified for databricks.")
731 |
732 | if databricks:
733 | spark_session.sql(f"ALTER TABLE {delta_table.name} RENAME TO {new_table_name}")
734 | else:
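# Note (added comment): outside Databricks there is no metastore-level rename, so this branch
# copies the table's data into a new managed table called new_table_name; the original table is left in place.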
735 | delta_table.toDF().write.format("delta").mode("overwrite").saveAsTable(
736 | new_table_name
737 | )
738 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Mack
2 |
3 | theme:
4 | name: "readthedocs"
5 |
6 | plugins:
7 | - search
8 | - gen-files:
9 | scripts:
10 | - docs/gen_ref_pages.py
11 | - section-index
12 | - mkdocstrings:
13 | default_handler: python
14 | handlers:
15 | python:
16 | options:
17 | docstring_style: sphinx
18 | docstring_options:
19 | show_if_no_docstring: true
20 | show_source: true
21 |
22 | nav:
23 | - Mack: index.md
24 | - API Docs: reference/SUMMARY.md
25 |
26 | markdown_extensions:
27 | - markdown_include.include:
28 | base_path: .
29 |
--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
1 | # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
2 |
3 | [[package]]
4 | name = "attrs"
5 | version = "24.2.0"
6 | description = "Classes Without Boilerplate"
7 | optional = false
8 | python-versions = ">=3.7"
9 | files = [
10 | {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"},
11 | {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"},
12 | ]
13 |
14 | [package.extras]
15 | benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
16 | cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
17 | dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
18 | docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
19 | tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
20 | tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
21 |
22 | [[package]]
23 | name = "cfgv"
24 | version = "3.4.0"
25 | description = "Validate configuration and produce human readable error messages."
26 | optional = false
27 | python-versions = ">=3.8"
28 | files = [
29 | {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
30 | {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
31 | ]
32 |
33 | [[package]]
34 | name = "chispa"
35 | version = "0.9.2"
36 | description = "Pyspark test helper library"
37 | optional = false
38 | python-versions = ">=3.5"
39 | files = [
40 | {file = "chispa-0.9.2-py3-none-any.whl", hash = "sha256:c6eae922f5c3ccd08f4dc3707202291bb249e68e319d0641795d92d80cfb1cad"},
41 | {file = "chispa-0.9.2.tar.gz", hash = "sha256:621ad2e64fd27e7372c7b90ab2d5ad1f8dd69b737a3421ba5b6f84b113a18b84"},
42 | ]
43 |
44 | [[package]]
45 | name = "click"
46 | version = "8.1.7"
47 | description = "Composable command line interface toolkit"
48 | optional = false
49 | python-versions = ">=3.7"
50 | files = [
51 | {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
52 | {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
53 | ]
54 |
55 | [package.dependencies]
56 | colorama = {version = "*", markers = "platform_system == \"Windows\""}
57 |
58 | [[package]]
59 | name = "colorama"
60 | version = "0.4.6"
61 | description = "Cross-platform colored terminal text."
62 | optional = false
63 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
64 | files = [
65 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
66 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
67 | ]
68 |
69 | [[package]]
70 | name = "delta-spark"
71 | version = "3.2.0"
72 | description = "Python APIs for using Delta Lake with Apache Spark"
73 | optional = false
74 | python-versions = ">=3.6"
75 | files = [
76 | {file = "delta-spark-3.2.0.tar.gz", hash = "sha256:641967828e47c64805f8c746513da80bea24b5f19b069cdcf64561cd3692e11d"},
77 | {file = "delta_spark-3.2.0-py3-none-any.whl", hash = "sha256:c4ff3fa7218e58a702cb71eb64384b0005c4d6f0bbdd0fe0b38a53564d946e09"},
78 | ]
79 |
80 | [package.dependencies]
81 | importlib-metadata = ">=1.0.0"
82 | pyspark = ">=3.5.0,<3.6.0"
83 |
84 | [[package]]
85 | name = "distlib"
86 | version = "0.3.8"
87 | description = "Distribution utilities"
88 | optional = false
89 | python-versions = "*"
90 | files = [
91 | {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
92 | {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
93 | ]
94 |
95 | [[package]]
96 | name = "filelock"
97 | version = "3.15.4"
98 | description = "A platform independent file lock."
99 | optional = false
100 | python-versions = ">=3.8"
101 | files = [
102 | {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"},
103 | {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"},
104 | ]
105 |
106 | [package.extras]
107 | docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
108 | testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"]
109 | typing = ["typing-extensions (>=4.8)"]
110 |
111 | [[package]]
112 | name = "ghp-import"
113 | version = "2.1.0"
114 | description = "Copy your docs directly to the gh-pages branch."
115 | optional = false
116 | python-versions = "*"
117 | files = [
118 | {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"},
119 | {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"},
120 | ]
121 |
122 | [package.dependencies]
123 | python-dateutil = ">=2.8.1"
124 |
125 | [package.extras]
126 | dev = ["flake8", "markdown", "twine", "wheel"]
127 |
128 | [[package]]
129 | name = "griffe"
130 | version = "1.2.0"
131 | description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
132 | optional = false
133 | python-versions = ">=3.8"
134 | files = [
135 | {file = "griffe-1.2.0-py3-none-any.whl", hash = "sha256:a8b2fcb1ecdc5a412e646b0b4375eb20a5d2eac3a11dd8c10c56967a4097663c"},
136 | {file = "griffe-1.2.0.tar.gz", hash = "sha256:1c9f6ef7455930f3f9b0c4145a961c90385d1e2cbc496f7796fbff560ec60d31"},
137 | ]
138 |
139 | [package.dependencies]
140 | colorama = ">=0.4"
141 |
142 | [[package]]
143 | name = "identify"
144 | version = "2.6.0"
145 | description = "File identification library for Python"
146 | optional = false
147 | python-versions = ">=3.8"
148 | files = [
149 | {file = "identify-2.6.0-py2.py3-none-any.whl", hash = "sha256:e79ae4406387a9d300332b5fd366d8994f1525e8414984e1a59e058b2eda2dd0"},
150 | {file = "identify-2.6.0.tar.gz", hash = "sha256:cb171c685bdc31bcc4c1734698736a7d5b6c8bf2e0c15117f4d469c8640ae5cf"},
151 | ]
152 |
153 | [package.extras]
154 | license = ["ukkonen"]
155 |
156 | [[package]]
157 | name = "importlib-metadata"
158 | version = "8.4.0"
159 | description = "Read metadata from Python packages"
160 | optional = false
161 | python-versions = ">=3.8"
162 | files = [
163 | {file = "importlib_metadata-8.4.0-py3-none-any.whl", hash = "sha256:66f342cc6ac9818fc6ff340576acd24d65ba0b3efabb2b4ac08b598965a4a2f1"},
164 | {file = "importlib_metadata-8.4.0.tar.gz", hash = "sha256:9a547d3bc3608b025f93d403fdd1aae741c24fbb8314df4b155675742ce303c5"},
165 | ]
166 |
167 | [package.dependencies]
168 | zipp = ">=0.5"
169 |
170 | [package.extras]
171 | doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
172 | perf = ["ipython"]
173 | test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"]
174 |
175 | [[package]]
176 | name = "iniconfig"
177 | version = "2.0.0"
178 | description = "brain-dead simple config-ini parsing"
179 | optional = false
180 | python-versions = ">=3.7"
181 | files = [
182 | {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
183 | {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
184 | ]
185 |
186 | [[package]]
187 | name = "jinja2"
188 | version = "3.1.4"
189 | description = "A very fast and expressive template engine."
190 | optional = false
191 | python-versions = ">=3.7"
192 | files = [
193 | {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
194 | {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
195 | ]
196 |
197 | [package.dependencies]
198 | MarkupSafe = ">=2.0"
199 |
200 | [package.extras]
201 | i18n = ["Babel (>=2.7)"]
202 |
203 | [[package]]
204 | name = "markdown"
205 | version = "3.7"
206 | description = "Python implementation of John Gruber's Markdown."
207 | optional = false
208 | python-versions = ">=3.8"
209 | files = [
210 | {file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"},
211 | {file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"},
212 | ]
213 |
214 | [package.extras]
215 | docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"]
216 | testing = ["coverage", "pyyaml"]
217 |
218 | [[package]]
219 | name = "markdown-include"
220 | version = "0.8.1"
221 | description = "A Python-Markdown extension which provides an 'include' function"
222 | optional = false
223 | python-versions = ">=3.7"
224 | files = [
225 | {file = "markdown-include-0.8.1.tar.gz", hash = "sha256:1d0623e0fc2757c38d35df53752768356162284259d259c486b4ab6285cdbbe3"},
226 | {file = "markdown_include-0.8.1-py3-none-any.whl", hash = "sha256:32f0635b9cfef46997b307e2430022852529f7a5b87c0075c504283e7cc7db53"},
227 | ]
228 |
229 | [package.dependencies]
230 | markdown = ">=3.0"
231 |
232 | [package.extras]
233 | tests = ["pytest"]
234 |
235 | [[package]]
236 | name = "markupsafe"
237 | version = "2.1.5"
238 | description = "Safely add untrusted strings to HTML/XML markup."
239 | optional = false
240 | python-versions = ">=3.7"
241 | files = [
242 | {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"},
243 | {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"},
244 | {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"},
245 | {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"},
246 | {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"},
247 | {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"},
248 | {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"},
249 | {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"},
250 | {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"},
251 | {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"},
252 | {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"},
253 | {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"},
254 | {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"},
255 | {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"},
256 | {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"},
257 | {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"},
258 | {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"},
259 | {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"},
260 | {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"},
261 | {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"},
262 | {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"},
263 | {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"},
264 | {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"},
265 | {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"},
266 | {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"},
267 | {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"},
268 | {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"},
269 | {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"},
270 | {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"},
271 | {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"},
272 | {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"},
273 | {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"},
274 | {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"},
275 | {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"},
276 | {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"},
277 | {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"},
278 | {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"},
279 | {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"},
280 | {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"},
281 | {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"},
282 | {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"},
283 | {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"},
284 | {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"},
285 | {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"},
286 | {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"},
287 | {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"},
288 | {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"},
289 | {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"},
290 | {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"},
291 | {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"},
292 | {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"},
293 | {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"},
294 | {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"},
295 | {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"},
296 | {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"},
297 | {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"},
298 | {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"},
299 | {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"},
300 | {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"},
301 | {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"},
302 | ]
303 |
304 | [[package]]
305 | name = "mergedeep"
306 | version = "1.3.4"
307 | description = "A deep merge function for 🐍."
308 | optional = false
309 | python-versions = ">=3.6"
310 | files = [
311 | {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"},
312 | {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"},
313 | ]
314 |
315 | [[package]]
316 | name = "mkdocs"
317 | version = "1.6.1"
318 | description = "Project documentation with Markdown."
319 | optional = false
320 | python-versions = ">=3.8"
321 | files = [
322 | {file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"},
323 | {file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"},
324 | ]
325 |
326 | [package.dependencies]
327 | click = ">=7.0"
328 | colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""}
329 | ghp-import = ">=1.0"
330 | jinja2 = ">=2.11.1"
331 | markdown = ">=3.3.6"
332 | markupsafe = ">=2.0.1"
333 | mergedeep = ">=1.3.4"
334 | mkdocs-get-deps = ">=0.2.0"
335 | packaging = ">=20.5"
336 | pathspec = ">=0.11.1"
337 | pyyaml = ">=5.1"
338 | pyyaml-env-tag = ">=0.1"
339 | watchdog = ">=2.0"
340 |
341 | [package.extras]
342 | i18n = ["babel (>=2.9.0)"]
343 | min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"]
344 |
345 | [[package]]
346 | name = "mkdocs-autorefs"
347 | version = "1.2.0"
348 | description = "Automatically link across pages in MkDocs."
349 | optional = false
350 | python-versions = ">=3.8"
351 | files = [
352 | {file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"},
353 | {file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"},
354 | ]
355 |
356 | [package.dependencies]
357 | Markdown = ">=3.3"
358 | markupsafe = ">=2.0.1"
359 | mkdocs = ">=1.1"
360 |
361 | [[package]]
362 | name = "mkdocs-gen-files"
363 | version = "0.4.0"
364 | description = "MkDocs plugin to programmatically generate documentation pages during the build"
365 | optional = false
366 | python-versions = ">=3.7,<4.0"
367 | files = [
368 | {file = "mkdocs-gen-files-0.4.0.tar.gz", hash = "sha256:377bff8ee8e93515916689f483d971643f83a94eed7e92318854da8f344f0163"},
369 | {file = "mkdocs_gen_files-0.4.0-py3-none-any.whl", hash = "sha256:3241a4c947ecd11763ca77cc645015305bf71a0e1b9b886801c114fcf9971e71"},
370 | ]
371 |
372 | [package.dependencies]
373 | mkdocs = ">=1.0.3,<2.0.0"
374 |
375 | [[package]]
376 | name = "mkdocs-get-deps"
377 | version = "0.2.0"
378 | description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file"
379 | optional = false
380 | python-versions = ">=3.8"
381 | files = [
382 | {file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"},
383 | {file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"},
384 | ]
385 |
386 | [package.dependencies]
387 | mergedeep = ">=1.3.4"
388 | platformdirs = ">=2.2.0"
389 | pyyaml = ">=5.1"
390 |
391 | [[package]]
392 | name = "mkdocs-literate-nav"
393 | version = "0.6.1"
394 | description = "MkDocs plugin to specify the navigation in Markdown instead of YAML"
395 | optional = false
396 | python-versions = ">=3.7"
397 | files = [
398 | {file = "mkdocs_literate_nav-0.6.1-py3-none-any.whl", hash = "sha256:e70bdc4a07050d32da79c0b697bd88e9a104cf3294282e9cb20eec94c6b0f401"},
399 | {file = "mkdocs_literate_nav-0.6.1.tar.gz", hash = "sha256:78a7ab6d878371728acb0cdc6235c9b0ffc6e83c997b037f4a5c6ff7cef7d759"},
400 | ]
401 |
402 | [package.dependencies]
403 | mkdocs = ">=1.0.3"
404 |
405 | [[package]]
406 | name = "mkdocs-section-index"
407 | version = "0.3.9"
408 | description = "MkDocs plugin to allow clickable sections that lead to an index page"
409 | optional = false
410 | python-versions = ">=3.8"
411 | files = [
412 | {file = "mkdocs_section_index-0.3.9-py3-none-any.whl", hash = "sha256:5e5eb288e8d7984d36c11ead5533f376fdf23498f44e903929d72845b24dfe34"},
413 | {file = "mkdocs_section_index-0.3.9.tar.gz", hash = "sha256:b66128d19108beceb08b226ee1ba0981840d14baf8a652b6c59e650f3f92e4f8"},
414 | ]
415 |
416 | [package.dependencies]
417 | mkdocs = ">=1.2"
418 |
419 | [[package]]
420 | name = "mkdocstrings"
421 | version = "0.26.0"
422 | description = "Automatic documentation from sources, for MkDocs."
423 | optional = false
424 | python-versions = ">=3.8"
425 | files = [
426 | {file = "mkdocstrings-0.26.0-py3-none-any.whl", hash = "sha256:1aa227fe94f88e80737d37514523aacd473fc4b50a7f6852ce41447ab23f2654"},
427 | {file = "mkdocstrings-0.26.0.tar.gz", hash = "sha256:ff9d0de28c8fa877ed9b29a42fe407cfe6736d70a1c48177aa84fcc3dc8518cd"},
428 | ]
429 |
430 | [package.dependencies]
431 | click = ">=7.0"
432 | Jinja2 = ">=2.11.1"
433 | Markdown = ">=3.6"
434 | MarkupSafe = ">=1.1"
435 | mkdocs = ">=1.4"
436 | mkdocs-autorefs = ">=1.2"
437 | platformdirs = ">=2.2"
438 | pymdown-extensions = ">=6.3"
439 |
440 | [package.extras]
441 | crystal = ["mkdocstrings-crystal (>=0.3.4)"]
442 | python = ["mkdocstrings-python (>=0.5.2)"]
443 | python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"]
444 |
445 | [[package]]
446 | name = "mkdocstrings-python"
447 | version = "0.8.3"
448 | description = "A Python handler for mkdocstrings."
449 | optional = false
450 | python-versions = ">=3.7"
451 | files = [
452 | {file = "mkdocstrings-python-0.8.3.tar.gz", hash = "sha256:9ae473f6dc599339b09eee17e4d2b05d6ac0ec29860f3fc9b7512d940fc61adf"},
453 | {file = "mkdocstrings_python-0.8.3-py3-none-any.whl", hash = "sha256:4e6e1cd6f37a785de0946ced6eb846eb2f5d891ac1cc2c7b832943d3529087a7"},
454 | ]
455 |
456 | [package.dependencies]
457 | griffe = ">=0.24"
458 | mkdocstrings = ">=0.19"
459 |
460 | [[package]]
461 | name = "nodeenv"
462 | version = "1.9.1"
463 | description = "Node.js virtual environment builder"
464 | optional = false
465 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
466 | files = [
467 | {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
468 | {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
469 | ]
470 |
471 | [[package]]
472 | name = "packaging"
473 | version = "24.1"
474 | description = "Core utilities for Python packages"
475 | optional = false
476 | python-versions = ">=3.8"
477 | files = [
478 | {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"},
479 | {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
480 | ]
481 |
482 | [[package]]
483 | name = "pathspec"
484 | version = "0.12.1"
485 | description = "Utility library for gitignore style pattern matching of file paths."
486 | optional = false
487 | python-versions = ">=3.8"
488 | files = [
489 | {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
490 | {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
491 | ]
492 |
493 | [[package]]
494 | name = "platformdirs"
495 | version = "4.2.2"
496 | description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
497 | optional = false
498 | python-versions = ">=3.8"
499 | files = [
500 | {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"},
501 | {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"},
502 | ]
503 |
504 | [package.extras]
505 | docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
506 | test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"]
507 | type = ["mypy (>=1.8)"]
508 |
509 | [[package]]
510 | name = "pluggy"
511 | version = "1.5.0"
512 | description = "plugin and hook calling mechanisms for python"
513 | optional = false
514 | python-versions = ">=3.8"
515 | files = [
516 | {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
517 | {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
518 | ]
519 |
520 | [package.extras]
521 | dev = ["pre-commit", "tox"]
522 | testing = ["pytest", "pytest-benchmark"]
523 |
524 | [[package]]
525 | name = "pre-commit"
526 | version = "2.21.0"
527 | description = "A framework for managing and maintaining multi-language pre-commit hooks."
528 | optional = false
529 | python-versions = ">=3.7"
530 | files = [
531 | {file = "pre_commit-2.21.0-py2.py3-none-any.whl", hash = "sha256:e2f91727039fc39a92f58a588a25b87f936de6567eed4f0e673e0507edc75bad"},
532 | {file = "pre_commit-2.21.0.tar.gz", hash = "sha256:31ef31af7e474a8d8995027fefdfcf509b5c913ff31f2015b4ec4beb26a6f658"},
533 | ]
534 |
535 | [package.dependencies]
536 | cfgv = ">=2.0.0"
537 | identify = ">=1.0.0"
538 | nodeenv = ">=0.11.1"
539 | pyyaml = ">=5.1"
540 | virtualenv = ">=20.10.0"
541 |
542 | [[package]]
543 | name = "py4j"
544 | version = "0.10.9.7"
545 | description = "Enables Python programs to dynamically access arbitrary Java objects"
546 | optional = false
547 | python-versions = "*"
548 | files = [
549 | {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"},
550 | {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"},
551 | ]
552 |
553 | [[package]]
554 | name = "pymdown-extensions"
555 | version = "10.9"
556 | description = "Extension pack for Python Markdown."
557 | optional = false
558 | python-versions = ">=3.8"
559 | files = [
560 | {file = "pymdown_extensions-10.9-py3-none-any.whl", hash = "sha256:d323f7e90d83c86113ee78f3fe62fc9dee5f56b54d912660703ea1816fed5626"},
561 | {file = "pymdown_extensions-10.9.tar.gz", hash = "sha256:6ff740bcd99ec4172a938970d42b96128bdc9d4b9bcad72494f29921dc69b753"},
562 | ]
563 |
564 | [package.dependencies]
565 | markdown = ">=3.6"
566 | pyyaml = "*"
567 |
568 | [package.extras]
569 | extra = ["pygments (>=2.12)"]
570 |
571 | [[package]]
572 | name = "pyspark"
573 | version = "3.5.0"
574 | description = "Apache Spark Python API"
575 | optional = false
576 | python-versions = ">=3.8"
577 | files = [
578 | {file = "pyspark-3.5.0.tar.gz", hash = "sha256:d41a9b76bd2aca370a6100d075c029e22ba44c5940927877e9435a3a9c566558"},
579 | ]
580 |
581 | [package.dependencies]
582 | py4j = "0.10.9.7"
583 |
584 | [package.extras]
585 | connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"]
586 | ml = ["numpy (>=1.15)"]
587 | mllib = ["numpy (>=1.15)"]
588 | pandas-on-spark = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"]
589 | sql = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"]
590 |
591 | [[package]]
592 | name = "pytest"
593 | version = "7.2.0"
594 | description = "pytest: simple powerful testing with Python"
595 | optional = false
596 | python-versions = ">=3.7"
597 | files = [
598 | {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"},
599 | {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"},
600 | ]
601 |
602 | [package.dependencies]
603 | attrs = ">=19.2.0"
604 | colorama = {version = "*", markers = "sys_platform == \"win32\""}
605 | iniconfig = "*"
606 | packaging = "*"
607 | pluggy = ">=0.12,<2.0"
608 |
609 | [package.extras]
610 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
611 |
612 | [[package]]
613 | name = "pytest-describe"
614 | version = "1.0.0"
615 | description = "Describe-style plugin for pytest"
616 | optional = false
617 | python-versions = "*"
618 | files = [
619 | {file = "pytest-describe-1.0.0.tar.gz", hash = "sha256:3e2ea0e77efa09edb98cf90423bf1da21a462ed90bd3120f8f98fe7519a167d5"},
620 | {file = "pytest_describe-1.0.0-py2-none-any.whl", hash = "sha256:cc3862662faa5a6fb721927aaef46b46cf787e4a8163e5459fc8778e650fabad"},
621 | {file = "pytest_describe-1.0.0-py3-none-any.whl", hash = "sha256:95fe78639d4d16c4a1e7d62c70f63030b217c08d2ee6dca49559fe6e730c6696"},
622 | ]
623 |
624 | [package.dependencies]
625 | pytest = ">=2.6.0"
626 |
627 | [[package]]
628 | name = "python-dateutil"
629 | version = "2.9.0.post0"
630 | description = "Extensions to the standard Python datetime module"
631 | optional = false
632 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
633 | files = [
634 | {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
635 | {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
636 | ]
637 |
638 | [package.dependencies]
639 | six = ">=1.5"
640 |
641 | [[package]]
642 | name = "pyyaml"
643 | version = "6.0.2"
644 | description = "YAML parser and emitter for Python"
645 | optional = false
646 | python-versions = ">=3.8"
647 | files = [
648 | {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
649 | {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
650 | {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
651 | {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
652 | {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
653 | {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
654 | {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
655 | {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
656 | {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
657 | {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
658 | {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
659 | {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
660 | {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
661 | {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
662 | {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
663 | {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
664 | {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
665 | {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
666 | {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
667 | {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
668 | {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
669 | {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
670 | {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
671 | {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
672 | {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
673 | {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
674 | {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
675 | {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
676 | {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
677 | {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
678 | {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
679 | {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
680 | {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
681 | {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
682 | {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
683 | {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
684 | {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"},
685 | {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"},
686 | {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"},
687 | {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"},
688 | {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"},
689 | {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"},
690 | {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"},
691 | {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
692 | {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
693 | {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
694 | {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
695 | {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
696 | {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
697 | {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
698 | {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
699 | {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
700 | {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
701 | ]
702 |
703 | [[package]]
704 | name = "pyyaml-env-tag"
705 | version = "0.1"
706 | description = "A custom YAML tag for referencing environment variables in YAML files. "
707 | optional = false
708 | python-versions = ">=3.6"
709 | files = [
710 | {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"},
711 | {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"},
712 | ]
713 |
714 | [package.dependencies]
715 | pyyaml = "*"
716 |
717 | [[package]]
718 | name = "ruff"
719 | version = "0.0.254"
720 | description = "An extremely fast Python linter, written in Rust."
721 | optional = false
722 | python-versions = ">=3.7"
723 | files = [
724 | {file = "ruff-0.0.254-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:dd58c500d039fb381af8d861ef456c3e94fd6855c3d267d6c6718c9a9fe07be0"},
725 | {file = "ruff-0.0.254-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:688379050ae05394a6f9f9c8471587fd5dcf22149bd4304a4ede233cc4ef89a1"},
726 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac1429be6d8bd3db0bf5becac3a38bd56f8421447790c50599cd90fd53417ec4"},
727 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:059a380c08e849b6f312479b18cc63bba2808cff749ad71555f61dd930e3c9a2"},
728 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3f15d5d033fd3dcb85d982d6828ddab94134686fac2c02c13a8822aa03e1321"},
729 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:8deba44fd563361c488dedec90dc330763ee0c01ba54e17df54ef5820079e7e0"},
730 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ef20bf798ffe634090ad3dc2e8aa6a055f08c448810a2f800ab716cc18b80107"},
731 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0deb1d7226ea9da9b18881736d2d96accfa7f328c67b7410478cc064ad1fa6aa"},
732 | {file = "ruff-0.0.254-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27d39d697fdd7df1f2a32c1063756ee269ad8d5345c471ee3ca450636d56e8c6"},
733 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2fc21d060a3197ac463596a97d9b5db2d429395938b270ded61dd60f0e57eb21"},
734 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f70dc93bc9db15cccf2ed2a831938919e3e630993eeea6aba5c84bc274237885"},
735 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_i686.whl", hash = "sha256:09c764bc2bd80c974f7ce1f73a46092c286085355a5711126af351b9ae4bea0c"},
736 | {file = "ruff-0.0.254-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d4385cdd30153b7aa1d8f75dfd1ae30d49c918ead7de07e69b7eadf0d5538a1f"},
737 | {file = "ruff-0.0.254-py3-none-win32.whl", hash = "sha256:c38291bda4c7b40b659e8952167f386e86ec29053ad2f733968ff1d78b4c7e15"},
738 | {file = "ruff-0.0.254-py3-none-win_amd64.whl", hash = "sha256:e15742df0f9a3615fbdc1ee9a243467e97e75bf88f86d363eee1ed42cedab1ec"},
739 | {file = "ruff-0.0.254-py3-none-win_arm64.whl", hash = "sha256:b435afc4d65591399eaf4b2af86e441a71563a2091c386cadf33eaa11064dc09"},
740 | {file = "ruff-0.0.254.tar.gz", hash = "sha256:0eb66c9520151d3bd950ea43b3a088618a8e4e10a5014a72687881e6f3606312"},
741 | ]
742 |
743 | [[package]]
744 | name = "six"
745 | version = "1.16.0"
746 | description = "Python 2 and 3 compatibility utilities"
747 | optional = false
748 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
749 | files = [
750 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
751 | {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
752 | ]
753 |
754 | [[package]]
755 | name = "virtualenv"
756 | version = "20.26.3"
757 | description = "Virtual Python Environment builder"
758 | optional = false
759 | python-versions = ">=3.7"
760 | files = [
761 | {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"},
762 | {file = "virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"},
763 | ]
764 |
765 | [package.dependencies]
766 | distlib = ">=0.3.7,<1"
767 | filelock = ">=3.12.2,<4"
768 | platformdirs = ">=3.9.1,<5"
769 |
770 | [package.extras]
771 | docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
772 | test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
773 |
774 | [[package]]
775 | name = "watchdog"
776 | version = "5.0.2"
777 | description = "Filesystem events monitoring"
778 | optional = false
779 | python-versions = ">=3.9"
780 | files = [
781 | {file = "watchdog-5.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d961f4123bb3c447d9fcdcb67e1530c366f10ab3a0c7d1c0c9943050936d4877"},
782 | {file = "watchdog-5.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72990192cb63872c47d5e5fefe230a401b87fd59d257ee577d61c9e5564c62e5"},
783 | {file = "watchdog-5.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6bec703ad90b35a848e05e1b40bf0050da7ca28ead7ac4be724ae5ac2653a1a0"},
784 | {file = "watchdog-5.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:dae7a1879918f6544201d33666909b040a46421054a50e0f773e0d870ed7438d"},
785 | {file = "watchdog-5.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c4a440f725f3b99133de610bfec93d570b13826f89616377715b9cd60424db6e"},
786 | {file = "watchdog-5.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8b2918c19e0d48f5f20df458c84692e2a054f02d9df25e6c3c930063eca64c1"},
787 | {file = "watchdog-5.0.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:aa9cd6e24126d4afb3752a3e70fce39f92d0e1a58a236ddf6ee823ff7dba28ee"},
788 | {file = "watchdog-5.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f627c5bf5759fdd90195b0c0431f99cff4867d212a67b384442c51136a098ed7"},
789 | {file = "watchdog-5.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d7594a6d32cda2b49df3fd9abf9b37c8d2f3eab5df45c24056b4a671ac661619"},
790 | {file = "watchdog-5.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba32efcccfe2c58f4d01115440d1672b4eb26cdd6fc5b5818f1fb41f7c3e1889"},
791 | {file = "watchdog-5.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:963f7c4c91e3f51c998eeff1b3fb24a52a8a34da4f956e470f4b068bb47b78ee"},
792 | {file = "watchdog-5.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8c47150aa12f775e22efff1eee9f0f6beee542a7aa1a985c271b1997d340184f"},
793 | {file = "watchdog-5.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:14dd4ed023d79d1f670aa659f449bcd2733c33a35c8ffd88689d9d243885198b"},
794 | {file = "watchdog-5.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b84bff0391ad4abe25c2740c7aec0e3de316fdf7764007f41e248422a7760a7f"},
795 | {file = "watchdog-5.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e8d5ff39f0a9968952cce548e8e08f849141a4fcc1290b1c17c032ba697b9d7"},
796 | {file = "watchdog-5.0.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fb223456db6e5f7bd9bbd5cd969f05aae82ae21acc00643b60d81c770abd402b"},
797 | {file = "watchdog-5.0.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9814adb768c23727a27792c77812cf4e2fd9853cd280eafa2bcfa62a99e8bd6e"},
798 | {file = "watchdog-5.0.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:901ee48c23f70193d1a7bc2d9ee297df66081dd5f46f0ca011be4f70dec80dab"},
799 | {file = "watchdog-5.0.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:638bcca3d5b1885c6ec47be67bf712b00a9ab3d4b22ec0881f4889ad870bc7e8"},
800 | {file = "watchdog-5.0.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5597c051587f8757798216f2485e85eac583c3b343e9aa09127a3a6f82c65ee8"},
801 | {file = "watchdog-5.0.2-py3-none-manylinux2014_armv7l.whl", hash = "sha256:53ed1bf71fcb8475dd0ef4912ab139c294c87b903724b6f4a8bd98e026862e6d"},
802 | {file = "watchdog-5.0.2-py3-none-manylinux2014_i686.whl", hash = "sha256:29e4a2607bd407d9552c502d38b45a05ec26a8e40cc7e94db9bb48f861fa5abc"},
803 | {file = "watchdog-5.0.2-py3-none-manylinux2014_ppc64.whl", hash = "sha256:b6dc8f1d770a8280997e4beae7b9a75a33b268c59e033e72c8a10990097e5fde"},
804 | {file = "watchdog-5.0.2-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:d2ab34adc9bf1489452965cdb16a924e97d4452fcf88a50b21859068b50b5c3b"},
805 | {file = "watchdog-5.0.2-py3-none-manylinux2014_s390x.whl", hash = "sha256:7d1aa7e4bb0f0c65a1a91ba37c10e19dabf7eaaa282c5787e51371f090748f4b"},
806 | {file = "watchdog-5.0.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:726eef8f8c634ac6584f86c9c53353a010d9f311f6c15a034f3800a7a891d941"},
807 | {file = "watchdog-5.0.2-py3-none-win32.whl", hash = "sha256:bda40c57115684d0216556671875e008279dea2dc00fcd3dde126ac8e0d7a2fb"},
808 | {file = "watchdog-5.0.2-py3-none-win_amd64.whl", hash = "sha256:d010be060c996db725fbce7e3ef14687cdcc76f4ca0e4339a68cc4532c382a73"},
809 | {file = "watchdog-5.0.2-py3-none-win_ia64.whl", hash = "sha256:3960136b2b619510569b90f0cd96408591d6c251a75c97690f4553ca88889769"},
810 | {file = "watchdog-5.0.2.tar.gz", hash = "sha256:dcebf7e475001d2cdeb020be630dc5b687e9acdd60d16fea6bb4508e7b94cf76"},
811 | ]
812 |
813 | [package.extras]
814 | watchmedo = ["PyYAML (>=3.10)"]
815 |
816 | [[package]]
817 | name = "zipp"
818 | version = "3.20.1"
819 | description = "Backport of pathlib-compatible object wrapper for zip files"
820 | optional = false
821 | python-versions = ">=3.8"
822 | files = [
823 | {file = "zipp-3.20.1-py3-none-any.whl", hash = "sha256:9960cd8967c8f85a56f920d5d507274e74f9ff813a0ab8889a5b5be2daf44064"},
824 | {file = "zipp-3.20.1.tar.gz", hash = "sha256:c22b14cc4763c5a5b04134207736c107db42e9d3ef2d9779d465f5f1bcba572b"},
825 | ]
826 |
827 | [package.extras]
828 | check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
829 | cover = ["pytest-cov"]
830 | doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
831 | enabler = ["pytest-enabler (>=2.2)"]
832 | test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
833 | type = ["pytest-mypy"]
834 |
835 | [metadata]
836 | lock-version = "2.0"
837 | python-versions = "^3.11"
838 | content-hash = "b2bbb36ae36bc6bb07e5b595bbd95c8e0c2a980e2628710bfb908e946c2a0dbc"
839 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "mack"
3 | version = "0.5.0"
4 | description = ""
5 | authors = ["Matthew Powers "]
6 | readme = "README.md"
7 |
8 | [tool.poetry.dependencies]
9 | python = "^3.11"
10 |
11 | [tool.poetry.dev-dependencies]
12 | pre-commit = "^2.20.0"
13 | pyspark = "3.5.0"
14 | delta-spark = "3.2.0"
15 | pytest = "7.2.0"
16 | chispa = "0.9.2"
17 | pytest-describe = "^1.0.0"
18 | ruff = "^0.0.254"
19 |
20 | [tool.poetry.group.mkdocs]
21 | optional = true
22 |
23 | [tool.poetry.group.mkdocs.dependencies]
24 | mkdocstrings-python = "^0.8.3"
25 | mkdocs-gen-files = "^0.4.0"
26 | mkdocs-literate-nav = "^0.6.0"
27 | mkdocs-section-index = "^0.3.5"
28 | markdown-include = "^0.8.1"
29 | mkdocs = "^1.4.2"
30 |
31 | [build-system]
32 | requires = ["poetry-core"]
33 | build-backend = "poetry.core.masonry.api"
34 |
35 | [tool.black]
36 | include = '\.pyi?$'
37 | exclude = '''
38 | /(
39 | \.git
40 | | \.hg
41 | | \.mypy_cache
42 | | \.tox
43 | | \.venv
44 | | _build
45 | | buck-out
46 | | build
47 | | dist
48 | )/
49 | '''
50 |
51 | [tool.ruff]
52 | line-length = 150
53 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MrPowers/mack/396f297d7e4db7feb1d1b1825c27d0076aa8e3e0/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_public_interface.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import chispa
3 | import pyspark
4 | from delta import DeltaTable, configure_spark_with_delta_pip
5 | from datetime import datetime as dt
6 | from pyspark.sql.types import (
7 | StructType,
8 | StructField,
9 | StringType,
10 | IntegerType,
11 | BooleanType,
12 | DateType,
13 | TimestampType,
14 | )
15 | import mack
16 |
17 | builder = (
18 | pyspark.sql.SparkSession.builder.appName("MyApp")
19 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
20 | .config(
21 | "spark.sql.catalog.spark_catalog",
22 | "org.apache.spark.sql.delta.catalog.DeltaCatalog",
23 | )
24 | .config("spark.sql.shuffle.partitions", "2")
25 | )
26 |
27 | spark = configure_spark_with_delta_pip(builder).getOrCreate()
28 |
29 |
30 | # upsert
31 | def test_upserts_with_single_attribute(tmp_path):
32 | path = f"{tmp_path}/tmp/delta-upsert-single-attr"
33 | data2 = [
34 | (1, "A", True, dt(2019, 1, 1), None),
35 | (2, "B", True, dt(2019, 1, 1), None),
36 | (4, "D", True, dt(2019, 1, 1), None),
37 | ]
38 | schema = StructType(
39 | [
40 | StructField("pkey", IntegerType(), True),
41 | StructField("attr", StringType(), True),
42 | StructField("is_current", BooleanType(), True),
43 | StructField("effective_time", TimestampType(), True),
44 | StructField("end_time", TimestampType(), True),
45 | ]
46 | )
47 | df = spark.createDataFrame(data=data2, schema=schema)
48 | df.write.format("delta").save(path)
49 |
50 | updates_data = [
51 | (2, "Z", dt(2020, 1, 1)), # value to upsert
52 | (3, "C", dt(2020, 9, 15)), # new value
53 | ]
54 | updates_schema = StructType(
55 | [
56 | StructField("pkey", IntegerType(), True),
57 | StructField("attr", StringType(), True),
58 | StructField("effective_time", TimestampType(), True),
59 | ]
60 | )
61 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema)
62 |
63 | delta_table = DeltaTable.forPath(spark, path)
64 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr"])
65 |
66 | actual_df = spark.read.format("delta").load(path)
67 |
68 | expected_df = spark.createDataFrame(
69 | [
70 | (2, "B", False, dt(2019, 1, 1), dt(2020, 1, 1)),
71 | (3, "C", True, dt(2020, 9, 15), None),
72 | (2, "Z", True, dt(2020, 1, 1), None),
73 | (4, "D", True, dt(2019, 1, 1), None),
74 | (1, "A", True, dt(2019, 1, 1), None),
75 | ],
76 | schema,
77 | )
78 |
79 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True)
80 |
81 |
82 | def test_errors_out_if_base_df_does_not_have_all_required_columns(tmp_path):
83 | path = f"{tmp_path}/tmp/delta-incomplete"
84 | data2 = [
85 | ("A", True, dt(2019, 1, 1), None),
86 | ("B", True, dt(2019, 1, 1), None),
87 | ("D", True, dt(2019, 1, 1), None),
88 | ]
89 | schema = StructType(
90 | [
91 | # pkey is missing from base!
92 | StructField("attr", StringType(), True),
93 | StructField("is_current", BooleanType(), True),
94 | StructField("effective_time", TimestampType(), True),
95 | StructField("end_time", TimestampType(), True),
96 | ]
97 | )
98 | df = spark.createDataFrame(data=data2, schema=schema)
99 | df.write.format("delta").save(path)
100 |
101 | updates_data = [
102 | (2, "Z", dt(2020, 1, 1)), # value to upsert
103 | (3, "C", dt(2020, 9, 15)), # new value
104 | ]
105 | updates_schema = StructType(
106 | [
107 | StructField("pkey", IntegerType(), True),
108 | StructField("attr", StringType(), True),
109 | StructField("effective_time", TimestampType(), True),
110 | ]
111 | )
112 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema)
113 |
114 | delta_table = DeltaTable.forPath(spark, path)
115 | with pytest.raises(TypeError):
116 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr"])
117 |
118 |
119 | def test_errors_out_if_updates_table_does_not_contain_all_required_columns(tmp_path):
120 | path = f"{tmp_path}/tmp/delta-error-update-missing-col"
121 | data2 = [
122 | (1, "A", True, dt(2019, 1, 1), None),
123 | (2, "B", True, dt(2019, 1, 1), None),
124 | (4, "D", True, dt(2019, 1, 1), None),
125 | ]
126 | schema = StructType(
127 | [
128 | StructField("pkey", IntegerType(), True),
129 | StructField("attr", StringType(), True),
130 | StructField("is_current", BooleanType(), True),
131 | StructField("effective_time", TimestampType(), True),
132 | StructField("end_time", TimestampType(), True),
133 | ]
134 | )
135 | df = spark.createDataFrame(data=data2, schema=schema)
136 | df.write.format("delta").save(path)
137 |
138 | updates_data = [
139 | ("Z", dt(2020, 1, 1)), # value to upsert
140 | ("C", dt(2020, 9, 15)), # new value
141 | ]
142 | updates_schema = StructType(
143 | [
144 | # pkey is missing from updates DataFrame
145 | StructField("attr", StringType(), True),
146 | StructField("effective_time", TimestampType(), True),
147 | ]
148 | )
149 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema)
150 |
151 | delta_table = DeltaTable.forPath(spark, path)
152 | with pytest.raises(TypeError):
153 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr"])
154 |
155 |
156 | def test_upserts_based_on_multiple_attributes(tmp_path):
157 | path = f"{tmp_path}/tmp/delta-upsert-multiple-attr"
158 | data2 = [
159 | (1, "A", "A", True, dt(2019, 1, 1), None),
160 | (2, "B", "B", True, dt(2019, 1, 1), None),
161 | (4, "D", "D", True, dt(2019, 1, 1), None),
162 | ]
163 | schema = StructType(
164 | [
165 | StructField("pkey", IntegerType(), True),
166 | StructField("attr1", StringType(), True),
167 | StructField("attr2", StringType(), True),
168 | StructField("is_current", BooleanType(), True),
169 | StructField("effective_time", TimestampType(), True),
170 | StructField("end_time", TimestampType(), True),
171 | ]
172 | )
173 | df = spark.createDataFrame(data=data2, schema=schema)
174 | df.write.format("delta").save(path)
175 |
176 | updates_data = [
177 | (2, "Z", None, dt(2020, 1, 1)), # value to upsert
178 | (3, "C", "C", dt(2020, 9, 15)), # new value
179 | ]
180 | updates_schema = StructType(
181 | [
182 | StructField("pkey", IntegerType(), True),
183 | StructField("attr1", StringType(), True),
184 | StructField("attr2", StringType(), True),
185 | StructField("effective_time", TimestampType(), True),
186 | ]
187 | )
188 | updates_df = spark.createDataFrame(data=updates_data, schema=updates_schema)
189 |
190 | delta_table = DeltaTable.forPath(spark, path)
191 | mack.type_2_scd_upsert(delta_table, updates_df, "pkey", ["attr1", "attr2"])
192 |
193 | actual_df = spark.read.format("delta").load(path)
194 |
195 | expected_df = spark.createDataFrame(
196 | [
197 | (2, "B", "B", False, dt(2019, 1, 1), dt(2020, 1, 1)),
198 | (3, "C", "C", True, dt(2020, 9, 15), None),
199 | (2, "Z", None, True, dt(2020, 1, 1), None),
200 | (4, "D", "D", True, dt(2019, 1, 1), None),
201 | (1, "A", "A", True, dt(2019, 1, 1), None),
202 | ],
203 | schema,
204 | )
205 |
206 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True)
207 |
208 |
209 | # def describe_type_2_scd_generic_upsert():
210 | # type_2_scd_generic_upsert
211 | def test_upserts_based_on_date_columns(tmp_path):
212 | path = f"{tmp_path}/tmp/delta-upsert-date"
213 | # create Delta Lake
214 | data2 = [
215 | (1, "A", True, dt(2019, 1, 1), None),
216 | (2, "B", True, dt(2019, 1, 1), None),
217 | (4, "D", True, dt(2019, 1, 1), None),
218 | ]
219 |
220 | schema = StructType(
221 | [
222 | StructField("pkey", IntegerType(), True),
223 | StructField("attr", StringType(), True),
224 | StructField("cur", BooleanType(), True),
225 | StructField("effective_date", DateType(), True),
226 | StructField("end_date", DateType(), True),
227 | ]
228 | )
229 |
230 | df = spark.createDataFrame(data=data2, schema=schema)
231 | df.write.format("delta").save(path)
232 |
233 | # create updates DF
234 | updates_df = spark.createDataFrame(
235 | [
236 | (3, "C", dt(2020, 9, 15)), # new value
237 | (2, "Z", dt(2020, 1, 1)), # value to upsert
238 | ]
239 | ).toDF("pkey", "attr", "effective_date")
240 |
241 | # perform upsert
242 | delta_table = DeltaTable.forPath(spark, path)
243 | mack.type_2_scd_generic_upsert(
244 | delta_table, updates_df, "pkey", ["attr"], "cur", "effective_date", "end_date"
245 | )
246 |
247 | actual_df = spark.read.format("delta").load(path)
248 |
249 | expected_df = spark.createDataFrame(
250 | [
251 | (2, "B", False, dt(2019, 1, 1), dt(2020, 1, 1)),
252 | (3, "C", True, dt(2020, 9, 15), None),
253 | (2, "Z", True, dt(2020, 1, 1), None),
254 | (4, "D", True, dt(2019, 1, 1), None),
255 | (1, "A", True, dt(2019, 1, 1), None),
256 | ],
257 | schema,
258 | )
259 |
260 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True)
261 |
262 |
263 | def test_upserts_based_on_version_number(tmp_path):
264 | path = f"{tmp_path}/tmp/delta-upsert-version"
265 | # create Delta Lake
266 | data2 = [
267 | (1, "A", True, 1, None),
268 | (2, "B", True, 1, None),
269 | (4, "D", True, 1, None),
270 | ]
271 |
272 | schema = StructType(
273 | [
274 | StructField("pkey", IntegerType(), True),
275 | StructField("attr", StringType(), True),
276 | StructField("is_current", BooleanType(), True),
277 | StructField("effective_ver", IntegerType(), True),
278 | StructField("end_ver", IntegerType(), True),
279 | ]
280 | )
281 |
282 | df = spark.createDataFrame(data=data2, schema=schema)
283 |
284 | df.write.format("delta").save(path)
285 |
286 | # create updates DF
287 | updates_df = spark.createDataFrame(
288 | [
289 | (2, "Z", 2), # value to upsert
290 | (3, "C", 3), # new value
291 | ]
292 | ).toDF("pkey", "attr", "effective_ver")
293 |
294 | # perform upsert
295 | delta_table = DeltaTable.forPath(spark, path)
296 | mack.type_2_scd_generic_upsert(
297 | delta_table,
298 | updates_df,
299 | "pkey",
300 | ["attr"],
301 | "is_current",
302 | "effective_ver",
303 | "end_ver",
304 | )
305 |
306 | # show result
307 | res = spark.read.format("delta").load(path)
308 |
309 | expected_data = [
310 | (2, "B", False, 1, 2),
311 | (3, "C", True, 3, None),
312 | (2, "Z", True, 2, None),
313 | (4, "D", True, 1, None),
314 | (1, "A", True, 1, None),
315 | ]
316 |
317 | expected = spark.createDataFrame(expected_data, schema)
318 |
319 | chispa.assert_df_equality(res, expected, ignore_row_order=True)
320 |
321 |
322 | def test_upserts_does_not_insert_duplicate(tmp_path):
323 | path = f"{tmp_path}/tmp/delta-no-duplicate"
324 | # create Delta Lake
325 | data2 = [
326 | (1, "A", True, dt(2019, 1, 1), None),
327 | (2, "B", True, dt(2019, 1, 1), None),
328 | (4, "D", True, dt(2019, 1, 1), None),
329 | ]
330 |
331 | schema = StructType(
332 | [
333 | StructField("pkey", IntegerType(), True),
334 | StructField("attr", StringType(), True),
335 | StructField("cur", BooleanType(), True),
336 | StructField("effective_date", DateType(), True),
337 | StructField("end_date", DateType(), True),
338 | ]
339 | )
340 |
341 | df = spark.createDataFrame(data=data2, schema=schema)
342 | df.write.format("delta").save(path)
343 |
344 | # create updates DF
345 | updates_df = spark.createDataFrame(
346 | [
347 | (1, "A", dt(2019, 1, 1)), # duplicate row
348 | ]
349 | ).toDF("pkey", "attr", "effective_date")
350 |
351 | # perform upsert
352 | delta_table = DeltaTable.forPath(spark, path)
353 | mack.type_2_scd_generic_upsert(
354 | delta_table, updates_df, "pkey", ["attr"], "cur", "effective_date", "end_date"
355 | )
356 |
357 | actual_df = spark.read.format("delta").load(path)
358 |
359 | expected_df = spark.createDataFrame(
360 | [
361 | (1, "A", True, dt(2019, 1, 1), None),
362 | (2, "B", True, dt(2019, 1, 1), None),
363 | (4, "D", True, dt(2019, 1, 1), None),
364 | ],
365 | schema,
366 | )
367 |
368 | chispa.assert_df_equality(actual_df, expected_df, ignore_row_order=True)
369 |
370 |
371 | # def describe_kill_duplicates():
372 | def test_kills_duplicates_in_a_delta_table(tmp_path):
373 | path = f"{tmp_path}/deduplicate1"
374 | data = [
375 | (1, "A", "A"), # duplicate
376 | (2, "A", "B"),
377 | (3, "A", "A"), # duplicate
378 | (4, "A", "A"), # duplicate
379 | (5, "B", "B"), # duplicate
380 | (6, "D", "D"),
381 | (9, "B", "B"), # duplicate
382 | ]
383 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
384 | df.write.format("delta").save(path)
385 |
386 | delta_table = DeltaTable.forPath(spark, path)
387 |
388 | mack.kill_duplicates(delta_table, ["col3", "col2"])
389 |
390 | res = spark.read.format("delta").load(path)
391 |
392 | expected_data = [
393 | (2, "A", "B"),
394 | (6, "D", "D"),
395 | ]
396 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3"])
397 |
398 | chispa.assert_df_equality(res, expected, ignore_row_order=True)
399 |
400 |
401 | def test_drop_duplicates_pkey_in_a_delta_table(tmp_path):
402 | path = f"{tmp_path}/drop_duplicates_pkey"
403 | data = [
404 | (1, "A", "A", "C"), # duplicate
405 | (2, "A", "B", "C"),
406 | (3, "A", "A", "D"), # duplicate
407 | (4, "A", "A", "E"), # duplicate
408 | (5, "B", "B", "C"), # duplicate
409 | (6, "D", "D", "C"),
410 | (9, "B", "B", "E"), # duplicate
411 | ]
412 | df = spark.createDataFrame(data, ["col1", "col2", "col3", "col4"])
413 | df.write.format("delta").save(path)
414 |
415 | delta_table = DeltaTable.forPath(spark, path)
416 |
417 | mack.drop_duplicates_pkey(delta_table, "col1", ["col2", "col3"])
418 |
419 | res = spark.read.format("delta").load(path)
420 |
421 | expected_data = [
422 | (1, "A", "A", "C"),
423 | (2, "A", "B", "C"),
424 | (5, "B", "B", "C"),
425 | (6, "D", "D", "C"),
426 | ]
427 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3", "col4"])
428 |
429 | chispa.assert_df_equality(res, expected, ignore_row_order=True)
430 |
431 |
432 | def test_drop_duplicates_pkey_in_a_delta_table_no_duplication_cols(tmp_path):
433 | path = f"{tmp_path}/drop_duplicates_pkey_no_duplication_cols"
434 | data = [
435 | (1, "A", "A", "C"), # duplicate
436 | (1, "A", "A", "C"), # duplicate
437 | (1, "A", "A", "C"), # duplicate
438 | (1, "A", "A", "C"), # duplicate
439 | ]
440 | df = spark.createDataFrame(data, ["col1", "col2", "col3", "col4"])
441 | df.write.format("delta").save(path)
442 |
443 | delta_table = DeltaTable.forPath(spark, path)
444 |
445 | with pytest.raises(TypeError):
446 | mack.drop_duplicates_pkey(delta_table, "col1", [])
447 |
448 |
449 | def test_drop_duplicates_in_a_delta_table(tmp_path):
450 | path = f"{tmp_path}/drop_duplicates"
451 | data = [
452 | (1, "A", "A", "C"), # duplicate
453 | (1, "A", "A", "C"), # duplicate
454 | (1, "A", "A", "C"), # duplicate
455 | (1, "A", "A", "C"), # duplicate
456 | ]
457 | df = spark.createDataFrame(data, ["col1", "col2", "col3", "col4"])
458 | df.write.format("delta").save(path)
459 |
460 | delta_table = DeltaTable.forPath(spark, path)
461 |
462 | mack.drop_duplicates(delta_table, ["col1"])
463 |
464 | res = spark.read.format("delta").load(path)
465 |
466 | expected_data = [
467 | (1, "A", "A", "C"),
468 | ]
469 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3", "col4"])
470 |
471 | chispa.assert_df_equality(res, expected, ignore_row_order=True)
472 |
473 |
474 | def test_copy_delta_table(tmp_path):
475 | path = f"{tmp_path}/copy_test_1"
476 | data = [
477 | (1, "A", "A"),
478 | (2, "A", "B"),
479 | ]
480 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
481 |
482 | (
483 | df.write.format("delta")
484 | .partitionBy(["col1"])
485 | .option("delta.logRetentionDuration", "interval 30 days")
486 | .save(path)
487 | )
488 |
489 | origin_table = DeltaTable.forPath(spark, path)
490 | origin_details = origin_table.detail().select("partitionColumns", "properties")
491 |
492 | mack.copy_table(origin_table, f"{tmp_path}/copy_test_2")
493 |
494 | copied_table = DeltaTable.forPath(spark, f"{tmp_path}/copy_test_2")
495 | copied_details = copied_table.detail().select("partitionColumns", "properties")
496 |
497 | chispa.assert_df_equality(origin_details, copied_details)
498 | chispa.assert_df_equality(
499 | origin_table.toDF(), copied_table.toDF(), ignore_row_order=True
500 | )
501 |
502 |
503 | # append without duplicates
504 | def test_append_without_duplicates_single_column(tmp_path):
505 | path = f"{tmp_path}/append_without_duplicates"
506 | data = [
507 | (1, "A", "B"),
508 | (2, "C", "D"),
509 | (3, "E", "F"),
510 | ]
511 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
512 | df.write.format("delta").save(path)
513 |
514 | delta_table = DeltaTable.forPath(spark, path)
515 |
516 | append_df = spark.createDataFrame(
517 | [
518 | (2, "R", "T"), # duplicate
519 | (8, "A", "B"),
520 | (8, "B", "C"), # duplicate
521 | (10, "X", "Y"),
522 | ],
523 | ["col1", "col2", "col3"],
524 | )
525 |
526 | mack.append_without_duplicates(delta_table, append_df, ["col1"])
527 |
528 | appended_data = spark.read.format("delta").load(path)
529 |
530 | expected_data = [
531 | (1, "A", "B"),
532 | (2, "C", "D"),
533 | (3, "E", "F"),
534 | (8, "A", "B"),
535 | (10, "X", "Y"),
536 | ]
537 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3"])
538 | chispa.assert_df_equality(appended_data, expected, ignore_row_order=True)
539 |
540 |
541 | def test_validate_append(tmp_path):
542 | path = f"{tmp_path}/validate_append"
543 |
544 | def append_fun(delta_table, append_df):
545 | mack.validate_append(
546 | delta_table,
547 | append_df,
548 | required_cols=["col1", "col2"],
549 | optional_cols=["col4"],
550 | )
551 |
552 | # Create Delta table
553 | data = [
554 | (1, "a", "A"),
555 | (2, "b", "B"),
556 | ]
557 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
558 | df.write.format("delta").save(path)
559 |
560 | # Demonstrate that a DataFrame containing the required columns plus an allowed optional column can be appended
561 | delta_table = DeltaTable.forPath(spark, path)
562 | append_df = spark.createDataFrame(
563 | [
564 | (3, "c", "cat"),
565 | (4, "d", "dog"),
566 | ],
567 | ["col1", "col2", "col4"],
568 | )
569 | append_fun(delta_table, append_df)
570 |
571 | expected_data = [
572 | (1, "a", "A", None),
573 | (2, "b", "B", None),
574 | (3, "c", None, "cat"),
575 | (4, "d", None, "dog"),
576 | ]
577 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3", "col4"])
578 | chispa.assert_df_equality(
579 | spark.read.format("delta").load(path), expected, ignore_row_order=True
580 | )
581 |
582 | # demonstrate that DataFrames with columns that are not on the accept list cannot be appended
583 | append_df = spark.createDataFrame(
584 | [
585 | (4, "b", "A"),
586 | (5, "y", "C"),
587 | (6, "z", "D"),
588 | ],
589 | ["col1", "col2", "col5"],
590 | )
591 | with pytest.raises(TypeError):
592 | mack.validate_append(
593 | delta_table,
594 | append_df,
595 | required_cols=["col1", "col2"],
596 | optional_cols=["col4"],
597 | )
598 |
599 | # demonstrate that DataFrames with missing required columns cannot be appended
600 | append_df = spark.createDataFrame(
601 | [
602 | (4, "A"),
603 | (5, "C"),
604 | (6, "D"),
605 | ],
606 | ["col1", "col4"],
607 | )
608 | with pytest.raises(TypeError):
609 | mack.validate_append(
610 | delta_table,
611 | append_df,
612 | required_cols=["col1", "col2"],
613 | optional_cols=["col4"],
614 | )
615 |
616 |
617 | def test_append_without_duplicates_multi_column(tmp_path):
618 | path = f"{tmp_path}/append_without_duplicates"
619 | data = [
620 | (1, "a", "A"),
621 | (2, "b", "R"),
622 | (3, "c", "X"),
623 | ]
624 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
625 | df.write.format("delta").save(path)
626 |
627 | delta_table = DeltaTable.forPath(spark, path)
628 |
629 | append_data = spark.createDataFrame(
630 | [
631 | (2, "b", "R"), # duplicate col1, col2
632 | (2, "x", "R"), # NOT duplicate col1, col2
633 | (8, "y", "F"),
634 | (10, "z", "U"),
635 | ],
636 | ["col1", "col2", "col3"],
637 | )
638 |
639 | mack.append_without_duplicates(delta_table, append_data, ["col1", "col2"])
640 |
641 | appended_data = spark.read.format("delta").load(path)
642 |
643 | expected_data = [
644 | (1, "a", "A"),
645 | (2, "b", "R"),
646 | (2, "x", "R"),
647 | (3, "c", "X"),
648 | (8, "y", "F"),
649 | (10, "z", "U"),
650 | ]
651 | expected = spark.createDataFrame(expected_data, ["col1", "col2", "col3"])
652 | chispa.assert_df_equality(appended_data, expected, ignore_row_order=True)
653 |
654 |
655 | def test_is_composite_key_candidate(tmp_path):
656 | path = f"{tmp_path}/is_composite_key_candidate"
657 | data = [
658 | (1, "a", "A"),
659 | (2, "b", "R"),
660 | (2, "c", "D"),
661 | (3, "e", "F"),
662 | ]
663 |
664 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
665 | df.write.format("delta").save(path)
666 |
667 | delta_table = DeltaTable.forPath(spark, path)
668 |
669 | assert not mack.is_composite_key_candidate(delta_table, ["col1"])
670 | assert mack.is_composite_key_candidate(delta_table, ["col1", "col2"])
671 |
672 |
673 | def test_delta_file_sizes(tmp_path):
674 | path = f"{tmp_path}/delta_file_sizes"
675 | data = [
676 | (1, "A", "A"),
677 | (2, "A", "B"),
678 | ]
679 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
680 |
681 | (
682 | df.write.format("delta")
683 | .partitionBy(["col1"])
684 | .option("delta.logRetentionDuration", "interval 30 days")
685 | .save(path)
686 | )
687 |
688 | delta_table = DeltaTable.forPath(spark, path)
689 |
690 | result = mack.delta_file_sizes(delta_table)
691 |
692 | expected_result = {
693 | "size_in_bytes": 1320,
694 | "number_of_files": 2,
695 | "average_file_size_in_bytes": 660,
696 | }
697 |
698 | assert result == expected_result
699 |
700 |
701 | def test_show_delta_file_sizes(capfd, tmp_path):
702 | path = f"{tmp_path}/show_delta_file_sizes"
703 | data = [
704 | (1, "A", "A"),
705 | (2, "A", "B"),
706 | ]
707 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
708 |
709 | (df.write.format("delta").partitionBy(["col1"]).save(path))
710 |
711 | delta_table = DeltaTable.forPath(spark, path)
712 |
713 | mack.show_delta_file_sizes(delta_table)
714 |
715 | out, _ = capfd.readouterr()
716 |
717 | assert (
718 | out
719 | == "The delta table contains 2 files with a size of 1.32 kB. The average file size is 660.0 B\n"
720 | )
721 |
722 |
723 | def test_humanize_bytes_formats_nicely():
724 | assert mack.humanize_bytes(12345678) == "12.35 MB"
725 | assert mack.humanize_bytes(1234567890) == "1.23 GB"
726 | assert mack.humanize_bytes(1234567890000) == "1.23 TB"
727 | assert mack.humanize_bytes(1234567890000000) == "1.23 PB"
728 |
729 |
730 | def test_humanize_bytes_binary_formats_nicely():
731 | assert mack.humanize_bytes_binary(12345678) == "11.77 MB"
732 | assert mack.humanize_bytes_binary(1234567890) == "1.15 GB"
733 | assert mack.humanize_bytes_binary(1234567890000) == "1.12 TB"
734 | assert mack.humanize_bytes_binary(1234567890000000) == "1.10 PB"
735 |
736 |
737 | def test_find_composite_key(tmp_path):
738 | path = f"{tmp_path}/find_composite_key"
739 | data = [
740 | (1, "a", "z"),
741 | (1, "a", "b"),
742 | (3, "c", "b"),
743 | ]
744 | df = spark.createDataFrame(
745 | data,
746 | [
747 | "col1",
748 | "col2",
749 | "col3",
750 | ],
751 | )
752 | df.write.format("delta").save(path)
753 |
754 | delta_table = DeltaTable.forPath(spark, path)
755 |
756 | composite_keys = mack.find_composite_key_candidates(delta_table)
757 |
758 | expected_keys = ["col1", "col3"]
759 |
760 | assert composite_keys == expected_keys
761 |
762 |
763 | def test_find_composite_key_with_value_error(tmp_path):
764 | path = f"{tmp_path}/find_composite_key"
765 | data = [
766 | (1, "a", "A"),
767 | (2, "b", "R"),
768 | (2, "c", "D"),
769 | (3, "e", "F"),
770 | ]
771 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
772 | df.write.format("delta").save(path)
773 |
774 | delta_table = DeltaTable.forPath(spark, path)
775 | with pytest.raises(
776 | ValueError, match="No composite key candidates could be identified."
777 | ):
778 | mack.find_composite_key_candidates(delta_table, ["col2", "col3"])
779 |
780 |
781 | def test_with_md5_cols(tmp_path):
782 | path = f"{tmp_path}/with_md5_cols"
783 | data = [
784 | (1, "a", None),
785 | (2, "b", "b"),
786 | (3, "c", "c"),
787 | ]
788 | df = spark.createDataFrame(
789 | data,
790 | [
791 | "col1",
792 | "col2",
793 | "col3",
794 | ],
795 | )
796 | df.write.format("delta").save(path)
797 |
798 | delta_table = DeltaTable.forPath(spark, path)
799 | with_md5 = mack.with_md5_cols(delta_table, ["col2", "col3"])
800 |
801 | expected_data = [
802 | (1, "a", None, "0cc175b9c0f1b6a831c399e269772661"),
803 | (2, "b", "b", "1eeaac3814eb80cc40efb005cf0b9141"),
804 | (3, "c", "c", "4e202f8309e7b00349c70845ab02fce9"),
805 | ]
806 | expected_df = spark.createDataFrame(
807 | expected_data,
808 | ["col1", "col2", "col3", "md5_col2_col3"],
809 | )
810 | chispa.assert_df_equality(
811 | with_md5, expected_df, ignore_row_order=True, ignore_nullable=True
812 | )
813 |
814 |
815 | def test_latest_version(tmp_path):
816 | path = f"{tmp_path}/latestversion"
817 |
818 | data = [
819 | (1, "a", None),
820 | (2, "b", "b"),
821 | (3, "c", "c"),
822 | ]
823 | df = spark.createDataFrame(
824 | data,
825 | [
826 | "col1",
827 | "col2",
828 | "col3",
829 | ],
830 | )
831 | df.write.format("delta").save(path)
832 |
833 | # append the same DataFrame twice more, producing table versions 0, 1 and 2
834 | df.write.format("delta").mode("append").save(path)
835 | df.write.format("delta").mode("append").save(path)
836 |
837 | delta_table = DeltaTable.forPath(spark, path)
838 | latest_version = mack.latest_version(delta_table)
839 | assert latest_version == 2
840 |
841 |
842 | def test_constraint_append_no_constraint(tmp_path):
843 |
844 | target_path = f"{tmp_path}/constraint_append/target_table"
845 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table"
846 |
847 | data = [
848 | (1, "A", "B"),
849 | (2, "C", "D"),
850 | (3, "E", "F"),
851 | ]
852 |
853 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
854 | df.write.format("delta").save(target_path)
855 |
856 | df2 = spark.createDataFrame([], df.schema)
857 | df2.write.format("delta").save(quarantine_path)
858 |
859 | target_table = DeltaTable.forPath(spark, target_path)
860 | append_df = spark.createDataFrame([], df.schema)
861 | quarantine_table = DeltaTable.forPath(spark, quarantine_path)
862 |
863 | # demonstrate that the function cannot be run with target table not having constraints
864 | with pytest.raises(
865 | TypeError, match="There are no constraints present in the target delta table"
866 | ):
867 | mack.constraint_append(target_table, append_df, quarantine_table)
868 |
869 |
870 | def test_constraint_append_multi_constraint(tmp_path):
871 |
872 | target_path = f"{tmp_path}/constraint_append/target_table"
873 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table"
874 |
875 | data = [
876 | (1, "A", "B"),
877 | (2, "C", "D"),
878 | (3, "E", "F"),
879 | ]
880 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
881 |
882 | df.write.format("delta").save(target_path)
883 |
884 | df2 = spark.createDataFrame([], df.schema)
885 | df2.write.format("delta").save(quarantine_path)
886 |
887 | target_table = DeltaTable.forPath(spark, target_path)
888 |
889 | # adding two constraints
890 | spark.sql(
891 | f"ALTER TABLE delta.`{target_path}` ADD CONSTRAINT col1_constraint CHECK (col1 > 0) "
892 | )
893 | spark.sql(
894 | f"ALTER TABLE delta.`{target_path}` ADD CONSTRAINT col2_constraint CHECK (col2 != 'Z') "
895 | )
896 |
897 | # adding other table properties
898 | spark.sql(
899 | f"ALTER TABLE delta.`{target_path}` SET TBLPROPERTIES('this.is.my.key' = 12, this.is.my.key2 = true)"
900 | )
901 |
902 | append_data = [
903 | (0, "Z", "Z"),
904 | (4, "A", "B"),
905 | (5, "C", "D"),
906 | (6, "E", "F"),
907 | (9, "G", "G"),
908 | (11, "Z", "Z"),
909 | ]
910 | append_df = spark.createDataFrame(append_data, ["col1", "col2", "col3"])
911 |
912 | # testing with two constraints
913 | target_table = DeltaTable.forPath(spark, target_path)
914 | quarantine_table = DeltaTable.forPath(spark, quarantine_path)
915 | mack.constraint_append(target_table, append_df, quarantine_table)
916 |
917 | expected_data = [
918 | (1, "A", "B"),
919 | (2, "C", "D"),
920 | (3, "E", "F"),
921 | (4, "A", "B"),
922 | (5, "C", "D"),
923 | (6, "E", "F"),
924 | (9, "G", "G"),
925 | ]
926 | expected_df = spark.createDataFrame(expected_data, ["col1", "col2", "col3"])
927 |
928 | appended_data = spark.read.format("delta").load(target_path)
929 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True)
930 |
931 | expected_quarantined_data = [(0, "Z", "Z"), (11, "Z", "Z")]
932 | expected_quarantined_df = spark.createDataFrame(
933 | expected_quarantined_data, ["col1", "col2", "col3"]
934 | )
935 |
936 | quarantined_data = spark.read.format("delta").load(quarantine_path)
937 | chispa.assert_df_equality(
938 | quarantined_data, expected_quarantined_df, ignore_row_order=True
939 | )
940 |
941 |
942 | def test_constraint_append_single_constraint(tmp_path):
943 |
944 | target_path = f"{tmp_path}/constraint_append/target_table"
945 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table"
946 |
947 | data = [
948 | (1, "A", "B"),
949 | (2, "C", "D"),
950 | (3, "E", "F"),
951 | ]
952 | df = spark.createDataFrame(data, ["col1", "col2", "col3"])
953 |
954 | df.write.format("delta").save(target_path)
955 |
956 | df2 = spark.createDataFrame([], df.schema)
957 | df2.write.format("delta").save(quarantine_path)
958 |
959 | target_table = DeltaTable.forPath(spark, target_path)
960 |
961 | # adding a single constraint
962 | spark.sql(
963 | f"ALTER TABLE delta.`{target_path}` ADD CONSTRAINT col1_constraint CHECK (col1 > 0) "
964 | )
965 |
966 | append_data = [
967 | (0, "Z", "Z"),
968 | (4, "A", "B"),
969 | (5, "C", "D"),
970 | (6, "E", "F"),
971 | (9, "G", "G"),
972 | (11, "Z", "Z"),
973 | ]
974 | append_df = spark.createDataFrame(append_data, ["col1", "col2", "col3"])
975 |
976 | # testing with a single constraint
977 | target_table = DeltaTable.forPath(spark, target_path)
978 | quarantine_table = DeltaTable.forPath(spark, quarantine_path)
979 | mack.constraint_append(target_table, append_df, quarantine_table)
980 |
981 | expected_data = [
982 | (1, "A", "B"),
983 | (2, "C", "D"),
984 | (3, "E", "F"),
985 | (4, "A", "B"),
986 | (5, "C", "D"),
987 | (6, "E", "F"),
988 | (9, "G", "G"),
989 | (11, "Z", "Z"),
990 | ]
991 | expected_df = spark.createDataFrame(expected_data, ["col1", "col2", "col3"])
992 |
993 | appended_data = spark.read.format("delta").load(target_path)
994 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True)
995 |
996 | expected_quarantined_data = [(0, "Z", "Z")]
997 | expected_quarantined_df = spark.createDataFrame(
998 | expected_quarantined_data, ["col1", "col2", "col3"]
999 | )
1000 |
1001 | quarantined_data = spark.read.format("delta").load(quarantine_path)
1002 | chispa.assert_df_equality(
1003 | quarantined_data, expected_quarantined_df, ignore_row_order=True
1004 | )
1005 |
1006 |
1007 | def test_constraint_append_notnull_constraint(tmp_path):
1008 |
1009 | target_path = f"{tmp_path}/constraint_append/target_table"
1010 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table"
1011 |
1012 | target_schema = StructType(
1013 | [
1014 | StructField("col1", IntegerType(), False),
1015 | StructField("col2", StringType(), True),
1016 | StructField("col3", StringType(), False),
1017 | ]
1018 | )
1019 |
1020 | df = spark.createDataFrame([], target_schema)
1021 |
1022 | target_table = (
1023 | DeltaTable.create(spark).location(target_path).addColumns(df.schema).execute()
1024 | )
1025 |
1026 | quarantine_schema = StructType(
1027 | [
1028 | StructField("col1", IntegerType(), True),
1029 | StructField("col2", StringType(), True),
1030 | StructField("col3", StringType(), True),
1031 | ]
1032 | )
1033 |
1034 | qdf = spark.createDataFrame([], quarantine_schema)
1035 |
1036 | quarantine_table = (
1037 | DeltaTable.create(spark)
1038 | .location(quarantine_path)
1039 | .addColumns(qdf.schema)
1040 | .execute()
1041 | )
1042 |
1043 | data = [(None, "A", "B"), (2, "C", None), (3, "E", "F"), (4, "G", "H")]
1044 | append_df = spark.createDataFrame(data, quarantine_schema)
1045 |
1046 | mack.constraint_append(target_table, append_df, quarantine_table)
1047 |
1048 | # target data equality check
1049 | expected_data = [(3, "E", "F"), (4, "G", "H")]
1050 | expected_df = spark.createDataFrame(expected_data, target_schema)
1051 |
1052 | appended_data = spark.read.format("delta").load(target_path)
1053 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True)
1054 |
1055 | # quarantined data equality check
1056 | expected_quarantined_data = [
1057 | (None, "A", "B"),
1058 | (2, "C", None),
1059 | ]
1060 | expected_quarantined_df = spark.createDataFrame(
1061 | expected_quarantined_data, quarantine_schema
1062 | )
1063 |
1064 | quarantined_data = spark.read.format("delta").load(quarantine_path)
1065 | chispa.assert_df_equality(
1066 | quarantined_data, expected_quarantined_df, ignore_row_order=True
1067 | )
1068 |
1069 |
1070 | def test_constraint_append_notnull_and_check_constraint(tmp_path):
1071 |
1072 | target_path = f"{tmp_path}/constraint_append/target_table"
1073 | quarantine_path = f"{tmp_path}/constraint_append/quarantine_table"
1074 |
1075 | target_schema = StructType(
1076 | [
1077 | StructField("col1", IntegerType(), False),
1078 | StructField("col2", StringType(), True),
1079 | StructField("col3", StringType(), False),
1080 | ]
1081 | )
1082 |
1083 | df = spark.createDataFrame([], target_schema)
1084 |
1085 | target_table = (
1086 | DeltaTable.create(spark)
1087 | .location(target_path)
1088 | .addColumns(df.schema)
1089 | .property("delta.constraints.col1_constraint", "col1 > 0")
1090 | .execute()
1091 | )
1092 |
1093 | quarantine_schema = StructType(
1094 | [
1095 | StructField("col1", IntegerType(), True),
1096 | StructField("col2", StringType(), True),
1097 | StructField("col3", StringType(), True),
1098 | ]
1099 | )
1100 |
1101 | qdf = spark.createDataFrame([], quarantine_schema)
1102 |
1103 | quarantine_table = (
1104 | DeltaTable.create(spark)
1105 | .location(quarantine_path)
1106 | .addColumns(qdf.schema)
1107 | .execute()
1108 | )
1109 |
1110 | data = [
1111 | (0, "A", "B"),
1112 | (0, "A", None),
1113 | (None, "A", "B"),
1114 | (2, "C", None),
1115 | (3, "E", "F"),
1116 | (4, "G", "H"),
1117 | ]
1118 | append_df = spark.createDataFrame(data, quarantine_schema)
1119 |
1120 | mack.constraint_append(target_table, append_df, quarantine_table)
1121 |
1122 | # target data equality check
1123 | expected_data = [(3, "E", "F"), (4, "G", "H")]
1124 | expected_df = spark.createDataFrame(expected_data, target_schema)
1125 |
1126 | appended_data = spark.read.format("delta").load(target_path)
1127 | chispa.assert_df_equality(appended_data, expected_df, ignore_row_order=True)
1128 |
1129 | # quarantined data equality check
1130 | expected_quarantined_data = [
1131 | (0, "A", "B"),
1132 | (0, "A", None),
1133 | (None, "A", "B"),
1134 | (2, "C", None),
1135 | ]
1136 | expected_quarantined_df = spark.createDataFrame(
1137 | expected_quarantined_data, quarantine_schema
1138 | )
1139 |
1140 | quarantined_data = spark.read.format("delta").load(quarantine_path)
1141 | chispa.assert_df_equality(
1142 | quarantined_data, expected_quarantined_df, ignore_row_order=True
1143 | )
1144 |
1145 |
1146 | def test_rename_delta_table(tmp_path):
1147 | # Create a temporary directory to hold the Delta table
1148 | # Create a sample DataFrame
1149 | data = [("Alice", 1), ("Bob", 2)]
1150 | df = spark.createDataFrame(data, ["Name", "Age"])
1151 |
1152 | # Write the DataFrame to a Delta table
1153 | old_table_path = f"{tmp_path}/old_table"
1154 | df.write.format("delta").save(old_table_path)
1155 |
1156 | # Load the Delta table
1157 | old_table = DeltaTable.forPath(spark, old_table_path)
1158 |
1159 | # Call the function to rename the Delta table
1160 | new_table_name = "new_table"
1161 | mack.rename_delta_table(
1162 | old_table, new_table_name, databricks=False, spark_session=spark
1163 | )
1164 |
1165 | # Verify the table has been renamed
1166 | assert spark.catalog.tableExists(new_table_name)
1167 |
1168 | # Clean up: Drop the new table
1169 | spark.sql(f"DROP TABLE IF EXISTS {new_table_name}")
1170 |
1171 |
--------------------------------------------------------------------------------
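
As a quick orientation to the public interface exercised by the tests above, here is a minimal, hypothetical usage sketch of mack.type_2_scd_upsert, the helper covered by test_upserts_with_single_attribute. The Spark session configuration mirrors the one at the top of tests/test_public_interface.py; the table path, key column, and attribute names are illustrative placeholders, not anything prescribed by mack.

import pyspark
from datetime import datetime as dt
from delta import DeltaTable, configure_spark_with_delta_pip
import mack

# Delta-enabled Spark session, configured the same way as in the test module.
builder = (
    pyspark.sql.SparkSession.builder.appName("mack-usage-sketch")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Hypothetical existing SCD2 base table with pkey, attr, is_current,
# effective_time and end_time columns (the schema used in the tests).
base_table = DeltaTable.forPath(spark, "/tmp/scd2_base_table")

# Updates only carry the key, the tracked attribute(s) and effective_time.
updates_df = spark.createDataFrame(
    [(2, "Z", dt(2020, 1, 1)), (3, "C", dt(2020, 9, 15))],
    ["pkey", "attr", "effective_time"],
)

# Close out the current row for pkey 2 and insert the new rows as current.
mack.type_2_scd_upsert(base_table, updates_df, "pkey", ["attr"])

The generic variant, mack.type_2_scd_generic_upsert, takes the same arguments plus explicit names for the current-flag, effective, and end columns, as the date- and version-based tests above show.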