├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── docs_improvement.yml │ └── feature_request.yml ├── pull_request_template.md └── workflows │ ├── assign-on-comment.yml │ ├── ci.yml │ ├── lint.yaml │ ├── mkdocs.yml │ └── pr.yml ├── .gitignore ├── .licenserc.yaml ├── .pre-commit-config.yaml ├── .python-version ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── __init__.py ├── benchmark_column_performance.py ├── create_benchmark_df.py ├── results │ ├── collectlist_large.json │ ├── collectlist_medium.json │ ├── collectlist_small.json │ ├── collectlist_xsmall.json │ ├── flatmap_large.json │ ├── flatmap_medium.json │ ├── flatmap_small.json │ ├── flatmap_xsmall.json │ ├── localIterator_large.json │ ├── localIterator_medium.json │ ├── localIterator_small.json │ ├── localIterator_xsmall.json │ ├── map_large.json │ ├── map_medium.json │ ├── map_small.json │ ├── map_xsmall.json │ ├── toPandas_large.json │ ├── toPandas_medium.json │ ├── toPandas_small.json │ └── toPandas_xsmall.json └── visualize_benchmarks.py ├── docs ├── examples │ └── index.md ├── gen_ref_pages.py ├── images │ ├── column_to_list_boxplot.svg │ ├── column_to_list_line_plot.svg │ └── quinn.png ├── index.md ├── learn_more │ ├── column_to_list.md │ └── index.md ├── notebooks │ └── schema_as_code.ipynb └── usage.md ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── quinn.iml ├── quinn.png ├── quinn ├── __init__.py ├── append_if_schema_identical.py ├── dataframe_helpers.py ├── dataframe_validator.py ├── extensions │ ├── __init__.py │ ├── dataframe_ext.py │ └── spark_session_ext.py ├── functions.py ├── keyword_finder.py ├── math.py ├── schema_helpers.py ├── split_columns.py └── transformations.py └── tests ├── __init__.py ├── extensions ├── __init__.py ├── dataframe_transformations.py ├── test_dataframe_ext.py └── test_spark_session_ext.py ├── spark.py ├── test_append_if_schema_identical.py ├── test_dataframe_helpers.py ├── test_dataframe_validator.py ├── test_files ├── bad_schema.csv ├── good_schema1.csv ├── good_schema2.csv └── some_pyspark.py ├── test_functions.py ├── test_keyword_finder.py ├── test_math.py ├── test_schema_helpers.py ├── test_split_columns.py └── test_transformations.py /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report incorrect behavior in the quinn library 3 | title: "BUG: " 4 | labels: [Bug] 5 | 6 | body: 7 | - type: checkboxes 8 | id: checks 9 | attributes: 10 | label: Quinn version checks 11 | options: 12 | - label: > 13 | I have checked that this issue has not already been reported. 14 | required: true 15 | - label: > 16 | I have confirmed this bug exists on the 17 | [latest version](https://pypi.org/project/quinn/) of quinn. 18 | required: true 19 | - label: > 20 | I have confirmed this bug exists on the 21 | [main branch](https://github.com/MrPowers/quinn) of quinn. 22 | - type: textarea 23 | id: example 24 | attributes: 25 | label: Reproducible Example 26 | description: > 27 | Please follow [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) on how to 28 | provide a minimal, copy-pastable example. 29 | placeholder: > 30 | import quinn 31 | 32 | 33 | quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"]) 34 | 35 | ... 
36 | render: python 37 | validations: 38 | required: true 39 | - type: textarea 40 | id: problem 41 | attributes: 42 | label: Issue Description 43 | description: > 44 | Please provide a description of the issue shown in the reproducible example. 45 | validations: 46 | required: true 47 | - type: textarea 48 | id: expected-behavior 49 | attributes: 50 | label: Expected Behavior 51 | description: > 52 | Please describe or show a code example of the expected behavior. 53 | validations: 54 | required: true 55 | - type: textarea 56 | id: version 57 | attributes: 58 | label: Installed Versions 59 | description: > 60 | Please paste the output of ``quinn.__version__`` 61 | value: > 62 |
63 | 64 | Replace this line with the output of quinn.__version__ 65 | 66 |
67 | validations: 68 | required: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/docs_improvement.yml: -------------------------------------------------------------------------------- 1 | name: Documentation Improvement 2 | description: Report wrong or missing documentation 3 | title: "DOC: " 4 | labels: [Docs] 5 | 6 | body: 7 | - type: checkboxes 8 | attributes: 9 | label: Quinn version checks 10 | options: 11 | - label: > 12 | I have checked that the issue still exists on the latest versions of the docs 13 | on `main` [here](https://mrpowers.github.io/quinn/) 14 | required: true 15 | - type: textarea 16 | id: location 17 | attributes: 18 | label: Location of the documentation 19 | description: > 20 | Please provide the location of the documentation, e.g. "quinn.validate_schema()" 21 | validations: 22 | required: true 23 | - type: textarea 24 | id: problem 25 | attributes: 26 | label: Documentation problem 27 | description: > 28 | Please provide a description of what documentation you believe needs to be fixed/improved 29 | validations: 30 | required: true 31 | - type: textarea 32 | id: suggested-fix 33 | attributes: 34 | label: Suggested fix for documentation 35 | description: > 36 | Please explain the suggested fix and **why** it's better than the existing documentation 37 | validations: 38 | required: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Suggest an idea for quinn 3 | title: "ENH: " 4 | labels: [Enhancement] 5 | body: 6 | - type: checkboxes 7 | id: checks 8 | attributes: 9 | label: Feature Type 10 | description: Please check what type of feature request you would like to propose. 11 | options: 12 | - label: > 13 | Adding new functionality to quinn 14 | - label: > 15 | Changing existing functionality in quinn 16 | - label: > 17 | Removing existing functionality in quinn 18 | - type: textarea 19 | id: description 20 | attributes: 21 | label: Problem Description 22 | description: > 23 | Please describe what problem the feature would solve, e.g. "I wish I could use quinn to ..." 24 | validations: 25 | required: true 26 | - type: textarea 27 | id: feature 28 | attributes: 29 | label: Feature Description 30 | description: > 31 | Please describe how the new feature would be implemented, using pseudocode if relevant. 32 | validations: 33 | required: true 34 | - type: textarea 35 | id: context 36 | attributes: 37 | label: Additional Context 38 | description: > 39 | Please provide any relevant GitHub issues, code examples or references that help describe and support 40 | the feature request. -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Proposed changes 2 | 3 | Describe the big picture of your changes here to communicate to the maintainers. If it fixes a bug or resolves a feature request, please provide a link to that issue. 4 | 5 | ## Types of changes 6 | 7 | What types of changes does your code introduce to quinn? 
8 | _Put an `x` in the boxes that apply_ 9 | 10 | - [ ] Bugfix (non-breaking change which fixes an issue) 11 | - [ ] New feature (non-breaking change which adds functionality) 12 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 13 | - [ ] Documentation Update (if none of the other choices apply) 14 | 15 | ## Further comments 16 | 17 | If this is a relatively large or complex change, kick off the discussion by explaining why you chose the solution you did and what alternatives you considered, etc. -------------------------------------------------------------------------------- /.github/workflows/assign-on-comment.yml: -------------------------------------------------------------------------------- 1 | # This workflow was inspired by the issue_comments.yml workflow from the delta-io/delta-rs repository. 2 | # Source: https://github.com/delta-io/delta-rs/blob/main/.github/workflows/issue_comments.yml 3 | name: Auto-assign issue on comment 4 | 5 | on: 6 | issue_comment: 7 | types: [created] 8 | 9 | permissions: 10 | issues: write 11 | 12 | jobs: 13 | auto-assign-issue: 14 | runs-on: ubuntu-latest 15 | if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' 16 | concurrency: 17 | # Only run one at a time per user 18 | group: ${{ github.actor }}-auto-assign-issue 19 | steps: 20 | - name: Check if issue can be assigned 21 | id: check-assignee 22 | run: | 23 | RESPONSE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}' -s) 24 | echo "HTTP_CODE=$RESPONSE" >> $GITHUB_ENV 25 | 26 | - name: Assign issue to commenter 27 | if: env.HTTP_CODE == '204' 28 | run: | 29 | echo "Assigning issue #${{ github.event.issue.number }} to @${{ github.event.comment.user.login }}" 30 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees 31 | env: 32 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 33 | 34 | - name: Log failure to assign 35 | if: env.HTTP_CODE != '204' 36 | run: | 37 | echo "Issue #${{ github.event.issue.number }} cannot be assigned to @${{ github.event.comment.user.login }}. 
HTTP response code: ${{ env.HTTP_CODE }}" -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Unit tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | include: 20 | - pyspark-version: 2.4.8 # latest published 2.x version 21 | pip-packages: "pypandoc==1.7 pyspark==2.4.8" # downgrade of pypandoc necessary 22 | - pyspark-version: 3.0.3 23 | pip-packages: "pyspark==3.0.3" 24 | - pyspark-version: 3.1.3 25 | pip-packages: "pyspark==3.1.3" 26 | - pyspark-version: 3.2.4 27 | pip-packages: "pyspark==3.2.4" 28 | - pyspark-version: 3.3.2 29 | pip-packages: "pyspark==3.3.2" 30 | - pyspark-version: 3.4.0 31 | pip-packages: "pyspark==3.4.0" 32 | 33 | steps: 34 | - uses: actions/checkout@v1 35 | with: 36 | fetch-depth: 1 37 | 38 | - name: Setup Java 39 | uses: actions/setup-java@v3 40 | with: 41 | distribution: 'zulu' 42 | java-version: '8' # Supported by Spark 2.x & 3.x 43 | 44 | - name: Get supported Python Version depending on PySpark 45 | uses: haya14busa/action-cond@v1 46 | id: python_version 47 | with: 48 | cond: ${{ startsWith(matrix.pyspark-version, '2.') }} 49 | if_true: '3.7' # latest supported version for PySpark 2.x 50 | if_false: '3.9' # PySpark 3+ 51 | 52 | - name: Set up Python ${{ steps.python_version.outputs.value }} 53 | uses: actions/setup-python@v2 54 | with: 55 | python-version: ${{ steps.python_version.outputs.value }} 56 | 57 | - name: Get supported Poetry version 58 | uses: haya14busa/action-cond@v1 59 | id: poetry_version 60 | with: 61 | cond: ${{ startsWith(matrix.pyspark-version, '2.') }} 62 | if_true: '1.5.1' # latest supported version for PySpark 2.x 63 | if_false: '1.6.1' # PySpark 3+ 64 | 65 | - name: Install Poetry 66 | uses: snok/install-poetry@v1 67 | with: 68 | version: ${{ steps.poetry_version.outputs.value }} 69 | 70 | - name: Cache Poetry virtualenv 71 | uses: actions/cache@v1 72 | id: cache 73 | with: 74 | path: ~/.virtualenvs 75 | key: poetry-${{ hashFiles('**/poetry.lock') }} 76 | restore-keys: | 77 | poetry-${{ hashFiles('**/poetry.lock') }} 78 | 79 | - name: Install dependencies 80 | run: make install_test 81 | if: steps.cache.outputs.cache-hit != 'true' 82 | 83 | - name: Change PySpark to version ${{ matrix.pyspark-version }} 84 | env: 85 | PIP_PACKAGES: ${{ matrix.pip-packages }} 86 | run: poetry run pip install $PIP_PACKAGES # Using pip shouldn't mess up poetry cache 87 | 88 | - name: Run tests with pytest against PySpark ${{ matrix.pyspark-version }} 89 | run: make test 90 | 91 | check-license-headers: 92 | runs-on: ubuntu-latest 93 | steps: 94 | - uses: actions/checkout@v2 95 | with: 96 | fetch-depth: 0 97 | 98 | - name: Check License Header 99 | uses: apache/skywalking-eyes/header@main 100 | with: 101 | log: debug 102 | config: .licenserc.yaml 103 | mode: check 104 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v4 19 | 
with: 20 | python-version: 3.9 21 | - name: Run Ruff 22 | uses: chartboost/ruff-action@v1 23 | with: 24 | version: 0.0.291 25 | -------------------------------------------------------------------------------- /.github/workflows/mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: MKDocs deploy 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.9 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.9 19 | - name: Set up Poetry 20 | uses: abatilo/actions-poetry@v2 21 | with: 22 | poetry-version: 1.4.0 23 | - name: Cache Poetry virtualenv 24 | uses: actions/cache@v1 25 | id: cache 26 | with: 27 | path: ~/.virtualenvs 28 | key: poetry-${{ hashFiles('**/poetry.lock') }} 29 | restore-keys: | 30 | poetry-${{ hashFiles('**/poetry.lock') }} 31 | - name: Install dependencies 32 | run: 33 | make install_deps 34 | if: steps.cache.outputs.cache-hit != 'true' 35 | - name: Setup GH 36 | run: | 37 | sudo apt update && sudo apt install -y git 38 | git config user.name 'github-actions[bot]' 39 | git config user.email 'github-actions[bot]@users.noreply.github.com' 40 | - name: Build and Deploy 41 | run: 42 | poetry run mkdocs gh-deploy --force 43 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: Testing against single PySpark version 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | 11 | detect_code_changes: 12 | runs-on: ubuntu-latest 13 | outputs: 14 | code_changes: ${{ steps.changes.outputs.code_changes }} 15 | steps: 16 | - uses: dorny/paths-filter@v3 17 | id: changes 18 | with: 19 | filters: | 20 | code_changes: 21 | - 'quinn/**' 22 | - 'tests/**' 23 | - 'benchmarks/**' 24 | - '.github/**' 25 | - 'poetry.lock' 26 | - 'pyproject.toml' 27 | 28 | test: 29 | runs-on: ubuntu-latest 30 | needs: [detect_code_changes] 31 | steps: 32 | - uses: actions/checkout@v3 33 | if: needs.detect_code_changes.outputs.code_changes == 'true' 34 | with: 35 | fetch-depth: 1 36 | 37 | - name: Setup Java 38 | uses: actions/setup-java@v3 39 | if: needs.detect_code_changes.outputs.code_changes == 'true' 40 | with: 41 | distribution: 'zulu' 42 | java-version: '8' # Supported by Spark 2.x & 3.x 43 | 44 | - name: Set up Python 3.9 45 | uses: actions/setup-python@v4 46 | if: needs.detect_code_changes.outputs.code_changes == 'true' 47 | with: 48 | python-version: 3.9 49 | 50 | - name: Install Poetry 51 | uses: snok/install-poetry@v1 52 | if: needs.detect_code_changes.outputs.code_changes == 'true' 53 | with: 54 | version: 1.6.1 55 | 56 | - name: Cache Poetry virtualenv 57 | uses: actions/cache@v1 58 | if: needs.detect_code_changes.outputs.code_changes == 'true' 59 | id: cache 60 | with: 61 | path: ~/.virtualenvs 62 | key: poetry-${{ hashFiles('**/poetry.lock') }} 63 | restore-keys: | 64 | poetry-${{ hashFiles('**/poetry.lock') }} 65 | 66 | - name: Install dependencies 67 | if: | 68 | needs.detect_code_changes.outputs.code_changes == 'true' && 69 | steps.cache.outputs.cache-hit != 'true' 70 | run: make install_test 71 | # if: steps.cache.outputs.cache-hit != 'true' 72 | 73 | - name: Run tests with pytest 74 | if: needs.detect_code_changes.outputs.code_changes == 'true' 75 | run: make test 76 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | quinn.egg-info/ 4 | .cache/ 5 | tmp/ 6 | .idea/ 7 | .DS_Store 8 | 9 | .coverage* 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | 15 | .pytest_cache/ 16 | 17 | # PyVenv 18 | .env 19 | .venv 20 | venv 21 | 22 | # Linters cache 23 | .mypy_cache 24 | .ruff_cache 25 | 26 | # MKDocs 27 | site 28 | 29 | # VSCode 30 | .vscode 31 | 32 | # Emacs 33 | .dir_locals.el 34 | 35 | # Jupyter notebooks 36 | .ipynb_checkpoints 37 | 38 | # Benchmarking 39 | *.crc 40 | *.parquet 41 | _SUCCESS -------------------------------------------------------------------------------- /.licenserc.yaml: -------------------------------------------------------------------------------- 1 | license: 2 | type: Apache-2.0 3 | copyright-owner: Apache Software Foundation 4 | header: 5 | content: | 6 | Licensed to the Apache Software Foundation (ASF) under one or more 7 | contributor license agreements. See the NOTICE file distributed with 8 | this work for additional information regarding copyright ownership. 9 | The ASF licenses this file to You under the Apache License, Version 2.0 10 | (the "License"); you may not use this file except in compliance with 11 | the License. You may obtain a copy of the License at 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | paths-ignore: 19 | - 'tests/**' 20 | - 'poetry.lock' 21 | paths: 22 | - "**/*.py" 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/charliermarsh/ruff-pre-commit 3 | # Ruff version. 4 | rev: 'v0.0.291' 5 | hooks: 6 | - id: ruff 7 | - repo: local 8 | hooks: 9 | - id: pytest 10 | name: pytest-check 11 | entry: poetry run pytest 12 | language: system 13 | pass_filenames: false 14 | # Runs only on python files 15 | types: [ python ] 16 | always_run: true 17 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.7.5 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to the Quinn contributing guide 2 | 3 | ## Issues 4 | 5 | ### Create a new issue 6 | 7 | If you spot a problem with the docs, search to see if an issue already exists. If a related issue doesn't exist, you can open a [new issue](https://github.com/MrPowers/quinn/issues/new). 8 | 9 | ### Solve an issue 10 | 11 | Scan through our [existing issues](https://github.com/MrPowers/quinn/issues) to find one that interests you. If you find an issue to work on, make sure that no one else is already working on it, so you can get assigned. After that, you are welcome to open a PR with a fix. 
12 | 13 | ### Good first issue 14 | 15 | You can find a list of [good first issues](https://github.com/MrPowers/quinn/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) which can help you better understand the code base of the project. 16 | 17 | ### Auto-assigning issues 18 | 19 | We have a workflow that automatically assigns issues to users who comment 'take' on an issue. This is configured in the `.github/workflows/assign-on-comment.yml` file. When a user comments `take` on the issue, a GitHub Action will be run to assign the issue to the user if it's not already assigned. 20 | 21 | ## Contributing 22 | 23 | ### Fork the repository 24 | 25 | To start contributing, you should fork this repository and only after that clone your fork. If you accidentally cloned this repository instead of your fork, you can fix the remote at any time with this command: 26 | 27 | ```shell 28 | # for HTTPS access 29 | git remote set-url origin https://github.com/your-github-name/quinn.git 30 | # for SSH (private key) access 31 | git remote set-url origin git@github.com:your-github-name/quinn.git 32 | ``` 33 | 34 | ### Install the project 35 | 36 | #### Installing poetry 37 | 38 | After cloning the project you should install all the dependencies. We are using `poetry` as a build tool. You can install `poetry` by following [these instructions](https://python-poetry.org/docs/#installation). 39 | 40 | #### Installing dependencies 41 | 42 | You can create a virtualenv with `poetry`. The recommended version of Python is `3.9`: 43 | ```shell 44 | poetry env use python3.9 45 | ``` 46 | 47 | After that, install all the dependencies, including the development ones: 48 | ```shell 49 | make install_deps 50 | ``` 51 | 52 | #### Setup Java 53 | 54 | To run the Spark tests you need a properly configured Java installation. Apache Spark mainly supports only Java 8 (1.8). You can find instructions on how to set up Java [here](https://www.java.com/en/download/help/download_options.html). When you are running the Spark tests, the `JAVA_HOME` environment variable should point to your Java 8 installation. 55 | 56 | ### Pre-commit installation and execution 57 | 58 | We use pre-commit hooks to ensure code quality. The configuration for pre-commit hooks is in the `.pre-commit-config.yaml` file. To install pre-commit, run: 59 | ```shell 60 | poetry shell 61 | poetry run pre-commit install 62 | ``` 63 | To run pre-commit hooks manually, use: 64 | ```shell 65 | pre-commit run --all-files 66 | ``` 67 | 68 | ### Running Tests 69 | 70 | This project uses `pytest` and `chispa` for running Spark tests. Please run all the tests before creating a pull request. If you are working on new functionality, you should also add new tests. 71 | You can run the tests as follows: 72 | ```shell 73 | make test 74 | ``` 75 | 76 | ### GitHub Actions local setup using 'act' 77 | 78 | You can run GitHub Actions locally using the `act` tool. The configuration for GitHub Actions is in the `.github/workflows/ci.yml` file. To install `act`, follow the instructions [here](https://github.com/nektos/act#installation). 
To run a specific job, use: 79 | ```shell 80 | act -j <job-name> 81 | ``` 82 | For example, to run the `test` job, use: 83 | ```shell 84 | act -j test 85 | ``` 86 | If you need help with `act`, use: 87 | ```shell 88 | act --help 89 | ``` 90 | For MacBooks with M1 processors, you might have to add the `--container-architecture` flag: 91 | ```shell 92 | act -j <job-name> --container-architecture linux/arm64 93 | ``` 94 | 95 | ### Code style 96 | 97 | This project follows the [PySpark style guide](https://github.com/MrPowers/spark-style-guide/blob/main/PYSPARK_STYLE_GUIDE.md). All public functions and methods should be documented in `README.md` and should also have docstrings in Sphinx format (a complete example is given at the end of this guide): 98 | 99 | ```python 100 | """[Summary] 101 | 102 | :param [ParamName]: [ParamDescription], defaults to [DefaultParamVal] 103 | :type [ParamName]: [ParamType](, optional) 104 | ... 105 | :raises [ErrorType]: [ErrorDescription] 106 | ... 107 | :return: [ReturnDescription] 108 | :rtype: [ReturnType] 109 | """ 110 | ``` 111 | 112 | We are using `isort` and `ruff` as linters. You can find instructions on how to set up and use these tools here: 113 | 114 | 1. [isort](https://pycqa.github.io/isort/) 115 | 2. [ruff](https://github.com/charliermarsh/ruff) 116 | 117 | ### Adding ruff to IDEs 118 | 119 | #### VSCode 120 | 121 | 1. Install the `Ruff` extension by Astral Software from the VSCode marketplace (Extension ID: *charliermarsh.ruff*). 122 | 2. Open the command palette (Ctrl+Shift+P) and select `Preferences: Open Settings (JSON)`. 123 | 3. Add the following configuration to your settings.json file: 124 | 125 | ```json 126 | { 127 | "python.linting.ruffEnabled": true, 128 | "python.linting.enabled": true, 129 | "python.formatting.provider": "none", 130 | "editor.formatOnSave": true 131 | } 132 | ``` 133 | The above settings will enable linting with Ruff and format your code with Ruff on save. 134 | 135 | #### PyCharm 136 | 137 | To set up `Ruff` in PyCharm using `poetry`, follow these steps: 138 | 139 | 1. **Find the path to your `poetry` executable:** 140 | - Open a terminal. 141 | - For macOS/Linux, use the command `which poetry`. 142 | - For Windows, use the command `where poetry`. 143 | - Note down the path returned by the command. 144 | 145 | 2. **Open the `Preferences` window** (Cmd+, on macOS). 146 | 3. **Navigate to `Tools` > `External Tools`.** 147 | 4. **Click the `+` icon** to add a new external tool. 148 | 5. **Fill in the following details:** 149 | - **Name:** `Ruff` 150 | - **Program:** Enter the path to your `poetry` executable that you noted earlier. 151 | - **Arguments:** `run ruff check --fix $FilePathRelativeToProjectRoot$` 152 | - **Working directory:** `$ProjectFileDir$` 153 | 6. **Click `OK`** to save the configuration. 154 | 7. **To run Ruff,** right-click on a file or directory in the project view, select `External Tools`, and then select `Ruff`. 155 | 156 | ### Pull Request 157 | 158 | When you're finished with the changes, create a pull request, also known as a PR. 159 | - Don't forget to link the PR to the issue if you are solving one. 160 | - As you update your PR and apply changes, mark each conversation as resolved. 161 | - If you run into any merge issues, check out this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues. 
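
### Docstring example

For reference, here is a minimal sketch of a docstring written in the Sphinx format described in the Code style section above. The function name, parameters, and body below are hypothetical and only serve to illustrate the docstring layout; they are not part of quinn's actual API:

```python
from pyspark.sql import DataFrame


def collect_column(df: DataFrame, col_name: str) -> list:
    """Collect the values of a single DataFrame column into a Python list.

    :param df: the DataFrame to read the column from
    :type df: DataFrame
    :param col_name: the name of the column to collect
    :type col_name: str
    :raises ValueError: if ``col_name`` is not a column of ``df``
    :return: the values of ``col_name`` as a list
    :rtype: list
    """
    # Fail fast with a clear error instead of an opaque Spark analysis exception.
    if col_name not in df.columns:
        raise ValueError(f"Column {col_name!r} not found in DataFrame")
    # Unwrap the Row objects so the caller gets plain Python values.
    return [row[0] for row in df.select(col_name).collect()]
```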
162 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for describing the origin of the Work and 141 | reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # COMMON CLI COMMANDS FOR DEVELOPMENT 2 | 3 | .PHONY: install_test 4 | install_test: 5 | @poetry install --with=development,testing 6 | 7 | .PHONY: install_deps 8 | install_deps: 9 | @poetry install --with=development,linting,testing,docs 10 | 11 | .PHONY: update_deps 12 | update_deps: 13 | @poetry update --with=development,linting,testing,docs 14 | 15 | .PHONY: test 16 | test: 17 | @poetry run pytest tests 18 | 19 | .PHONY: lint 20 | lint: 21 | @poetry run ruff check --fix quinn 22 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | """Runs benchmarks on quinn functions.""" 15 | -------------------------------------------------------------------------------- /benchmarks/benchmark_column_performance.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | import json 17 | import timeit 18 | from pathlib import Path 19 | 20 | 21 | def auto_timeit( 22 | stmt: str = "pass", 23 | setup: str = "pass", 24 | min_runtime_seconds: int = 2, 25 | ) -> list[float]: 26 | """Automatically determine the number of runs needed to reach a minimum total runtime.""" 27 | min_runs = 5 28 | print(f"Running {stmt} 1 time...") 29 | t = timeit.repeat(stmt, setup, repeat=1, number=1) 30 | 31 | print(f"First run: {t[0]:.2f} seconds") 32 | if t[0] >= min_runtime_seconds: 33 | return t 34 | 35 | expected_runs_needed = int((min_runtime_seconds // t[0]) + 1) 36 | if expected_runs_needed < min_runs: 37 | expected_runs_needed = min_runs 38 | 39 | expected_runtime = t[0] * expected_runs_needed 40 | print(f"Running {stmt} {expected_runs_needed} times.") 41 | print(f"Expected runtime: {expected_runtime:.2f} seconds...") 42 | return timeit.repeat(stmt, setup, repeat=expected_runs_needed, number=1) 43 | 44 | 45 | def get_result( 46 | test_name: str, 47 | dataset: dict, 48 | expr: str, 49 | min_runtime_seconds: int, 50 | ) -> None: 51 | """Run a test and save the results to a file.""" 52 | setup = f"""import timeit 53 | import pyspark.sql.functions as F 54 | from pyspark.sql import DataFrame, SparkSession 55 | builder = ( 56 | SparkSession.builder.appName("MyApp") 57 | .config("spark.executor.memory", "10G") 58 | .config("spark.driver.memory", "25G") 59 | .config("spark.sql.shuffle.partitions", "2") 60 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") 61 | ) 62 | spark = builder.getOrCreate() 63 | {dataset['name']} = spark.read.parquet('benchmarks/data/mvv_{dataset['name']}') 64 | """ 65 | stmt = expr.replace("df", dataset["name"]) 66 | result = auto_timeit(stmt, setup, min_runtime_seconds) 67 | 68 | summary = { 69 | "test_name": test_name, 70 | "dataset": dataset["name"], 71 | "dataset_size": dataset["size"], 72 | "runtimes": result, 73 | } 74 | 75 | result_path = f"results/{test_name}_{dataset['name']}.json" 76 | with Path(__file__).parent.joinpath(result_path).open(mode="w") as f: 77 | json.dump(summary, f, indent=4) 78 | 79 | 80 | config = { 81 | "toPandas": {"expr": "list(df.select('mvv').toPandas()['mvv'])"}, 82 | "flatmap": {"expr": "df.select('mvv').rdd.flatMap(lambda x: x).collect()"}, 83 | "map": {"expr": "df.select('mvv').rdd.map(lambda row : row[0]).collect()"}, 84 | "collectlist": {"expr": "[row[0] for row in df.select('mvv').collect()]"}, 85 | "localIterator": {"expr": "[r[0] for r in df.select('mvv').toLocalIterator()]"}, 86 | } 87 | 88 | 89 | DATASETS = { 90 | "large": {"name": "large", "size": 100_000_000, "min_runtime_seconds": 1200}, 91 | "medium": {"name": "medium", "size": 10_000_000, "min_runtime_seconds": 360}, 92 | "small": {"name": "small", "size": 100_000, "min_runtime_seconds": 20}, 93 | "xsmall": {"name": "xsmall", "size": 1_000, "min_runtime_seconds": 20}, 94 | } 95 | 96 | for test_name, test_config in config.items(): 97 | print(f"======================{test_name}======================") 98 | for dataset_name in DATASETS: 99 | dataset = DATASETS[dataset_name] 100 | 
print(f"TESTING DATASET {dataset['name']} [n={dataset['size']:,}]") 101 | get_result( 102 | test_name=test_name, 103 | dataset=dataset, 104 | expr=test_config["expr"], 105 | min_runtime_seconds=dataset["min_runtime_seconds"], 106 | ) 107 | -------------------------------------------------------------------------------- /benchmarks/create_benchmark_df.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | import random 17 | from typing import TYPE_CHECKING, Optional 18 | 19 | from pyspark.sql import SparkSession 20 | from pyspark.sql import functions as F # noqa: N812 21 | 22 | if TYPE_CHECKING: 23 | from pyspark.sql.dataframe import DataFrame 24 | 25 | 26 | def generate_df(spark: SparkSession, n: int) -> DataFrame: 27 | """Generate a dataframe with a monotonically increasing id column and a random count column.""" 28 | count_vals = [(random.randint(1, 10),) for _ in range(n)] # noqa: S311 29 | output: DataFrame = ( 30 | spark.createDataFrame(count_vals, schema=["count"]) 31 | .withColumn("mvv", F.monotonically_increasing_id()) 32 | .select("mvv", "count") 33 | ) 34 | return output 35 | 36 | 37 | def save_benchmark_df( 38 | spark: SparkSession, 39 | n: int, 40 | data_label: str, 41 | repartition_n: Optional[int] = None, 42 | ) -> None: 43 | """Save a benchmark dataframe to disk.""" 44 | print(f"Generating benchmark df for n={n}") 45 | benchmark_df = generate_df(spark, n) 46 | 47 | if repartition_n is not None: 48 | benchmark_df = benchmark_df.repartition(repartition_n) 49 | 50 | benchmark_df.write.mode("overwrite").parquet(f"benchmarks/data/mvv_{data_label}") 51 | 52 | 53 | if __name__ == "__main__": 54 | xsmall_n = 1_000 55 | small_n = 100_000 56 | medium_n = 10_000_000 57 | large_n = 100_000_000 58 | 59 | builder = ( 60 | SparkSession.builder.appName("MyApp") 61 | .config("spark.executor.memory", "20G") 62 | .config("spark.driver.memory", "25G") 63 | .config("spark.sql.shuffle.partitions", "2") 64 | ) 65 | 66 | spark = builder.getOrCreate() 67 | save_benchmark_df(spark, xsmall_n, "xsmall", 1) 68 | save_benchmark_df(spark, small_n, "small", 1) 69 | save_benchmark_df(spark, medium_n, "medium", 1) 70 | save_benchmark_df(spark, large_n, "large", 4) 71 | -------------------------------------------------------------------------------- /benchmarks/results/collectlist_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "collectlist", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 129.20805395802017, 7 | 126.53530854202108, 8 | 129.99196012501488, 9 | 130.67483216698747, 10 | 126.88453424998443, 11 | 
139.92618966597365, 12 | 141.54181875000359, 13 | 136.65802104197792, 14 | 129.75925845801248 15 | ] 16 | } -------------------------------------------------------------------------------- /benchmarks/results/collectlist_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "collectlist", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 11.525758125004359, 7 | 11.570582416985417, 8 | 11.951778874994488, 9 | 12.054943958006334, 10 | 11.80891958301072, 11 | 11.82376299999305, 12 | 11.762349167023785, 13 | 11.46418624999933, 14 | 11.415677415992832, 15 | 11.75218004200724, 16 | 11.825585749989841, 17 | 11.855922749993624, 18 | 11.871351749985479, 19 | 11.430663749983069, 20 | 11.910512792004738, 21 | 12.044869125005789, 22 | 12.068957833980676, 23 | 11.957036042003892, 24 | 11.966440916992724, 25 | 11.30719208298251, 26 | 11.919239667011425, 27 | 11.903133832995081, 28 | 11.947826708987122, 29 | 11.717349375016056, 30 | 11.447638457990251, 31 | 11.941632540983846, 32 | 11.918223374988884, 33 | 11.805195124994498, 34 | 11.892586542002391, 35 | 12.127137292001862, 36 | 11.547379292023834, 37 | 11.897269000008237 38 | ] 39 | } -------------------------------------------------------------------------------- /benchmarks/results/collectlist_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "collectlist", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.12345570797333494, 7 | 0.1314004999876488, 8 | 0.12502691597910598, 9 | 0.12530479099950753, 10 | 0.12634062499273568, 11 | 0.12584854100714438, 12 | 0.12789558398071676, 13 | 0.12472200000775047, 14 | 0.1238887500076089, 15 | 0.13261420800699852, 16 | 0.12579454202204943, 17 | 0.13280487500014715, 18 | 0.12472591700498015, 19 | 0.12935254198964685, 20 | 0.12733795901294798, 21 | 0.1325765420042444, 22 | 0.12539891598862596, 23 | 0.12588458400568925, 24 | 0.12925195900606923, 25 | 0.12419299999601208, 26 | 0.12389950000215322, 27 | 0.12363229200127535, 28 | 0.13055249999160878, 29 | 0.12383425000007264, 30 | 0.12416162498993799, 31 | 0.12477433300227858, 32 | 0.12346441601403058, 33 | 0.12381883300258778, 34 | 0.12394650001078844, 35 | 0.12412324998877011, 36 | 0.12496170899248682, 37 | 0.12474145801388659, 38 | 0.1277002909919247, 39 | 0.12949495899374597, 40 | 0.13194816702161916, 41 | 0.12390154198510572, 42 | 0.12345825001830235, 43 | 0.1250534169957973, 44 | 0.12404837500071153, 45 | 0.12392250000266358, 46 | 0.12349095800891519, 47 | 0.12369754200335592, 48 | 0.12301004098844714, 49 | 0.12339300001622178, 50 | 0.12399666698183864, 51 | 0.12378454199642874, 52 | 0.12521987498621456, 53 | 0.12401437500375323, 54 | 0.1271002079884056, 55 | 0.12385850001010112, 56 | 0.12461154101765715, 57 | 0.12916650000261143, 58 | 0.1409682499943301, 59 | 0.1362035000056494, 60 | 0.13603016699198633, 61 | 0.13636175001738593, 62 | 0.14432266599033028, 63 | 0.12358908398891799, 64 | 0.12381654200726189, 65 | 0.13095516699831933, 66 | 0.12406137501238845, 67 | 0.12393783399602398, 68 | 0.12295912500121631, 69 | 0.12365083300392143, 70 | 0.12374750000890344, 71 | 0.12418616699869744, 72 | 0.12332833299296908, 73 | 0.12342066699056886, 74 | 0.12364624999463558, 75 | 0.12354191701160744, 76 | 0.12355058299726807, 77 | 0.12401170801604167, 78 | 0.12359929201193154, 79 | 0.12448004202451557, 80 | 0.12446349998936057, 81 | 0.12385987499146722, 82 | 0.1240622499899473, 83 | 
0.12475716599146836, 84 | 0.13379766599973664, 85 | 0.13572154100984335, 86 | 0.13705558300716802, 87 | 0.14468491700245067, 88 | 0.15964958298718557, 89 | 0.12460808298783377, 90 | 0.12353074998827651, 91 | 0.12293012501322664, 92 | 0.12347437502467074, 93 | 0.12478362501133233, 94 | 0.1258309579861816, 95 | 0.12435858300887048, 96 | 0.12403041598736309, 97 | 0.12377791601466015, 98 | 0.12300579200382344, 99 | 0.12372366600902751, 100 | 0.12322549999225885, 101 | 0.12399170798016712, 102 | 0.12390183401294053, 103 | 0.1246394159970805, 104 | 0.12383850000333041, 105 | 0.1230427919945214, 106 | 0.12371399998664856, 107 | 0.12325983299524523, 108 | 0.1240181670000311, 109 | 0.12403816697769798, 110 | 0.12381629200535826, 111 | 0.1249765410029795, 112 | 0.12391612501232885, 113 | 0.12437091598985717, 114 | 0.12400683399755508, 115 | 0.1270715839928016, 116 | 0.13924929199856706, 117 | 0.1370136250043288, 118 | 0.13644420797936618, 119 | 0.14451066602487117, 120 | 0.1584999579936266, 121 | 0.12467087499680929, 122 | 0.12404358299681917, 123 | 0.12400129198795184, 124 | 0.12341158301569521, 125 | 0.12442779101547785, 126 | 0.12422066699946299, 127 | 0.1319297500012908, 128 | 0.12391049999860115, 129 | 0.12305962498066947, 130 | 0.12398437500814907, 131 | 0.1236839999910444, 132 | 0.12356999999610707, 133 | 0.12458725000033155, 134 | 0.12558270900626667, 135 | 0.1299470840021968, 136 | 0.12408704101108015, 137 | 0.12365183301153593, 138 | 0.12331187500967644, 139 | 0.12352816699421965, 140 | 0.12367491700570099, 141 | 0.12497841700678691, 142 | 0.12355091699282639, 143 | 0.12515654202434234, 144 | 0.1254462499928195, 145 | 0.12387974999728613, 146 | 0.1302125419897493, 147 | 0.12347279200912453, 148 | 0.12541570799658075, 149 | 0.12406129197916016, 150 | 0.12351383300847374, 151 | 0.12356224999530241, 152 | 0.12350362501456402, 153 | 0.12993870800710283, 154 | 0.13651637500151992, 155 | 0.13703787501435727, 156 | 0.136697332985932, 157 | 0.13669541699346155, 158 | 0.15247504197759554, 159 | 0.12481941698933952, 160 | 0.12601887501659803, 161 | 0.12487887497991323, 162 | 0.12363970800652169, 163 | 0.12311479201889597, 164 | 0.12400833301944658, 165 | 0.12335287500172853, 166 | 0.12409445899538696 167 | ] 168 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 36.188880041998345, 7 | 35.77120683397516, 8 | 35.89365566600463, 9 | 35.60720691701863, 10 | 35.81423420799547, 11 | 35.66271516599227, 12 | 35.84787024999969, 13 | 35.752700749988435, 14 | 36.1162334579858, 15 | 35.668734874983784, 16 | 35.496447625017026, 17 | 35.78953500001808, 18 | 35.481063749990426, 19 | 35.545604249986354, 20 | 35.45867395901587, 21 | 35.56992366700433, 22 | 35.742496374994516, 23 | 35.539746249996824, 24 | 35.67015320900828, 25 | 35.719724208000116, 26 | 35.8916146249976, 27 | 35.6827434169827, 28 | 35.925275417015655, 29 | 35.92435587500222, 30 | 35.622160916012945, 31 | 35.60375379101606, 32 | 35.69027008401463, 33 | 36.12705849998747, 34 | 36.063100625004154, 35 | 35.65569358400535, 36 | 35.75822524999967, 37 | 35.81311866699252, 38 | 35.969940707989736 39 | ] 40 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_medium.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 12.45473395800218, 7 | 12.484648000012385, 8 | 12.400262917013606, 9 | 12.440737958007958, 10 | 12.452081541006919, 11 | 12.45336887499434, 12 | 12.473071416985476, 13 | 12.459413582982961, 14 | 12.544886957999552, 15 | 12.565210083004786, 16 | 12.474220750009408, 17 | 12.518661708018044, 18 | 12.42703645900474, 19 | 12.512266999983694, 20 | 12.47433920900221, 21 | 12.494368834013585, 22 | 12.473423833027482, 23 | 12.557817583001452, 24 | 12.480229584005428, 25 | 12.469799874990713, 26 | 12.42109241601429, 27 | 12.525904500012984, 28 | 12.395361124974443, 29 | 12.4593050830008, 30 | 12.431161542015616, 31 | 12.513594541989733, 32 | 12.510616583022056, 33 | 12.537003458011895, 34 | 12.404833499982487 35 | ] 36 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.17894395801704377, 7 | 0.18047670801752247, 8 | 0.1778973330219742, 9 | 0.1768727089802269, 10 | 0.18268641698523425, 11 | 0.18213908301549964, 12 | 0.18128341698320583, 13 | 0.1818746250064578, 14 | 0.1774590410059318, 15 | 0.1772789589886088, 16 | 0.17803700000513345, 17 | 0.1780551660049241, 18 | 0.17709937499603257, 19 | 0.17629362500156276, 20 | 0.17730016700807028, 21 | 0.17746695800451562, 22 | 0.18029112499789335, 23 | 0.18132383402553387, 24 | 0.17189087497536093, 25 | 0.18303454198758118, 26 | 0.17968620802275836, 27 | 0.1802715000230819, 28 | 0.17883554100990295, 29 | 0.17680829201708548, 30 | 0.18031212501227856, 31 | 0.1771631249866914, 32 | 0.1782566249894444, 33 | 0.1816232909914106, 34 | 0.1801042500010226, 35 | 0.18101133400341496, 36 | 0.17983466701116413, 37 | 0.17949583401787095, 38 | 0.17818658400210552, 39 | 0.17664745802176185, 40 | 0.1771139999909792, 41 | 0.17674108300707303, 42 | 0.17973291699308902, 43 | 0.1802107500261627, 44 | 0.1762191250163596, 45 | 0.17706849999376573, 46 | 0.17918199999257922, 47 | 0.17786145798163489, 48 | 0.17716412502340972, 49 | 0.1766410409763921, 50 | 0.1768924999923911, 51 | 0.1775729159999173, 52 | 0.17990487499628216, 53 | 0.18075749999843538, 54 | 0.18906725000124425, 55 | 0.17940695802099071, 56 | 0.17809845801093616, 57 | 0.1767636250006035, 58 | 0.1777554590080399, 59 | 0.17755454202415422, 60 | 0.18041850000736304, 61 | 0.1767314170137979, 62 | 0.18011308400309645, 63 | 0.1974650830088649, 64 | 0.17981416700058617, 65 | 0.17755545899854042, 66 | 0.1817649999866262, 67 | 0.18100174999563023, 68 | 0.17874133397708647, 69 | 0.17930370901012793, 70 | 0.1778174999926705, 71 | 0.17907295800978318, 72 | 0.18477404201985337, 73 | 0.17951629200251773, 74 | 0.17831079100142233, 75 | 0.17696441698353738, 76 | 0.17663983299280517, 77 | 0.18076049999217503, 78 | 0.18051687499973923, 79 | 0.1794402500090655, 80 | 0.17824170799576677, 81 | 0.17966912500560284, 82 | 0.18000287501490675, 83 | 0.17883608298143372, 84 | 0.1772286659979727, 85 | 0.17074179102201015, 86 | 0.1788426669954788, 87 | 0.17712083397782408, 88 | 0.17752358398865908, 89 | 0.17786058300407603, 90 | 0.17938104100176133, 91 | 0.17789508399437182, 92 | 0.17982129100710154, 93 | 0.17926004200126044, 94 | 0.17941241699736565, 95 | 0.17706845901557244, 96 | 
0.17756150002242066, 97 | 0.17965420801192522, 98 | 0.17814670799998567, 99 | 0.17619208298856393, 100 | 0.17750854199402966, 101 | 0.17663995901239105, 102 | 0.17970875001628883, 103 | 0.1802513329894282, 104 | 0.17710229099611752, 105 | 0.1773068750044331, 106 | 0.17719570797635242, 107 | 0.1766349590034224, 108 | 0.17752995900809765, 109 | 0.17610691700247116, 110 | 0.17689754100865684, 111 | 0.17640399999800138, 112 | 0.17956375001813285, 113 | 0.18023837500368245, 114 | 0.17701083299471065, 115 | 0.17760500000440516, 116 | 0.17637616698630154, 117 | 0.18168737497762777, 118 | 0.18223700000089593 119 | ] 120 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_xsmall.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "xsmall", 4 | "dataset_size": 1000, 5 | "runtimes": [ 6 | 0.06271020899293944, 7 | 0.0635450420086272, 8 | 0.06335712500731461, 9 | 0.06321512500289828, 10 | 0.062088957987725735, 11 | 0.06404470797860995, 12 | 0.0631327080191113, 13 | 0.06216233299346641, 14 | 0.06263370800297707, 15 | 0.06245729100191966, 16 | 0.06296845900942571, 17 | 0.06254270899808034, 18 | 0.06320150001556613, 19 | 0.06327195902122185, 20 | 0.06219645799137652, 21 | 0.06233966600848362, 22 | 0.06254474999150261, 23 | 0.06455795798683539, 24 | 0.06295058300020173, 25 | 0.06280325000989251, 26 | 0.06276320898905396, 27 | 0.06324816701817326, 28 | 0.06330595898907632, 29 | 0.06303054100135341, 30 | 0.06244050001259893, 31 | 0.06298899999819696, 32 | 0.06360708401189186, 33 | 0.06316616598633118, 34 | 0.06316054199123755, 35 | 0.06246474999352358, 36 | 0.06387491698842496, 37 | 0.05970104201696813, 38 | 0.0637037499982398, 39 | 0.05698100000154227, 40 | 0.062209665979025885, 41 | 0.062292166985571384, 42 | 0.06463962499401532, 43 | 0.0631676249904558, 44 | 0.06258800000068732, 45 | 0.07127483398653567, 46 | 0.062426417018286884, 47 | 0.062181249988498166, 48 | 0.0633687500085216, 49 | 0.06301658399752341, 50 | 0.062204457994084805, 51 | 0.0635230419866275, 52 | 0.06324341602157801, 53 | 0.06341391601017676, 54 | 0.06350133300293237, 55 | 0.06179737497586757, 56 | 0.06287929098471068, 57 | 0.06224550001206808, 58 | 0.06272991601144895, 59 | 0.06310937501257285, 60 | 0.061683125008130446, 61 | 0.06274387502344325, 62 | 0.05475016700802371, 63 | 0.06346645799931139, 64 | 0.06321550000575371, 65 | 0.06312870798865333, 66 | 0.06330529201659374, 67 | 0.057836541993310675, 68 | 0.06378749999566935, 69 | 0.062172083009500057, 70 | 0.0622389999916777, 71 | 0.06221112501225434, 72 | 0.06303629197645932, 73 | 0.061823541997000575, 74 | 0.06333087501116097, 75 | 0.06266720799612813, 76 | 0.062348166975425556, 77 | 0.061840707989176735, 78 | 0.06385124998632818, 79 | 0.06369624999933876, 80 | 0.06562579199089669, 81 | 0.05890387500403449, 82 | 0.0646380000107456, 83 | 0.06226920799235813, 84 | 0.06261112500214949, 85 | 0.06252599999425001, 86 | 0.06316274998243898, 87 | 0.06251712498487905, 88 | 0.06276937498478219, 89 | 0.06257433400605805, 90 | 0.0631431249785237, 91 | 0.06309254097868688, 92 | 0.06353920898982324, 93 | 0.06316645900369622, 94 | 0.06292100000428036, 95 | 0.06184179100091569, 96 | 0.06192958299652673, 97 | 0.06376187500427477, 98 | 0.06397637500776909, 99 | 0.060634541005128995, 100 | 0.05874520802171901, 101 | 0.06295916601084173, 102 | 0.06267850002041087, 103 | 0.06178716701106168, 104 | 0.06267579199629836, 105 | 0.06213916698470712, 106 | 
0.06340775001444854, 107 | 0.06389649998163804, 108 | 0.06311183399520814, 109 | 0.06350162500166334, 110 | 0.06177533301524818, 111 | 0.06338916599634103, 112 | 0.06310429199947976, 113 | 0.061465624981792644, 114 | 0.06373420800082386, 115 | 0.06199354201089591, 116 | 0.06215308399987407, 117 | 0.06257812498370185, 118 | 0.0633107080066111, 119 | 0.06273208401398733, 120 | 0.0631805420271121, 121 | 0.06331116700312123, 122 | 0.06246858401573263, 123 | 0.06368912500329316, 124 | 0.06410987497656606, 125 | 0.06336450000526384, 126 | 0.06258158400305547, 127 | 0.06312241600244306, 128 | 0.06379858302534558, 129 | 0.06289729100535624, 130 | 0.06289037500391714, 131 | 0.06203354100580327, 132 | 0.062200749991461635, 133 | 0.06217162500252016, 134 | 0.06172491700272076, 135 | 0.06542712499503978, 136 | 0.06341266701929271, 137 | 0.06175950000761077, 138 | 0.06402004201663658, 139 | 0.06260204099817201, 140 | 0.06249041701084934, 141 | 0.06243433299823664, 142 | 0.0627741249918472, 143 | 0.06282150000333786, 144 | 0.062061250006081536, 145 | 0.06406637499458157, 146 | 0.06280062498990446, 147 | 0.062304459017468616, 148 | 0.06356004200642928, 149 | 0.06283041698043235, 150 | 0.061882417008746415, 151 | 0.06290529097896069, 152 | 0.06289950001519173, 153 | 0.06333416598499753, 154 | 0.0628599580086302, 155 | 0.06355745901237242, 156 | 0.06196416600141674, 157 | 0.06188041699351743, 158 | 0.06376770898350514, 159 | 0.06300670900964178, 160 | 0.06230516699724831, 161 | 0.06191929100896232, 162 | 0.06300550000742078, 163 | 0.06300841600750573, 164 | 0.06200337500195019, 165 | 0.06354029200156219, 166 | 0.0659458750160411, 167 | 0.05894091600202955, 168 | 0.06434062501648441, 169 | 0.0626857919851318, 170 | 0.06448016598005779, 171 | 0.061678500002017245, 172 | 0.06217950000427663, 173 | 0.06420358398463577, 174 | 0.06217670798650943, 175 | 0.06327091698767617, 176 | 0.06339983301586471, 177 | 0.0625319580140058, 178 | 0.06349887500982732, 179 | 0.06309629100724123, 180 | 0.061857416993007064, 181 | 0.06333012497634627, 182 | 0.062418874993454665, 183 | 0.06239558299421333, 184 | 0.06274174997815862, 185 | 0.06306512499577366, 186 | 0.06182762500247918, 187 | 0.06269933300791308, 188 | 0.06308016699040309, 189 | 0.06277629200485535, 190 | 0.06207137499586679, 191 | 0.06244466599309817, 192 | 0.0633198749856092, 193 | 0.06167837500106543, 194 | 0.06364629199379124, 195 | 0.06294808301026933, 196 | 0.05437820800580084, 197 | 0.06281262502307072, 198 | 0.06303454199223779, 199 | 0.06329387499135919, 200 | 0.06553379198885523, 201 | 0.06447225000010803, 202 | 0.0625169170089066, 203 | 0.06290137500036508, 204 | 0.06351570802507922, 205 | 0.06383208397892304, 206 | 0.06378558400319889, 207 | 0.06266166700515896, 208 | 0.06600916699972004, 209 | 0.065376666985685, 210 | 0.06304545799503103, 211 | 0.06368670801748522, 212 | 0.06373416702263057, 213 | 0.06324483299977146, 214 | 0.0623699999996461, 215 | 0.06250420899596065, 216 | 0.0624531659996137, 217 | 0.061555457999929786, 218 | 0.06325995799852535, 219 | 0.06258933400386013, 220 | 0.06306745801703073, 221 | 0.06318608298897743, 222 | 0.06250525001087226, 223 | 0.06293795901001431, 224 | 0.06285270798252895, 225 | 0.06319583300501108, 226 | 0.06153624999569729, 227 | 0.06166912498883903, 228 | 0.06316866699489765, 229 | 0.0639053330232855, 230 | 0.06248512500314973, 231 | 0.06289591599488631, 232 | 0.06300166697474197, 233 | 0.06208249999326654, 234 | 0.06384591700043529, 235 | 0.06281524998485111, 236 | 0.06237158298608847, 237 | 0.062482915993314236, 
238 | 0.06318995798937976, 239 | 0.06263566701090895, 240 | 0.06326208400423639, 241 | 0.06235483300406486, 242 | 0.062092083011521026, 243 | 0.062356249982258305, 244 | 0.06236979100503959, 245 | 0.06236908299615607, 246 | 0.06264866699348204, 247 | 0.0623248330084607, 248 | 0.06337700001313351, 249 | 0.06261341698700562, 250 | 0.06188200000906363, 251 | 0.06209420898812823, 252 | 0.06319358400651254, 253 | 0.06200754200108349, 254 | 0.06214087500120513, 255 | 0.06296975002624094, 256 | 0.06244700000388548, 257 | 0.062316083000041544, 258 | 0.06276762500056066, 259 | 0.06344370800070465, 260 | 0.06250591701245867, 261 | 0.06244683300610632, 262 | 0.062258125020889565, 263 | 0.0626247079926543, 264 | 0.06162374999257736, 265 | 0.06347287498647347, 266 | 0.06327266700100154, 267 | 0.06206795800244436, 268 | 0.06352941697696224, 269 | 0.06299666600534692, 270 | 0.06262033397797495, 271 | 0.0635691249917727, 272 | 0.06253383302828297, 273 | 0.06278875001589768, 274 | 0.055675582989351824, 275 | 0.06303029198898003, 276 | 0.06262429201160558, 277 | 0.06254154100315645, 278 | 0.06339245798881166, 279 | 0.06276429200079292, 280 | 0.06166370800929144, 281 | 0.06227633301750757, 282 | 0.06254345900379121, 283 | 0.06165145800332539, 284 | 0.06242929198197089, 285 | 0.06382783301523887, 286 | 0.06225416698725894, 287 | 0.06241558401961811, 288 | 0.06248187500750646, 289 | 0.06359933299245313, 290 | 0.06395112501922995, 291 | 0.06229025000357069, 292 | 0.06295412499457598, 293 | 0.06498241701046936, 294 | 0.06674179100082256, 295 | 0.07840916700661182, 296 | 0.06708204199094325, 297 | 0.061809292004909366, 298 | 0.06293816599645652, 299 | 0.06328933301847428, 300 | 0.06201166598475538, 301 | 0.06213245901744813, 302 | 0.062167500000214204, 303 | 0.06238087499514222, 304 | 0.06289520900463685, 305 | 0.06334875000175089, 306 | 0.06292608298826963, 307 | 0.06222179200267419, 308 | 0.06339583298540674, 309 | 0.06269570800941437, 310 | 0.06143370800418779, 311 | 0.0618649169919081, 312 | 0.06222004198934883, 313 | 0.06312449998222291, 314 | 0.06340012498549186, 315 | 0.06352550000883639, 316 | 0.06266974998288788, 317 | 0.0615700000198558, 318 | 0.06288187499740161, 319 | 0.06359874998452142, 320 | 0.06129166699247435, 321 | 0.06143262499244884, 322 | 0.06272533399169333 323 | ] 324 | } -------------------------------------------------------------------------------- /benchmarks/results/localIterator_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "localIterator", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 142.63744066699292, 7 | 144.66499787499197, 8 | 144.58708516601473, 9 | 143.8303821659938, 10 | 144.1865681670024, 11 | 142.28104958301992, 12 | 141.77062158400076, 13 | 142.2637243749923, 14 | 142.05179520900128 15 | ] 16 | } -------------------------------------------------------------------------------- /benchmarks/results/localIterator_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "localIterator", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 14.169408250018023, 7 | 14.201851833000546, 8 | 14.226777459000004, 9 | 14.27066791697871, 10 | 14.312426666991087, 11 | 14.300455041025998, 12 | 14.31601262500044, 13 | 14.306134959013434, 14 | 14.316025750013068, 15 | 14.288483624986839, 16 | 14.255477875005454, 17 | 14.252781917020911, 18 | 14.275479709001957, 19 | 14.253912209009286, 20 | 14.302826917002676, 
21 | 14.300289417005843, 22 | 14.241876915999455, 23 | 14.261930708016735, 24 | 14.304426707996754, 25 | 14.276426333002746, 26 | 14.25401162498747, 27 | 14.275975541997468, 28 | 14.250861790991621, 29 | 14.247211042005802, 30 | 14.321850750013255, 31 | 14.335214499995345 32 | ] 33 | } -------------------------------------------------------------------------------- /benchmarks/results/localIterator_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "localIterator", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.1572923339845147, 7 | 0.15610516601009294, 8 | 0.15729875001125038, 9 | 0.15489737497409806, 10 | 0.15635183299309574, 11 | 0.15574870799900964, 12 | 0.15605116699589416, 13 | 0.15766783300205134, 14 | 0.15673399998922832, 15 | 0.156182542006718, 16 | 0.155715875007445, 17 | 0.15624404099071398, 18 | 0.15578624999034218, 19 | 0.1647759170155041, 20 | 0.15573512500850484, 21 | 0.15536620799684897, 22 | 0.15588595799636096, 23 | 0.15540254101506434, 24 | 0.1557722500001546, 25 | 0.15479129197774455, 26 | 0.15707508299965411, 27 | 0.1547249169962015, 28 | 0.1563042080088053, 29 | 0.16178824999951757, 30 | 0.15583224999136291, 31 | 0.15531091700540856, 32 | 0.1602989160164725, 33 | 0.163350624992745, 34 | 0.15613983297953382, 35 | 0.15645695800776593, 36 | 0.15639954200014472, 37 | 0.15600724998512305, 38 | 0.15622070801327936, 39 | 0.1562975829874631, 40 | 0.1560622500255704, 41 | 0.15612291701836511, 42 | 0.1556804169958923, 43 | 0.1590131660050247, 44 | 0.15829995801323093, 45 | 0.15580970799783245, 46 | 0.1557230000034906, 47 | 0.15624499999103136, 48 | 0.15629954199539497, 49 | 0.15596645799814723, 50 | 0.15629758397699334, 51 | 0.156585292017553, 52 | 0.157058666984085, 53 | 0.15553504100535065, 54 | 0.15594116700231098, 55 | 0.15548545902129263, 56 | 0.1660226249950938, 57 | 0.15594999998575076, 58 | 0.1553985000064131, 59 | 0.15553233301034197, 60 | 0.1561224999895785, 61 | 0.1560029999818653, 62 | 0.15547154101659544, 63 | 0.1558478329970967, 64 | 0.15539704100228846, 65 | 0.15595591699820943, 66 | 0.15522774998680688, 67 | 0.1551085830142256, 68 | 0.15504537502420135, 69 | 0.1558237909921445, 70 | 0.15684320899890736, 71 | 0.15544366600806825, 72 | 0.15511745799449272, 73 | 0.15626950000296347, 74 | 0.1555634169781115, 75 | 0.16012516702176072, 76 | 0.15576475000125356, 77 | 0.1555737080052495, 78 | 0.15493974997662008, 79 | 0.15651558301760815, 80 | 0.15583595901262015, 81 | 0.1553781669936143, 82 | 0.1554269159969408, 83 | 0.1567337499873247, 84 | 0.1555281670007389, 85 | 0.15598245899309404, 86 | 0.15549199999077246, 87 | 0.15640129099483602, 88 | 0.15553787499084137, 89 | 0.15569637500448152, 90 | 0.15516895800828934, 91 | 0.15679262499907054, 92 | 0.15679995800019242, 93 | 0.15697504099807702, 94 | 0.16654237499460578, 95 | 0.15597804100252688, 96 | 0.16012933300225995, 97 | 0.15807808400131762, 98 | 0.15614604199072346, 99 | 0.15587529199547134, 100 | 0.15717116600717418, 101 | 0.1560669590253383, 102 | 0.15600720801739953, 103 | 0.15588104201015085, 104 | 0.1548118340142537, 105 | 0.1565489580098074, 106 | 0.15605266598868184, 107 | 0.15582233399618417, 108 | 0.1555045830027666, 109 | 0.15628379202098586, 110 | 0.15611258300486952, 111 | 0.1557881670014467, 112 | 0.15488887502579018, 113 | 0.15548566699726507, 114 | 0.1555823750095442, 115 | 0.15565433399751782, 116 | 0.15696658301749267, 117 | 0.15633958400576375, 118 | 0.15659804100869223, 119 | 0.15663466698606499, 120 | 
0.1551692089997232, 121 | 0.1610780830087606, 122 | 0.15597808299935423, 123 | 0.1559513749962207, 124 | 0.15642320801271126, 125 | 0.15623020901693963, 126 | 0.15619325000443496, 127 | 0.15604158301721327, 128 | 0.15494554198812693, 129 | 0.15633858399814926, 130 | 0.15586795800481923, 131 | 0.15480283298529685, 132 | 0.15543520799838006, 133 | 0.16281329200137407 134 | ] 135 | } -------------------------------------------------------------------------------- /benchmarks/results/map_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 38.044695958000375, 7 | 37.88741087500239, 8 | 37.893524833983975, 9 | 38.120276041998295, 10 | 38.02909308302333, 11 | 37.9263411249849, 12 | 37.68712725001387, 13 | 37.93799850001233, 14 | 38.03957070899196, 15 | 38.126094834005926, 16 | 37.762346417002846, 17 | 38.304923457995756, 18 | 38.108259917004034, 19 | 38.04698508299771, 20 | 37.922059125005035, 21 | 37.88537779197213, 22 | 38.373752999992575, 23 | 37.6934795420093, 24 | 38.058965083007934, 25 | 37.86218554197694, 26 | 38.01557258400135, 27 | 38.16043354201247, 28 | 38.424862250016304, 29 | 38.14441895799246, 30 | 38.030545084009646, 31 | 38.190908041986404, 32 | 37.92010895800195, 33 | 38.539197249978315, 34 | 38.05922462500166, 35 | 38.042172666988336, 36 | 38.06336879200535, 37 | 37.96962983297999 38 | ] 39 | } -------------------------------------------------------------------------------- /benchmarks/results/map_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 13.511881792015629, 7 | 13.617772792000324, 8 | 13.519385208026506, 9 | 13.490683124982752, 10 | 13.626960124995094, 11 | 13.508057041995926, 12 | 13.502069417008897, 13 | 13.563101709005423, 14 | 13.507099166017724, 15 | 13.552681833010865, 16 | 13.591525916999672, 17 | 13.551621666003484, 18 | 13.518412290984998, 19 | 13.451721041987184, 20 | 13.499396291008452, 21 | 13.614300333021674, 22 | 13.563594542007195, 23 | 13.463782207982149, 24 | 13.588725749985315, 25 | 13.636522250017151, 26 | 13.591557374980766, 27 | 13.512941416993272, 28 | 13.552888249978423, 29 | 13.606033250020118, 30 | 13.565899540990358, 31 | 13.561953833006555, 32 | 13.635592833015835 33 | ] 34 | } -------------------------------------------------------------------------------- /benchmarks/results/map_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.1891134589968715, 7 | 0.19210041698534042, 8 | 0.1904449590074364, 9 | 0.19111270899884403, 10 | 0.19105345799471252, 11 | 0.1876772920077201, 12 | 0.19078433298273012, 13 | 0.18876975000603124, 14 | 0.19013570799143054, 15 | 0.18736258399439976, 16 | 0.1916751669778023, 17 | 0.18777787499129772, 18 | 0.19012708301306702, 19 | 0.18837083299877122, 20 | 0.18670304099214263, 21 | 0.19053699998767115, 22 | 0.18963375000748783, 23 | 0.19170491700060666, 24 | 0.1877893749915529, 25 | 0.18843620899133384, 26 | 0.1901267080102116, 27 | 0.18844208301743492, 28 | 0.1905399159877561, 29 | 0.19096729197190143, 30 | 0.19023104198276997, 31 | 0.18846241701976396, 32 | 0.19002441599150188, 33 | 0.1901898330252152, 34 | 0.19257025001570582, 35 | 0.18755800000508316, 36 | 0.19046029102173634, 
37 | 0.18996654197690077, 38 | 0.19062725000549108, 39 | 0.19637816699105315, 40 | 0.18948379199719056, 41 | 0.19232058300985955, 42 | 0.19094045800738968, 43 | 0.19090891699306667, 44 | 0.18941837499733083, 45 | 0.18869641597848386, 46 | 0.1972927499737125, 47 | 0.19141366600524634, 48 | 0.19254774998989888, 49 | 0.1912544580118265, 50 | 0.18878950001089834, 51 | 0.18837604200234637, 52 | 0.190177834010683, 53 | 0.19098016701173037, 54 | 0.1886746659874916, 55 | 0.18773966701701283, 56 | 0.19070679100695997, 57 | 0.18930591698153876, 58 | 0.1896120419842191, 59 | 0.18997891701292247, 60 | 0.18771916697733104, 61 | 0.19129891600459814, 62 | 0.19054650000180118, 63 | 0.19020145799731836, 64 | 0.19060395800624974, 65 | 0.18762170898844488, 66 | 0.1898857920023147, 67 | 0.19109079102054238, 68 | 0.19092062499839813, 69 | 0.1904410419811029, 70 | 0.18772612500470132, 71 | 0.1871835410129279, 72 | 0.19085220800479874, 73 | 0.19084308302262798, 74 | 0.18706516700331122, 75 | 0.18756166700040922, 76 | 0.18911158401169814, 77 | 0.1875695830094628, 78 | 0.19037070800550282, 79 | 0.19025616699946113, 80 | 0.18603216600604355, 81 | 0.18723708399920724, 82 | 0.18987504197866656, 83 | 0.191273666016059, 84 | 0.18810095801018178, 85 | 0.19217291599488817, 86 | 0.19653324998216704, 87 | 0.1821762080071494, 88 | 0.1908734999888111, 89 | 0.19063633400946856, 90 | 0.19348950000130571, 91 | 0.19051245800801553, 92 | 0.19168770901160315, 93 | 0.19086062497808598, 94 | 0.19057970799622126, 95 | 0.19091025000670925, 96 | 0.19122595799854025, 97 | 0.18856474998756312, 98 | 0.19025908398907632, 99 | 0.1902516660047695, 100 | 0.1871023330022581, 101 | 0.18909866700414568, 102 | 0.19003108300967142, 103 | 0.19090791599592194, 104 | 0.1916295829869341, 105 | 0.19268283300334588, 106 | 0.19054812501417473, 107 | 0.18951087500317954, 108 | 0.1900284580187872, 109 | 0.1899386669974774, 110 | 0.18759595800656825 111 | ] 112 | } -------------------------------------------------------------------------------- /benchmarks/results/map_xsmall.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "xsmall", 4 | "dataset_size": 1000, 5 | "runtimes": [ 6 | 0.06401224998990074, 7 | 0.06418708301498555, 8 | 0.062261334009235725, 9 | 0.06443562501226552, 10 | 0.0632782919856254, 11 | 0.06306379099260084, 12 | 0.06327087499084882, 13 | 0.06286170898238197, 14 | 0.06227070800377987, 15 | 0.06216416700044647, 16 | 0.05693904199870303, 17 | 0.06316337501630187, 18 | 0.06364212499465793, 19 | 0.06387220800388604, 20 | 0.06336487500811927, 21 | 0.06294449997949414, 22 | 0.06370962498476729, 23 | 0.06356845897971652, 24 | 0.0639103330031503, 25 | 0.06442991699441336, 26 | 0.063629915995989, 27 | 0.06369895898387767, 28 | 0.06332191699766554, 29 | 0.0631050419760868, 30 | 0.06261091600754298, 31 | 0.061854958010371774, 32 | 0.06402333299047314, 33 | 0.06473608399392106, 34 | 0.06247470900416374, 35 | 0.062201416003517807, 36 | 0.0645946660079062, 37 | 0.06295474999933504, 38 | 0.06320237502222881, 39 | 0.06425979200867005, 40 | 0.06295583400060423, 41 | 0.06250070800888352, 42 | 0.06336974998703226, 43 | 0.06363258301280439, 44 | 0.06313208301435225, 45 | 0.06288966699503362, 46 | 0.06368470800225623, 47 | 0.06350766698596999, 48 | 0.06388804101152346, 49 | 0.06388683401746675, 50 | 0.06406479200813919, 51 | 0.06303812499390915, 52 | 0.06303600000683218, 53 | 0.06386075000045821, 54 | 0.06234220799524337, 55 | 0.06294887498370372, 56 | 0.0634906250052154, 57 | 
0.0660005829995498, 58 | 0.059778791008284315, 59 | 0.06416095801978372, 60 | 0.0634279579971917, 61 | 0.06308670801809058, 62 | 0.06403333399794064, 63 | 0.06426183399162255, 64 | 0.06455233300221153, 65 | 0.06467220798367634, 66 | 0.0640352499904111, 67 | 0.06321545800892636, 68 | 0.06419095798628405, 69 | 0.0643165830115322, 70 | 0.06453616698854603, 71 | 0.06275262500275858, 72 | 0.06344425000133924, 73 | 0.06365591599023901, 74 | 0.06306587500148453, 75 | 0.06312562499078922, 76 | 0.06393025000579655, 77 | 0.0638675410009455, 78 | 0.06407720799325034, 79 | 0.06418420898262411, 80 | 0.0635341249871999, 81 | 0.063094999990426, 82 | 0.06356812501326203, 83 | 0.06378241602214985, 84 | 0.06378399999812245, 85 | 0.06269395901472308, 86 | 0.064241290994687, 87 | 0.0636746249801945, 88 | 0.06431408401113003, 89 | 0.06347541700233705, 90 | 0.06370437500299886, 91 | 0.0634606670064386, 92 | 0.06362745899241418, 93 | 0.0643161250045523, 94 | 0.061846875003539026, 95 | 0.07015920800040476, 96 | 0.06450620800023898, 97 | 0.06384200000320561, 98 | 0.063679165992653, 99 | 0.06373883300693706, 100 | 0.06338008400052786, 101 | 0.0637870830250904, 102 | 0.06380837497999892, 103 | 0.064563249994535, 104 | 0.06406691600568593, 105 | 0.06349370800307952, 106 | 0.06414991701603867, 107 | 0.06779629099764861, 108 | 0.0652000840054825, 109 | 0.06393645799835213, 110 | 0.06343612502678297, 111 | 0.06305124997743405, 112 | 0.0645338750036899, 113 | 0.06420533399796113, 114 | 0.06327091701678, 115 | 0.06383362499764189, 116 | 0.06330470799002796, 117 | 0.0635799580195453, 118 | 0.06390908299363218, 119 | 0.0642478329828009, 120 | 0.06382416700944304, 121 | 0.06274441699497402, 122 | 0.0648237080022227, 123 | 0.06421766598941758, 124 | 0.06274287501582876, 125 | 0.06418754099286161, 126 | 0.06337112499750219, 127 | 0.06352683299337514, 128 | 0.06372895801905543, 129 | 0.06390520802233368, 130 | 0.06371245800983161, 131 | 0.06338716699974611, 132 | 0.06575258300290443, 133 | 0.06351533398265019, 134 | 0.06342225000844337, 135 | 0.06350208300864324, 136 | 0.06461916698026471, 137 | 0.063936584017938, 138 | 0.06390541599830613, 139 | 0.06407525000395253, 140 | 0.06340216699754819, 141 | 0.06282837499747984, 142 | 0.06462962500518188, 143 | 0.0641327920020558, 144 | 0.06488316599279642, 145 | 0.06303525000112131, 146 | 0.06462529199779965, 147 | 0.06373324999003671, 148 | 0.06337058398639783, 149 | 0.06313145800959319, 150 | 0.06430174998240545, 151 | 0.06479545799084008, 152 | 0.0637904159957543, 153 | 0.06566670801839791, 154 | 0.05965949999517761, 155 | 0.06499429099494591, 156 | 0.0647090419952292, 157 | 0.0642154160013888, 158 | 0.06312849998357706, 159 | 0.06434266699943691, 160 | 0.06338929201592691, 161 | 0.0636489580210764, 162 | 0.064640500000678, 163 | 0.06383749999804422, 164 | 0.06494691700208932, 165 | 0.06376024999190122, 166 | 0.06398558302316815, 167 | 0.06361199999810196, 168 | 0.06372262499644421, 169 | 0.0642287080117967, 170 | 0.06415083300089464, 171 | 0.06340141699183732, 172 | 0.06467233298462816, 173 | 0.06443112500710413, 174 | 0.06474558298941702, 175 | 0.06307387497508898, 176 | 0.06402662498294376, 177 | 0.06344970798818395, 178 | 0.06343229199410416, 179 | 0.06341170900850557, 180 | 0.06379575000028126, 181 | 0.06319029198493809, 182 | 0.06499416701262817, 183 | 0.06463583299773745, 184 | 0.06370729199261405, 185 | 0.06316495800274424, 186 | 0.06409333299961872, 187 | 0.06415633400320075, 188 | 0.06386249998467974, 189 | 0.0635423339845147, 190 | 0.06403995800064877, 191 | 
0.06380958401132375, 192 | 0.06409708299906924, 193 | 0.06439775001490489, 194 | 0.06419975002063438, 195 | 0.06400912502431311, 196 | 0.06388608302222565, 197 | 0.06453558398061432, 198 | 0.06436237500747666, 199 | 0.0635280410060659, 200 | 0.06348933299886994, 201 | 0.06393875001231208, 202 | 0.06437704098061658, 203 | 0.06505916701280512, 204 | 0.06457829201826826, 205 | 0.062883333011996, 206 | 0.06464354100171477, 207 | 0.06355437499587424, 208 | 0.06430345799890347, 209 | 0.06429766697692685, 210 | 0.06383433402515948, 211 | 0.06387799998628907, 212 | 0.0643800419929903, 213 | 0.06438437500037253, 214 | 0.0638653339992743, 215 | 0.06345554100698791, 216 | 0.06449558300664648, 217 | 0.06388316600350663, 218 | 0.06479329202556983, 219 | 0.06254474999150261, 220 | 0.06371562500135042, 221 | 0.06407887500245124, 222 | 0.06431366599281318, 223 | 0.0644370420195628, 224 | 0.06472729201777838, 225 | 0.06416037501185201, 226 | 0.06474170801811852, 227 | 0.06363495800178498, 228 | 0.0645637080015149, 229 | 0.05807462500524707, 230 | 0.0649033329973463, 231 | 0.06361041698255576, 232 | 0.06451291698613204, 233 | 0.06469370899139903, 234 | 0.0640999170136638, 235 | 0.0660214580129832, 236 | 0.0646264590031933, 237 | 0.06487620898406021, 238 | 0.06401458298205398, 239 | 0.06325124998693354, 240 | 0.06438924997928552, 241 | 0.0642430419975426, 242 | 0.0638723750016652, 243 | 0.0642105000151787, 244 | 0.0652872080099769, 245 | 0.06383195900707506, 246 | 0.06463574999361299, 247 | 0.06503337499452755, 248 | 0.06614325000555255, 249 | 0.06922112501342781, 250 | 0.06523287500021979, 251 | 0.06401958299102262, 252 | 0.06478358301683329, 253 | 0.0634410829807166, 254 | 0.06447337500867434, 255 | 0.06433170801028609, 256 | 0.06458100001327693, 257 | 0.06401320800068788, 258 | 0.06439433299237862, 259 | 0.06442100001731887, 260 | 0.064551624993328, 261 | 0.06353462499100715, 262 | 0.06405608300701715, 263 | 0.06532541700289585, 264 | 0.06410345798940398, 265 | 0.06398383298073895, 266 | 0.0647152500168886, 267 | 0.06400387501344085, 268 | 0.06353183399187401, 269 | 0.06437620898941532, 270 | 0.06458066700724885, 271 | 0.06377395801246166, 272 | 0.06508633299381472, 273 | 0.06455716700293124, 274 | 0.06426795799052343, 275 | 0.06406862498261034, 276 | 0.06429037501220591, 277 | 0.06486216600751504, 278 | 0.06356041599065065, 279 | 0.06424241600325331, 280 | 0.06479370797751471, 281 | 0.06473379101953469, 282 | 0.06486316601512954, 283 | 0.06432462498196401, 284 | 0.06498587501118891, 285 | 0.0635620410030242, 286 | 0.0642983750149142, 287 | 0.06517600000370294, 288 | 0.06468945799861103, 289 | 0.06470412499038503, 290 | 0.06376245801220648, 291 | 0.064886917010881, 292 | 0.06441520800581202, 293 | 0.06461637502070516, 294 | 0.0650263330026064, 295 | 0.06352854199940339, 296 | 0.06553766599972732, 297 | 0.06482400000095367, 298 | 0.06408287500380538, 299 | 0.06470920800347812, 300 | 0.06507024998427369, 301 | 0.05939904198748991, 302 | 0.05856808397220448, 303 | 0.05792379201739095, 304 | 0.05875624998589046, 305 | 0.05821258298237808, 306 | 0.05797862500185147, 307 | 0.0582444169849623, 308 | 0.05888770800083876, 309 | 0.057990708999568596, 310 | 0.058774624980287626, 311 | 0.05769141699420288, 312 | 0.06280350001179613, 313 | 0.05891133300610818, 314 | 0.05840287497267127, 315 | 0.057056749996263534, 316 | 0.057819000008748844, 317 | 0.056933666026452556, 318 | 0.055553833983140066, 319 | 0.05672291701193899, 320 | 0.05713329097488895, 321 | 0.05706799999461509, 322 | 0.05758858399349265, 323 | 
0.05769229101133533, 324 | 0.0572446660080459, 325 | 0.05748454199056141 326 | ] 327 | } -------------------------------------------------------------------------------- /benchmarks/results/toPandas_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "toPandas", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 8.862778999988222, 7 | 8.811171500012279, 8 | 8.938347457995405, 9 | 8.947406374994898, 10 | 8.88868116599042, 11 | 8.990357999980915, 12 | 8.73990975000197, 13 | 8.638437292014714, 14 | 9.160425375011982, 15 | 9.038150041975314, 16 | 8.591716666996945, 17 | 9.168473375000758, 18 | 8.798064542002976, 19 | 8.936836874985602, 20 | 8.671541833988158, 21 | 8.662482666986762, 22 | 8.708136500004912, 23 | 8.692952374985907, 24 | 8.592529084009584, 25 | 8.740214041987201, 26 | 9.146632749994751, 27 | 8.8302964589966, 28 | 9.15225395897869, 29 | 9.106577541009756, 30 | 8.817999457998667, 31 | 8.631971499999054, 32 | 8.868299333000323, 33 | 8.840884500008542, 34 | 8.621281041996554, 35 | 8.586707083013607, 36 | 8.629861416004132, 37 | 8.58383437502198, 38 | 8.67459566600155, 39 | 8.966120708006201, 40 | 9.302168708003592, 41 | 8.56661416697898, 42 | 8.576364625012502, 43 | 9.334656874998473, 44 | 8.738957708003, 45 | 8.569964958005585, 46 | 9.004718665994005, 47 | 8.58318062502076, 48 | 8.604225666000275, 49 | 8.54163133300608, 50 | 8.606262207991676, 51 | 8.530463000002783, 52 | 8.523315916012507, 53 | 8.498393665999174, 54 | 8.456541958003072, 55 | 8.534131916996557, 56 | 8.56562666699756, 57 | 9.39233074997901, 58 | 9.234180207975442, 59 | 8.49574904202018, 60 | 8.959661500004586, 61 | 8.539121125009842, 62 | 8.487174874986522, 63 | 8.591410583001561, 64 | 8.695382541976869, 65 | 8.435281415993813, 66 | 8.502639499987708, 67 | 8.930086000007577, 68 | 8.565875666012289, 69 | 8.536115042021265, 70 | 8.796861499984516, 71 | 8.54752500000177, 72 | 8.55864475000999, 73 | 8.484635584027274, 74 | 8.522846042003948, 75 | 8.59690987499198, 76 | 9.052915917010978, 77 | 8.575967915996443, 78 | 8.669178208016092, 79 | 8.714108874992235, 80 | 9.386535500001628, 81 | 8.646265166986268, 82 | 8.482657792017562, 83 | 8.864741375000449, 84 | 8.946433333010646, 85 | 8.905033792019822, 86 | 8.449145749997115, 87 | 8.460354208014905, 88 | 8.46207508299267, 89 | 8.486035125009948, 90 | 8.531593207997503, 91 | 8.815072375000454, 92 | 8.717701290996047, 93 | 8.582990959024755, 94 | 8.68444795900723, 95 | 8.602480875008041, 96 | 9.267432209016988, 97 | 9.377239374996861, 98 | 8.727259999985108, 99 | 8.711020249989815, 100 | 8.912161042011576, 101 | 8.738440042012371, 102 | 8.81438700002036, 103 | 8.762875250016805, 104 | 8.676942000020063, 105 | 8.701102665974759, 106 | 8.623225249990355, 107 | 8.937765667011263, 108 | 8.805998249998083, 109 | 9.867303541017463, 110 | 8.744416041998193, 111 | 8.637591749982676, 112 | 9.320447209000122, 113 | 9.01535137501196, 114 | 9.542240958980983, 115 | 8.659673166985158, 116 | 8.71328033300233, 117 | 8.699457167007495, 118 | 8.678966707986547, 119 | 8.621516874991357, 120 | 8.565221416996792, 121 | 8.622395917016547, 122 | 9.136514207988512, 123 | 8.58931437501451 124 | ] 125 | } -------------------------------------------------------------------------------- /benchmarks/results/toPandas_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "toPandas", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 
6 | 1.2885572499944828, 7 | 1.2571123329980765, 8 | 1.256014916987624, 9 | 1.2624951250036247, 10 | 1.2701949999900535, 11 | 1.27012212498812, 12 | 1.2953377089870628, 13 | 1.28163666598266, 14 | 1.2605684580048546, 15 | 1.2856867499940563, 16 | 1.2515628750261385, 17 | 1.2650770000182092, 18 | 1.2583320830017328, 19 | 1.2575984589930158, 20 | 1.2693923329934478, 21 | 1.256740249984432, 22 | 1.2621407080150675, 23 | 1.2525152499729302, 24 | 1.27661829101271, 25 | 1.258097041019937, 26 | 1.2528983329830226, 27 | 1.254257416992914, 28 | 1.2604653750022408, 29 | 1.2601165829983074, 30 | 1.2579469589982182, 31 | 1.258330750017194, 32 | 1.264852874999633, 33 | 1.2719748750096187, 34 | 1.2666882920020726, 35 | 1.2630691660160664, 36 | 1.2763231249991804, 37 | 1.260669625014998, 38 | 1.259572000009939, 39 | 1.2746881250059232, 40 | 1.2601215410104487, 41 | 1.2655172920203768, 42 | 1.2657782919995952, 43 | 1.2550521249941085, 44 | 1.2478350419842172, 45 | 1.2497141670028213, 46 | 1.2692700409970712, 47 | 1.2729937089898158, 48 | 1.2649799170030747, 49 | 1.2598057499853894, 50 | 1.2757172500132583, 51 | 1.2751642079965677, 52 | 1.2607573750137817, 53 | 1.258303791983053, 54 | 1.2484310830186587, 55 | 1.2535902920062654, 56 | 1.2511154170206282, 57 | 1.2532207920157816, 58 | 1.256906374997925, 59 | 1.2576233340078034, 60 | 1.2554536249954253, 61 | 1.264078125008382, 62 | 1.2596141249814536, 63 | 1.2676740419992711, 64 | 1.2511124169977847, 65 | 1.2469390420010313, 66 | 1.248246333008865, 67 | 1.2562179579981603, 68 | 1.259968582977308, 69 | 1.2633217920083553, 70 | 1.2496773750171997, 71 | 1.255484167020768, 72 | 1.250518374989042, 73 | 1.253819665987976, 74 | 1.2618275830172934, 75 | 1.2681392919912469, 76 | 1.2453019999957178, 77 | 1.260975375014823, 78 | 1.271160583011806, 79 | 1.2457151250273455, 80 | 1.2655820829968434, 81 | 1.2527838750102092, 82 | 1.2574351250077598, 83 | 1.2535599590046331, 84 | 1.2713026250130497, 85 | 1.2475648749968968, 86 | 1.2485032090044115, 87 | 1.2522275419905782, 88 | 1.2653647499973886, 89 | 1.2641535000002477, 90 | 1.2570255419996101, 91 | 1.2574704999860842, 92 | 1.2512661659857258, 93 | 1.2690267080033664, 94 | 1.2580981670180336, 95 | 1.2658240419987123, 96 | 1.2544514170149341, 97 | 1.2501862089848146, 98 | 1.2534734169894364, 99 | 1.2411465829936787, 100 | 1.2681619999930263, 101 | 1.2595267920114566, 102 | 1.2521268330165185, 103 | 1.2558963330229744, 104 | 1.2550300410075579, 105 | 1.2594273750146385, 106 | 1.2700898330076598, 107 | 1.2613907079794444, 108 | 1.2615968750033062, 109 | 1.256476125010522, 110 | 1.2549062500183936, 111 | 1.2425632910162676, 112 | 1.2587947080028243, 113 | 1.2492519579827785, 114 | 1.2572470830054954, 115 | 1.257936542009702, 116 | 1.269242457987275, 117 | 1.2409304580069147, 118 | 1.2584901249792892, 119 | 1.251469167007599, 120 | 1.2582818329974543, 121 | 1.2683968750061467, 122 | 1.250196707987925, 123 | 1.2636364579957444, 124 | 1.2618374169978779, 125 | 1.2372403330227826, 126 | 1.2556332079984713, 127 | 1.273715458024526, 128 | 1.2490043340076227, 129 | 1.2452241249848157, 130 | 1.2518945840129163, 131 | 1.2659609169932082, 132 | 1.2602919999917503, 133 | 1.2622803749982268, 134 | 1.2564580829930492, 135 | 1.2583414999826346, 136 | 1.255592000001343, 137 | 1.2519122080120724, 138 | 1.2525597079948056, 139 | 1.2449430830019992, 140 | 1.258767541992711, 141 | 1.2542946659959853, 142 | 1.2580878750013653, 143 | 1.2641330419864971, 144 | 1.2617088750121184, 145 | 1.2378346659825183, 146 | 1.2531650409800932, 147 | 
1.2640607500215992, 148 | 1.2595061250030994, 149 | 1.2388757499866188, 150 | 1.2576246660028119, 151 | 1.2584112079930492, 152 | 1.2468822920054663, 153 | 1.24673616598011, 154 | 1.254638749989681, 155 | 1.2438010000041686, 156 | 1.2463356249791104, 157 | 1.2494282499828842, 158 | 1.2595032919780351, 159 | 1.24727687498671, 160 | 1.2564306669810321, 161 | 1.2541845410014503, 162 | 1.2410721249761991, 163 | 1.2458839580067433, 164 | 1.2591591250093188, 165 | 1.2470217079971917, 166 | 1.254125416977331, 167 | 1.2584732499963138, 168 | 1.2589741249976214, 169 | 1.2620728749898262, 170 | 1.2665299999935087, 171 | 1.261897999997018, 172 | 1.2440591669874266, 173 | 1.2591024589783046, 174 | 1.2497527500090655, 175 | 1.2537597499904223, 176 | 1.250720125011867, 177 | 1.2480132080090698, 178 | 1.235797332978109, 179 | 1.2646380409714766, 180 | 1.2634682499920018, 181 | 1.242793207988143, 182 | 1.2498649170156568, 183 | 1.2549589590053074, 184 | 1.2531464160128962, 185 | 1.245336749998387, 186 | 1.2464906670211349, 187 | 1.2613908749772236, 188 | 1.2601894999970682, 189 | 1.2590537079959176, 190 | 1.248518416978186, 191 | 1.2670163750008214, 192 | 1.2589331250055693, 193 | 1.2547621669946238, 194 | 1.2601012089871801, 195 | 1.2606227079813834, 196 | 1.2661379160126671, 197 | 1.2611060409981292, 198 | 1.2527715420001186, 199 | 1.2650721249810886, 200 | 1.254195499990601, 201 | 1.2650972080009524, 202 | 1.2625275420141406, 203 | 1.2661464999837335, 204 | 1.2615302499907557, 205 | 1.2513069160049781, 206 | 1.2482542910147458, 207 | 1.2654491249995772, 208 | 1.2621469580044504, 209 | 1.2478159999882337, 210 | 1.255306584003847, 211 | 1.2464843330089934, 212 | 1.3007019170036074, 213 | 1.266680291009834, 214 | 1.2509966670186259, 215 | 1.2605993750039488, 216 | 1.250382541998988, 217 | 1.3018390409997664, 218 | 1.2543151670251973, 219 | 1.2627636669785716, 220 | 1.2599722500017378, 221 | 1.2687087090162095, 222 | 1.2725013339950237, 223 | 1.2627896670019254, 224 | 1.2572203340241686, 225 | 1.2694404170033522, 226 | 1.2506123329803813, 227 | 1.2539891250198707, 228 | 1.2666916250018403, 229 | 1.2588019579998218, 230 | 1.2651899160118774, 231 | 1.2561217090114951, 232 | 1.2594041250122245, 233 | 1.2495557079964783, 234 | 1.271390917012468, 235 | 1.24894195899833, 236 | 1.2602929170243442, 237 | 1.2597828749858309, 238 | 1.2529933750047348, 239 | 1.2619111250096466, 240 | 1.2601427090121433, 241 | 1.254465791018447, 242 | 1.268157540995162, 243 | 1.2684888750081882, 244 | 1.2474725829961244, 245 | 1.259887416003039, 246 | 1.2599989169975743, 247 | 1.2568535419995897, 248 | 1.2597891249752138, 249 | 1.2698568330088165, 250 | 1.2522419590095524, 251 | 1.270519375015283, 252 | 1.2659674169844948, 253 | 1.2543618329800665, 254 | 1.2571934580046218, 255 | 1.2723624169884715, 256 | 1.259315791015979, 257 | 1.2684716249932535, 258 | 1.2472508329956327, 259 | 1.2557019999949262, 260 | 1.261543167027412, 261 | 1.2660564169927966, 262 | 1.2707085419970099, 263 | 1.2638022909814026, 264 | 1.2595980420010164, 265 | 1.2619648329855409, 266 | 1.2532179999980144, 267 | 1.2595204589888453, 268 | 1.264111166994553, 269 | 1.2625636250013486, 270 | 1.2478563750046305, 271 | 1.26558091698098, 272 | 1.2610336250218097, 273 | 1.2406471249996684, 274 | 1.2575986250012647, 275 | 1.2457927499781363, 276 | 1.2491438330034725, 277 | 1.2553590000024997, 278 | 1.2700152920151595, 279 | 1.2501201249833684, 280 | 1.2646489159960765, 281 | 1.2433127920085099, 282 | 1.2392549159994815, 283 | 1.2527907500043511, 284 | 1.2585749159916304, 
285 | 1.2368915829865728, 286 | 1.2542310419958085 287 | ] 288 | } -------------------------------------------------------------------------------- /benchmarks/visualize_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | from datetime import datetime as dt 17 | from pathlib import Path 18 | 19 | import pandas as pd 20 | import plotly.express as px 21 | import pyspark.sql.functions as F # noqa: N812 22 | import pytz 23 | from pyspark.sql import SparkSession 24 | 25 | 26 | def parse_results(spark: SparkSession) -> tuple[pd.DataFrame, pd.DataFrame, str]: 27 | """Parse benchmark results into a Pandas DataFrame.""" 28 | result_df = ( 29 | spark.read.json("benchmarks/results/*.json", multiLine=True) 30 | .select( 31 | "test_name", 32 | "dataset", 33 | "dataset_size", 34 | F.explode("runtimes").alias("runtime"), 35 | ) 36 | .withColumnRenamed("dataset", "dataset_name") 37 | .withColumn( 38 | "dataset_size_formatted", 39 | F.concat(F.lit("n="), F.format_number(F.col("dataset_size"), 0)), 40 | ) 41 | .withColumn( 42 | "dataset", 43 | F.concat( 44 | F.col("dataset_name"), 45 | F.lit(" ("), 46 | F.col("dataset_size_formatted"), 47 | F.lit(")"), 48 | ), 49 | ) 50 | .toPandas() 51 | ) 52 | 53 | if not isinstance(result_df, pd.DataFrame): 54 | raise TypeError 55 | 56 | result_df["dataset_name"] = pd.Categorical( 57 | result_df["dataset_name"], 58 | ["xsmall", "small", "medium", "large"], 59 | ) 60 | 61 | average_df = ( 62 | result_df[["test_name", "dataset_size", "runtime"]] 63 | .groupby(["test_name", "dataset_size"], observed=False) 64 | .mean() 65 | .reset_index() 66 | ) 67 | 68 | benchmark_date = get_benchmark_date(benchmark_path="benchmarks/results/") 69 | return result_df, average_df, benchmark_date 70 | 71 | 72 | def save_boxplot(df: pd.DataFrame, benchmark_date: str) -> None: 73 | """Displays faceted boxplot of benchmark results.""" 74 | machine_config = "Python 3.12.0, Spark 3.5, Pandas 2.1.3, M1 Macbook Pro 32GB RAM" 75 | subtitle = f"{benchmark_date} | {machine_config}" 76 | 77 | fig = px.box( 78 | df, 79 | x="dataset_size_formatted", 80 | y="runtime", 81 | color="test_name", 82 | facet_col="dataset_name", 83 | points="all", 84 | title=f"Column to List Benchmark Results
<br><sup>{subtitle}</sup>
", 85 | labels={"runtime": "Runtime (seconds)"}, 86 | category_orders={ 87 | "dataset_name": ["xsmall", "small", "medium", "large"], 88 | "test_name": [ 89 | "localIterator", 90 | "collectlist", 91 | "map", 92 | "flatmap", 93 | "toPandas", 94 | ], 95 | }, 96 | color_discrete_map={ 97 | "collectlist": "#636EFA", 98 | "localIterator": "#EF553B", 99 | "toPandas": "#00CC96", 100 | "map": "#AB63FA", 101 | "flatmap": "#FFA15A", 102 | }, 103 | ) 104 | fig.update_yaxes(matches=None) 105 | fig.update_yaxes({"tickfont": {"size": 9}}) 106 | fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True)) 107 | fig.update_xaxes(matches=None, title=None) 108 | fig.update_layout(legend_title_text="") 109 | 110 | fig.write_image( 111 | "benchmarks/images/column_to_list_boxplot.svg", 112 | width=1000, 113 | height=700, 114 | ) 115 | 116 | 117 | def save_line_plot(df: pd.DataFrame, benchmark_date: str) -> None: 118 | """Displays line plot of average benchmark results.""" 119 | machine_config = "Python 3.12.0, Spark 3.5, Pandas 2.1.3, M1 Macbook Pro 32GB RAM" 120 | subtitle = f"{benchmark_date} | {machine_config}" 121 | fig = px.line( 122 | df, 123 | x="dataset_size", 124 | y="runtime", 125 | log_x=True, 126 | color="test_name", 127 | title=f"Column to List Benchmark Results
<br><sup>{subtitle}</sup>
", 128 | labels={"runtime": "Runtime (seconds)", "dataset_size": "Number of Rows"}, 129 | category_orders={ 130 | "test_name": [ 131 | "localIterator", 132 | "collectlist", 133 | "map", 134 | "flatmap", 135 | "toPandas", 136 | ], 137 | }, 138 | color_discrete_map={ 139 | "collectlist": "#636EFA", 140 | "localIterator": "#EF553B", 141 | "toPandas": "#00CC96", 142 | "map": "#AB63FA", 143 | "flatmap": "#FFA15A", 144 | }, 145 | ) 146 | fig.update_traces(mode="markers+lines") 147 | fig.update_traces(marker={"size": 12}) 148 | fig.update_layout(legend_title_text="") 149 | 150 | fig.write_image( 151 | "benchmarks/images/column_to_list_line_plot.svg", 152 | width=900, 153 | height=450, 154 | ) 155 | 156 | 157 | def get_benchmark_date(benchmark_path: str) -> str: 158 | """Returns the date of the benchmark results.""" 159 | path = Path(benchmark_path) 160 | benchmark_ts = path.stat().st_mtime 161 | return dt.fromtimestamp( 162 | benchmark_ts, 163 | tz=pytz.timezone("US/Eastern"), 164 | ).strftime("%Y-%m-%d") 165 | 166 | 167 | if __name__ == "__main__": 168 | spark = ( 169 | SparkSession.builder.appName("MyApp") # type: ignore # noqa: PGH003 170 | .config("spark.executor.memory", "10G") 171 | .config("spark.driver.memory", "25G") 172 | .config("spark.sql.shuffle.partitions", "2") 173 | .getOrCreate() 174 | ) 175 | 176 | result_df, average_df, benchmark_date = parse_results(spark) 177 | save_boxplot(result_df, benchmark_date) 178 | save_line_plot(average_df, benchmark_date) 179 | -------------------------------------------------------------------------------- /docs/examples/index.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Example Quinn code snippets 4 | 5 | - [Schema as Code](../notebooks/schema_as_code.ipynb) -------------------------------------------------------------------------------- /docs/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | """Generate the code reference pages and navigation. 
15 | 16 | Script was taken from 17 | https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages 18 | """ 19 | 20 | from pathlib import Path 21 | 22 | import mkdocs_gen_files 23 | 24 | nav = mkdocs_gen_files.Nav() 25 | 26 | for path in sorted(Path(".").rglob("quinn/**/*.py")): 27 | module_path = path.relative_to(".").with_suffix("") 28 | doc_path = path.relative_to(".").with_suffix(".md") 29 | full_doc_path = Path("reference", doc_path) 30 | 31 | parts = tuple(module_path.parts) 32 | 33 | if parts[-1] == "__init__": 34 | parts = parts[:-1] 35 | doc_path = doc_path.with_name("index.md") 36 | full_doc_path = full_doc_path.with_name("index.md") 37 | elif parts[-1] == "__main__": 38 | continue 39 | 40 | nav[parts] = doc_path.as_posix() # 41 | 42 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 43 | ident = ".".join(parts) 44 | fd.write(f"::: {ident}") 45 | 46 | mkdocs_gen_files.set_edit_path(full_doc_path, path) 47 | 48 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: 49 | nav_file.writelines(nav.build_literate_nav()) 50 | -------------------------------------------------------------------------------- /docs/images/quinn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/docs/images/quinn.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Quinn 2 | 3 | ![quinn logo](images/quinn.png) 4 | 5 | Quinn contains PySpark helper methods that will make you more productive. 6 | 7 | Quinn is also a great way to learn about PySpark best practices like how to organize and unit test your code. 8 | 9 | ## Contributing 10 | 11 | We have a solid group of maintainers, hold contributor meetings regularly, and eagerly accept contributions from other members. 12 | 13 | We want to help the world write beautiful PySpark and give everyone a wonderful developer experience. 14 | 15 | ### Code Style 16 | 17 | We follow the [PySpark code-style](https://github.com/MrPowers/spark-style-guide/blob/main/PYSPARK_STYLE_GUIDE.md) guide and use `sphinx` as the docstring format. For more details about the `sphinx` format, see [this tutorial](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html). A short example of a `sphinx`-formatted docstring is shown below: 18 | 19 | ```python 20 | """[Summary] 21 | 22 | :param [ParamName]: [ParamDescription], defaults to [DefaultParamVal] 23 | :type [ParamName]: [ParamType](, optional) 24 | ... 25 | :raises [ErrorType]: [ErrorDescription] 26 | ... 27 | :return: [ReturnDescription] 28 | :rtype: [ReturnType] 29 | """ 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/learn_more/column_to_list.md: -------------------------------------------------------------------------------- 1 | # Column to list performance 2 | 3 | In PySpark, there are many approaches to accomplish the same task. Given a test DataFrame containing two columns - mvv and count, here are five methods to produce an identical list of mvv values using base PySpark functionality. 
4 | 5 | --- 6 | 7 | ## Setup 8 | 9 | ```python 10 | import pyspark.sql.functions as F 11 | from pyspark.sql import SparkSession 12 | ``` 13 | 14 | ```python 15 | spark = SparkSession.builder.getOrCreate() 16 | vals = [(0, 5), (1, 10), (2, 4), (3, 2), (4, 1)] 17 | df = spark.createDataFrame(vals, schema="mvv int, count int") 18 | ``` 19 | 20 | --- 21 | 22 | ## Approaches 23 | 24 | ### 1. toPandas() 25 | 26 | ```python 27 | list(df.select("mvv").toPandas()["mvv"]) 28 | # [0, 1, 2, 3, 4] 29 | ``` 30 | 31 | ### 2. flatMap 32 | 33 | ```python 34 | df.select("mvv").rdd.flatMap(lambda x: x).collect() 35 | # [0, 1, 2, 3, 4] 36 | ``` 37 | 38 | ### 3. map 39 | 40 | ```python 41 | df.select("mvv").rdd.map(lambda row: row[0]).collect() 42 | # [0, 1, 2, 3, 4] 43 | ``` 44 | 45 | ### 4. collect list comprehension 46 | 47 | ```python 48 | [row[0] for row in df.select("mvv").collect()] 49 | # [0, 1, 2, 3, 4] 50 | ``` 51 | 52 | ### 5. toLocalIterator() list comprehension 53 | 54 | ```python 55 | [row[0] for row in df.select("mvv").toLocalIterator()] 56 | # [0, 1, 2, 3, 4] 57 | ``` 58 | 59 | --- 60 | 61 | ## Benchmark Results 62 | 63 | Substantial runtime differences were observed on the medium and large datasets: 64 | 65 | ![box plot](../images/column_to_list_boxplot.svg) 66 | 67 | ![line plot](../images/column_to_list_line_plot.svg) 68 | 69 | All approaches perform similarly at 1K and 100K rows. `toPandas()` is consistently the fastest method across the tested dataset sizes, and exhibits the least variance in runtime. However, `pyarrow` and `pandas` are not required dependencies of Quinn, so this method only works when those packages are available. For typical Spark workloads, the `flatMap` approach is the next best option to use by default. 70 | 71 | --- 72 | 73 | ## Quinn Implementation 74 | 75 | [:material-api: `quinn.column_to_list`](https://mrpowers.github.io/quinn/reference/quinn/dataframe_helpers) 76 | 77 | To address these performance results, we updated `quinn.column_to_list()` to check the runtime environment and use the fastest method. If `pandas` and `pyarrow` are available, `toPandas()` is used. Otherwise, `flatMap` is used. 78 | 79 | --- 80 | 81 | ## More Information 82 | 83 | ### Datasets 84 | 85 | Four datasets were used for this benchmark. Each dataset contains two columns - mvv and count. The mvv column is a monotonically increasing integer and the count column is a random integer between 1 and 10. The datasets were created using the `create_benchmark_df.py` script in `quinn/benchmarks`. 86 | 87 | | Dataset name | Number of rows | Number of files | Size on disk (mb) | 88 | | ------------ | -------------- | --------------- | ----------------- | 89 | | mvv_xsmall | 1,000 | 1 | 0.005 | 90 | | mvv_small | 100,000 | 1 | 0.45 | 91 | | mvv_medium | 10,000,000 | 1 | 45 | 92 | | mvv_large | 100,000,000 | 4 | 566 | 93 | 94 | --- 95 | 96 | ### Validation 97 | 98 | The code and results from this test are available in the `/benchmarks` directory of Quinn. To run this benchmark yourself: 99 | 100 | #### 1. Install the required dependencies 101 | 102 | ```bash 103 | poetry install --with docs 104 | ``` 105 | 106 | #### 2. Create the datasets 107 | 108 | ```bash 109 | poetry run python benchmarks/create_benchmark_df.py 110 | ``` 111 | 112 | #### 3. Run the benchmark 113 | 114 | ```bash 115 | poetry run python benchmarks/benchmark_column_performance.py 116 | ``` 117 | 118 | Results will be stored in the `benchmarks/results` directory. 
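Each results file uses the layout shown in `benchmarks/results` above: a `test_name`, a `dataset`, a `dataset_size`, and an array of `runtimes`. For a quick look at a single result without running the full visualization script, a minimal sketch (the file name here is just an example):

```python
import json
from pathlib import Path
from statistics import mean

# Load one benchmark result file and summarize its runtimes.
result = json.loads(Path("benchmarks/results/toPandas_large.json").read_text())
print(result["test_name"], result["dataset"], f"n={result['dataset_size']:,}")
print(f"mean runtime: {mean(result['runtimes']):.3f}s over {len(result['runtimes'])} runs")
```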
119 | By default each implementation will run for the following durations: 120 | 121 | | Dataset name | Duration (seconds) | 122 | | ------------ | ------------------ | 123 | | mvv_xsmall | 20 | 124 | | mvv_small | 20 | 125 | | mvv_medium | 360 | 126 | | mvv_large | 1200 | 127 | 128 | These can be adjusted in `benchmarks/benchmark_column_performance.py` if a shorter or longer duration is desired. 129 | 130 | #### 4. Visualize the results 131 | 132 | ```bash 133 | poetry run python benchmarks/visualize_benchmarks.py 134 | ``` 135 | 136 | The `.svg` files will be saved in the `benchmarks/images` directory. 137 | -------------------------------------------------------------------------------- /docs/learn_more/index.md: -------------------------------------------------------------------------------- 1 | # Learn More 2 | 3 | Deeper explanations of design decisions and use cases for Quinn 4 | 5 | - [Convert PySpark DataFrame Columns to a Python List](column_to_list.md) 6 | -------------------------------------------------------------------------------- /docs/notebooks/schema_as_code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "57a1c914-7244-4759-8abc-9e27060eef7f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Print SCHEMA as code\n", 9 | "\n", 10 | "A function that takes a `pyspark.sql.types.StructType` and prints valid `Python` code." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "2505c5c1-15cc-47ea-b71d-d53472ae67ae", 17 | "metadata": { 18 | "tags": [] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "from quinn import print_schema_as_code" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "04338b8d-f604-4b59-9904-afa2fa7c4e4d", 29 | "metadata": { 30 | "tags": [] 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from pyspark.sql import types as T" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "id": "68a70047-e805-4be8-be52-08b168a0363b", 41 | "metadata": { 42 | "tags": [] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "schema = T.StructType(\n", 47 | " [\n", 48 | " T.StructField(\"string_field\", T.StringType()),\n", 49 | " T.StructField(\"decimal_38_10_field\", T.DecimalType(38, 10)),\n", 50 | " T.StructField(\"decimal_10_2_field\", T.DecimalType(10, 2)),\n", 51 | " T.StructField(\"array_of_double\", T.ArrayType(elementType=T.DoubleType())),\n", 52 | " T.StructField(\"map_type\", T.MapType(keyType=T.StringType(), valueType=T.ShortType())),\n", 53 | " T.StructField(\"struct_type\", T.StructType([T.StructField(\"t1\", T.StringType()), T.StructField(\"t2\", T.BooleanType())])),\n", 54 | " ]\n", 55 | ")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "id": "61a487be-765a-46bf-881c-cc08b292e951", 62 | "metadata": { 63 | "tags": [] 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "StructType(\n", 71 | "\tfields=[\n", 72 | "\t\tStructField(\"string_field\", StringType(), True),\n", 73 | "\t\tStructField(\"decimal_38_10_field\", DecimalType(38, 10), True),\n", 74 | "\t\tStructField(\"decimal_10_2_field\", DecimalType(10, 2), True),\n", 75 | "\t\tStructField(\n", 76 | "\t\t\t\"array_of_double\",\n", 77 | "\t\t\tArrayType(DoubleType()),\n", 78 | "\t\t\tTrue,\n", 79 | "\t\t),\n", 80 | "\t\tStructField(\n", 81 | "\t\t\t\"map_type\",\n", 82 | "\t\t\tMapType(\n", 83 | "\t\t\t\tStringType(),\n", 84 | 
"\t\t\t\tShortType(),\n", 85 | "\t\t\t\tTrue,\n", 86 | "\t\t\t),\n", 87 | "\t\t\tTrue,\n", 88 | "\t\t),\n", 89 | "\t\tStructField(\n", 90 | "\t\t\t\"struct_type\",\n", 91 | "\t\t\tStructType(\n", 92 | "\t\t\t\tfields=[\n", 93 | "\t\t\t\t\tStructField(\"t1\", StringType(), True),\n", 94 | "\t\t\t\t\tStructField(\"t2\", BooleanType(), True),\n", 95 | "\t\t\t\t]\n", 96 | "\t\t\t),\n", 97 | "\t\t\tTrue,\n", 98 | "\t\t),\n", 99 | "\t]\n", 100 | ")\n" 101 | ] 102 | }, 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "StructType([StructField('string_field', StringType(), True), StructField('decimal_38_10_field', DecimalType(38,10), True), StructField('decimal_10_2_field', DecimalType(10,2), True), StructField('array_of_double', ArrayType(DoubleType(), True), True), StructField('map_type', MapType(StringType(), ShortType(), True), True), StructField('struct_type', StructType([StructField('t1', StringType(), True), StructField('t2', BooleanType(), True)]), True)])" 107 | ] 108 | }, 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "print(print_schema_as_code(schema))\n", 116 | "\n", 117 | "# Create a dictionary of PySpark SQL types to provide context to 'eval()' \n", 118 | "spark_type_dict = {k: getattr(T, k) for k in dir(T) if isinstance(getattr(T, k), type)}\n", 119 | "eval(print_schema_as_code(schema), {\"__builtins__\": None}, spark_type_dict)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "6fb30b81", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3 (ipykernel)", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.10.12" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 5 152 | } 153 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | ## Quinn Helper Functions 2 | 3 | ```python 4 | import quinn 5 | ``` 6 | 7 | ### DataFrame Validations 8 | 9 | **validate_presence_of_columns()** 10 | 11 | ```python 12 | quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"]) 13 | ``` 14 | 15 | Raises an exception unless `source_df` contains the `name`, `age`, and `fun` columns. 16 | 17 | **validate_schema()** 18 | 19 | ```python 20 | quinn.validate_schema(source_df, required_schema) 21 | ``` 22 | 23 | Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`. 24 | 25 | **validate_absence_of_columns()** 26 | 27 | ```python 28 | quinn.validate_absence_of_columns(source_df, ["age", "cool"]) 29 | ``` 30 | 31 | Raises an exception if `source_df` contains `age` or `cool` columns. 32 | 33 | ### Functions 34 | 35 | **single_space()** 36 | 37 | ```python 38 | actual_df = source_df.withColumn( 39 | "words_single_spaced", 40 | quinn.single_space(col("words")) 41 | ) 42 | ``` 43 | 44 | Replaces all multispaces with single spaces (e.g. changes `"this   has     some"` to `"this has some"`). 
45 | 
46 | **remove_all_whitespace()**
47 | 
48 | ```python
49 | actual_df = source_df.withColumn(
50 |     "words_without_whitespace",
51 |     quinn.remove_all_whitespace(col("words"))
52 | )
53 | ```
54 | 
55 | Removes all whitespace in a string (e.g. changes `"this has some"` to `"thishassome"`).
56 | 
57 | **anti_trim()**
58 | 
59 | ```python
60 | actual_df = source_df.withColumn(
61 |     "words_anti_trimmed",
62 |     quinn.anti_trim(col("words"))
63 | )
64 | ```
65 | 
66 | Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes `" this has some "` to `" thishassome "`).
67 | 
68 | **remove_non_word_characters()**
69 | 
70 | ```python
71 | actual_df = source_df.withColumn(
72 |     "words_without_nonword_chars",
73 |     quinn.remove_non_word_characters(col("words"))
74 | )
75 | ```
76 | 
77 | Removes all non-word characters from a string (e.g. changes `"si%$#@!#$!@#mpsons"` to `"simpsons"`).
78 | 
79 | **multi_equals()**
80 | 
81 | ```python
82 | source_df.withColumn(
83 |     "are_s1_and_s2_cat",
84 |     quinn.multi_equals("cat")(col("s1"), col("s2"))
85 | )
86 | ```
87 | 
88 | `multi_equals` returns true if `s1` and `s2` are both equal to `"cat"`.
89 | 
90 | **approx_equal()**
91 | 
92 | This function takes three arguments: two PySpark Columns and a float threshold. It returns a Boolean column indicating whether the values in the two columns differ by less than the threshold.
93 | 
94 | ```
95 | let the columns be
96 | col1 = [1.2, 2.5, 3.1, 4.0, 5.5]
97 | col2 = [1.3, 2.3, 3.0, 3.9, 5.6]
98 | threshold = 0.2
99 | 
100 | result = approx_equal(col("col1"), col("col2"), threshold)
101 | result.show()
102 | 
103 | +-----+
104 | |value|
105 | +-----+
106 | | true|
107 | |false|
108 | | true|
109 | | true|
110 | | true|
111 | +-----+
112 | ```
113 | 
114 | **array_choice()**
115 | 
116 | This function takes a Column as a parameter and returns a PySpark Column containing a random value drawn from the input Column.
117 | 
118 | ```
119 | df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["values"])
120 | result = df.select(array_choice(col("values")))
121 | 
122 | The output is:
123 | +--------------+
124 | |array_choice()|
125 | +--------------+
126 | |             2|
127 | +--------------+
128 | 
129 | ```
130 | 
131 | **regexp_extract_all()**
132 | 
133 | The regexp_extract_all function takes two parameters: a string `s` and a regular expression `regexp`. It returns all the substrings of `s` that match the regular expression.
134 | 
135 | ```
136 | print(regexp_extract_all("this is a example text message for testing application", r"\b\w*a\w*\b"))
137 | 
138 | The output is:
139 | ['a', 'example', 'message', 'application']
140 | 
141 | ```
142 | 
143 | Here the pattern `r"\b\w*a\w*\b"` matches words containing the letter `a`.
144 | 
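Since `regexp_extract_all` here operates on plain Python strings, it behaves like the standard library's `re.findall`; a rough equivalent sketch (the function name is illustrative, not part of quinn):

```python
import re

def regexp_extract_all_sketch(s: str, regexp: str) -> list:
    """Return every non-overlapping match of `regexp` in `s` (illustrative stand-in)."""
    return re.findall(regexp, s)

print(regexp_extract_all_sketch("this is a example text message for testing application", r"\b\w*a\w*\b"))
# ['a', 'example', 'message', 'application']
```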
145 | **week_start_date()**
146 | 
147 | It takes two parameters: a column and week_start_day. It returns a Spark DataFrame column containing the start date of the week. By default, week_start_day is set to "Sun".
148 | 
149 | For input `["2023-03-05", "2023-03-07", "2023-03-08"]` the output is:
150 | 
151 | ```
152 | result = df.select("date", week_start_date(col("date"), "Sun"))
153 | result.show()
154 | +----------+---------------+
155 | |      date|week_start_date|
156 | +----------+---------------+
157 | |2023-03-05|     2023-03-05|
158 | |2023-03-07|     2023-03-05|
159 | |2023-03-08|     2023-03-05|
160 | +----------+---------------+
161 | ```
162 | 
163 | **week_end_date()**
164 | 
165 | It also takes two parameters: a column and week_end_day. It returns a DataFrame column containing the end date of the week. By default, week_end_day is set to "Sat".
166 | 
167 | ```
168 | +----------+-------------+
169 | |      date|week_end_date|
170 | +----------+-------------+
171 | |2023-03-05|   2023-03-05|
172 | |2023-03-07|   2023-03-12|
173 | |2023-03-08|   2023-03-12|
174 | +----------+-------------+
175 | 
176 | ```
177 | 
178 | **uuid5()**
179 | 
180 | This function generates a UUIDv5 string from the passed column, an optional namespace, and an optional extra salt.
181 | By default, the namespace is the NAMESPACE_DNS UUID and no extra string is used to reduce hash collisions.
182 | 
183 | ```
184 | 
185 | df = spark.createDataFrame([("lorem",), ("ipsum",)], ["values"])
186 | result = df.select(quinn.uuid5(F.col("values")).alias("uuid5"))
187 | result.show(truncate=False)
188 | 
189 | The output is:
190 | +------------------------------------+
191 | |uuid5                               |
192 | +------------------------------------+
193 | |35482fda-c10a-5076-8da2-dc7bf22d6be4|
194 | |51b79c1d-d06c-5b30-a5c6-1fadcd3b2103|
195 | +------------------------------------+
196 | 
197 | ```
198 | 
199 | ### Transformations
200 | 
201 | **snake_case_col_names()**
202 | 
203 | ```python
204 | quinn.snake_case_col_names(source_df)
205 | ```
206 | 
207 | Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased.
208 | 
209 | **sort_columns()**
210 | 
211 | ```python
212 | quinn.sort_columns(df=source_df, sort_order="asc", sort_nested=True)
213 | ```
214 | 
215 | Sorts the DataFrame columns in alphabetical order, including nested columns if sort_nested is set to True. Wide DataFrames are easier to navigate when they're sorted alphabetically.
216 | 
217 | ### DataFrame Helpers
218 | 
219 | **column_to_list()**
220 | 
221 | ```python
222 | quinn.column_to_list(source_df, "name")
223 | ```
224 | 
225 | Converts a column in a DataFrame to a list of values.
226 | 
227 | **two_columns_to_dictionary()**
228 | 
229 | ```python
230 | quinn.two_columns_to_dictionary(source_df, "name", "age")
231 | ```
232 | 
233 | Converts two columns of a DataFrame into a dictionary. In this example, `name` is the key and `age` is the value.
234 | 
235 | **to_list_of_dictionaries()**
236 | 
237 | ```python
238 | quinn.to_list_of_dictionaries(source_df)
239 | ```
240 | 
241 | Converts an entire DataFrame into a list of dictionaries.
242 | 
243 | **show_output_to_df()**
244 | 
245 | ```python
246 | quinn.show_output_to_df(output_str, spark)
247 | ```
248 | 
249 | Parses a spark DataFrame output string into a spark DataFrame. Useful for quickly pulling data from a log into a DataFrame. In this example, output_str is a string of the form:
250 | 
251 | ```
252 | +----+---+-----------+------+
253 | |name|age|     stuff1|stuff2|
254 | +----+---+-----------+------+
255 | |jose|  1|nice person|  yoyo|
256 | |  li|  2|nice person|  yoyo|
257 | | liz|  3|nice person|  yoyo|
258 | +----+---+-----------+------+
259 | ```
260 | 
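A minimal round-trip sketch (illustrative; assumes an active `SparkSession` named `spark`). Note that all parsed values come back as strings:

```python
import quinn

output_str = """+----+---+
|name|age|
+----+---+
|jose|  1|
|  li|  2|
+----+---+"""

df = quinn.show_output_to_df(output_str, spark)
df.show()
```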
261 | ### Schema Helpers
262 | 
263 | **schema_from_csv()**
264 | 
265 | ```python
266 | quinn.schema_from_csv(spark, "schema.csv")
267 | ```
268 | 
269 | Converts a CSV file into a PySpark schema (aka `StructType`). The CSV must contain the column name and type. The nullable and metadata columns are optional.
270 | 
271 | Here's an example CSV file:
272 | 
273 | ```
274 | name,type
275 | person,string
276 | address,string
277 | phoneNumber,string
278 | age,int
279 | ```
280 | 
281 | Here's how to convert that CSV file to a PySpark schema:
282 | 
283 | ```python
284 | schema = schema_from_csv(spark, "some_file.csv")
285 | 
286 | StructType([
287 |     StructField("person", StringType(), True),
288 |     StructField("address", StringType(), True),
289 |     StructField("phoneNumber", StringType(), True),
290 |     StructField("age", IntegerType(), True),
291 | ])
292 | ```
293 | 
294 | Here's a more complex CSV file:
295 | 
296 | ```
297 | name,type,nullable,metadata
298 | person,string,false,{"description":"The person's name"}
299 | address,string
300 | phoneNumber,string,TRUE,{"description":"The person's phone number"}
301 | age,int,False
302 | ```
303 | 
304 | Here's how to read this CSV file into a PySpark schema:
305 | 
306 | ```python
307 | another_schema = schema_from_csv(spark, "some_file.csv")
308 | 
309 | StructType([
310 |     StructField("person", StringType(), False, {"description": "The person's name"}),
311 |     StructField("address", StringType(), True),
312 |     StructField("phoneNumber", StringType(), True, {"description": "The person's phone number"}),
313 |     StructField("age", IntegerType(), False),
314 | ])
315 | ```
316 | 
317 | **print_schema_as_code()**
318 | 
319 | ```python
320 | fields = [
321 |     StructField("simple_int", IntegerType()),
322 |     StructField("decimal_with_nums", DecimalType(19, 8)),
323 |     StructField("array", ArrayType(FloatType()))
324 | ]
325 | schema = StructType(fields)
326 | printable_schema: str = quinn.print_schema_as_code(schema)
327 | ```
328 | 
329 | Converts a Spark `DataType` to a string of Python code that can be evaluated as code using eval(). If the `DataType` is a `StructType`, this can be used to print an existing schema in a format that can be copy-pasted into a Python script, logged to a file, etc.
330 | 
331 | For example:
332 | ```python
333 | print(printable_schema)
334 | ```
335 | 
336 | ```
337 | StructType(
338 | 	fields=[
339 | 		StructField("simple_int", IntegerType(), True),
340 | 		StructField("decimal_with_nums", DecimalType(19, 8), True),
341 | 		StructField(
342 | 			"array",
343 | 			ArrayType(FloatType()),
344 | 			True,
345 | 		),
346 | 	]
347 | )
348 | ```
349 | 
350 | Once evaluated, the printable schema is a valid schema that can be used in dataframe creation, validation, etc.
351 | 
352 | ```python
353 | from chispa.schema_comparer import assert_basic_schema_equality
354 | 
355 | parsed_schema = eval(printable_schema)
356 | assert_basic_schema_equality(parsed_schema, schema) # passes
357 | ```
358 | 
359 | 
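Since `eval()` executes arbitrary code, a more defensive option (a sketch mirroring the schema-as-code notebook) is to evaluate the string with restricted globals:

```python
from pyspark.sql import types as T

# Only expose the PySpark type classes to eval(), and disable builtins
spark_type_dict = {k: getattr(T, k) for k in dir(T) if isinstance(getattr(T, k), type)}
parsed_schema = eval(printable_schema, {"__builtins__": None}, spark_type_dict)
```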
360 | `print_schema_as_code()` can also be used to print other `DataType` objects.
361 | 
362 | `ArrayType`
363 | ```python
364 | array_type = ArrayType(FloatType())
365 | printable_type: str = quinn.print_schema_as_code(array_type)
366 | print(printable_type)
367 | ```
368 | 
369 | ```
370 | ArrayType(FloatType())
371 | ```
372 | 
373 | `MapType`
374 | ```python
375 | map_type = MapType(StringType(), FloatType())
376 | printable_type: str = quinn.print_schema_as_code(map_type)
377 | print(printable_type)
378 | ```
379 | 
380 | ```
381 | MapType(
382 | 	StringType(),
383 | 	FloatType(),
384 | 	True,
385 | )
386 | ```
387 | 
388 | `IntegerType`, `StringType` etc.
389 | ```python
390 | integer_type = IntegerType()
391 | printable_type: str = quinn.print_schema_as_code(integer_type)
392 | print(printable_type)
393 | ```
394 | 
395 | ```
396 | IntegerType()
397 | ```
398 | 
399 | ## Pyspark Core Class Extensions
400 | 
401 | ```python
402 | import pyspark.sql.functions as F
403 | import quinn
404 | ```
405 | 
406 | ### Column Extensions
407 | 
408 | **isFalsy()**
409 | 
410 | ```python
411 | source_df.withColumn("is_stuff_falsy", quinn.is_falsy(F.col("has_stuff")))
412 | ```
413 | 
414 | Returns `True` if `has_stuff` is `None` or `False`.
415 | 
416 | **isTruthy()**
417 | 
418 | ```python
419 | source_df.withColumn("is_stuff_truthy", quinn.is_truthy(F.col("has_stuff")))
420 | ```
421 | 
422 | Returns `True` unless `has_stuff` is `None` or `False`.
423 | 
424 | **isNullOrBlank()**
425 | 
426 | ```python
427 | source_df.withColumn("is_blah_null_or_blank", quinn.is_null_or_blank(F.col("blah")))
428 | ```
429 | 
430 | Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace).
431 | 
432 | **isNotIn()**
433 | 
434 | ```python
435 | source_df.withColumn("is_not_bobs_hobby", quinn.is_not_in(F.col("fun_thing"), bobs_hobbies))
436 | ```
437 | 
438 | Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list.
439 | 
440 | **nullBetween()**
441 | 
442 | ```python
443 | source_df.withColumn("is_between", quinn.null_between(F.col("age"), F.col("lower_age"), F.col("upper_age")))
444 | ```
445 | 
446 | Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populated, it will return `True` if `age` is lower than or equal to `upper_age`.
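
For example, a minimal sketch with hypothetical data (assumes an active `SparkSession` named `spark`):

```python
import pyspark.sql.functions as F

import quinn

source_df = spark.createDataFrame(
    [(17, 10, 20), (25, 18, None), (5, None, 10)],
    ["age", "lower_age", "upper_age"],
)
source_df.withColumn(
    "is_between",
    quinn.null_between(F.col("age"), F.col("lower_age"), F.col("upper_age")),
).show()
# Per the semantics described above, all three rows should evaluate to true.
```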
-------------------------------------------------------------------------------- /mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Quinn
2 | site_url: "https://mrpowers.github.io/quinn/"
3 | repo_url: "https://github.com/MrPowers/quinn"
4 | repo_name: "MrPowers/quinn"
5 | 
6 | theme:
7 |   name: material
8 |   palette:
9 |     - media: "(prefers-color-scheme: light)"
10 |       scheme: default
11 |       toggle:
12 |         icon: material/brightness-7
13 |         name: Switch to dark mode
14 |     - media: "(prefers-color-scheme: dark)"
15 |       scheme: slate
16 |       toggle:
17 |         icon: material/brightness-4
18 |         name: Switch to light mode
19 |   features:
20 |     - navigation.tracking
21 |     - navigation.instant
22 |     - navigation.tabs
23 |     - navigation.tabs.sticky
24 |     - navigation.footer
25 |     - navigation.indexes
26 |     - navigation.expand
27 |     - content.tabs.link
28 |     - content.code.copy
29 |     - content.code.select
30 | 
31 | plugins:
32 |   - search
33 |   - gen-files:
34 |       scripts:
35 |         - docs/gen_ref_pages.py
36 |   - section-index
37 |   - mkdocstrings:
38 |       default_handler: python
39 |       handlers:
40 |         python:
41 |           options:
42 |             docstring_style: sphinx
43 |             docstring_options:
44 |               show_if_no_docstring: true
45 |             show_source: true
46 |   - mkdocs-jupyter
47 |   - markdown-exec
48 | 
49 | nav:
50 |   - Home: index.md
51 |   - Usage: usage.md
52 |   - API Reference: reference/SUMMARY.md
53 |   - Examples:
54 |       - "examples/index.md"
55 |       - "notebooks/schema_as_code.ipynb"
56 |   - Learn more:
57 |       - learn_more/index.md
58 |       - learn_more/column_to_list.md
59 | 
60 | 
61 | markdown_extensions:
62 |   - markdown_include.include:
63 |       base_path: docs
64 |   - attr_list
65 |   - pymdownx.emoji:
66 |       emoji_index: !!python/name:materialx.emoji.twemoji
67 |       emoji_generator: !!python/name:materialx.emoji.to_svg
-------------------------------------------------------------------------------- /pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "quinn"
3 | version = "0.10.3"
4 | description = "Pyspark helper methods to maximize developer efficiency"
5 | authors = ["MrPowers "]
6 | 
7 | # Maintainers of the project
8 | maintainers = [
9 |     "SemyonSinchenko "
10 | ]
11 | 
12 | readme = "README.md"
13 | homepage = "https://github.com/MrPowers/quinn/"
14 | keywords = ['apachespark', 'spark', 'pyspark']
15 | 
16 | [build-system]
17 | requires = ["poetry>=0.12"]
18 | build-backend = "poetry.masonry.api"
19 | 
20 | ###########################################################################
21 | # MAIN DEPENDENCIES
22 | ###########################################################################
23 | 
24 | [tool.poetry.dependencies]
25 | python = ">=3.7,<4.0"
26 | 
27 | 
28 | ###########################################################################
29 | # DEPENDENCY GROUPS
30 | ###########################################################################
31 | 
32 | [tool.poetry.group.development]
33 | optional = true
34 | 
35 | [tool.poetry.group.docs]
36 | optional = true
37 | 
38 | [tool.poetry.group.testing]
39 | optional = true
40 | 
41 | [tool.poetry.group.linting]
42 | optional = true
43 | 
44 | [tool.poetry.group.development.dependencies]
45 | pyspark = ">2"
46 | semver = "^3"
47 | 
48 | [tool.poetry.group.testing.dependencies]
49 | pytest = "^7"
50 | pytest-rerunfailures = "^13"
51 | chispa = "0.9.4"
52 | pytest-describe = "^2"
53 | pyspark = ">2"
54 | semver = "^3"
55 | 
56 | [tool.poetry.group.linting.dependencies]
57 | ruff = "^0.0.291"
58 | 
59 | [tool.poetry.group.docs.dependencies]
60 | mkdocstrings-python = "^0.8.3"
61 | mkdocs-gen-files = "^0.4.0"
62 | mkdocs-literate-nav = "^0.6.0"
63 | mkdocs-section-index = "^0.3.5"
64 | markdown-include = "^0.8.1"
65 | mkdocs = "^1"
66 | jupyterlab = "*"
67 | mkdocs-jupyter = "*"
68 | mkdocs-material = "*"
69 | pymdown-extensions = "*"
70 | mkdocs-macros-plugin = "*"
71 | mkdocs-material-extensions = "*"
72 | markdown-exec = "*"
73 | ###########################################################################
74 | # LINTING CONFIGURATION
75 | ###########################################################################
76 | 
77 | [tool.ruff]
78 | select = ["ALL"]
79 | line-length = 150
80 | ignore = [
81 |     "D100",
82 |     "D203",    # Ignore blank line before summary of class
83 |     "D213",    # Ignore multiline summary second line
84 |     "T201",    # Allow print() in code.
85 |     "D401",    # Docstrings should be in imperative mood
86 |     "D404",    # Boring thing about how to write docstrings
87 |     "FBT001",  # Boolean positional arg is OK
88 |     "FBT002",  # Boolean default arg value is OK
89 |     "D205",    # It is broken
90 |     "TCH003",  # I have no idea what it's about
91 |     "PLC1901", # Strange thing
92 |     "UP007",   # Not supported in py3.6
93 |     "UP038",   # Not supported in all py versions
94 |     "SIM108",  # Don't create long ternary operators
95 |     "PTH123",  # Don't force use of Pathlib
96 |     "PTH207",  # Don't force use of Pathlib
97 |     "PTH113",  # Don't force use of Pathlib
98 | ]
99 | extend-exclude = ["tests", "docs"]
100 | 
101 | [tool.ruff.per-file-ignores]
102 | "quinn/extensions/column_ext.py" = ["FBT003", "N802"]
103 | "quinn/extensions/__init__.py" = ["F401", "F403"]
104 | "quinn/__init__.py" = ["F401", "F403"]
105 | "quinn/functions.py" = ["FBT003"]
106 | "quinn/keyword_finder.py" = ["A002"]
-------------------------------------------------------------------------------- /quinn.iml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 
-------------------------------------------------------------------------------- /quinn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/quinn.png
-------------------------------------------------------------------------------- /quinn/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 14 | """quinn API.""" 15 | 16 | from quinn.append_if_schema_identical import append_if_schema_identical 17 | from quinn.dataframe_helpers import ( 18 | column_to_list, 19 | create_df, 20 | print_athena_create_table, 21 | show_output_to_df, 22 | to_list_of_dictionaries, 23 | two_columns_to_dictionary, 24 | ) 25 | from quinn.dataframe_validator import ( 26 | DataFrameMissingColumnError, 27 | DataFrameMissingStructFieldError, 28 | DataFrameProhibitedColumnError, 29 | validate_absence_of_columns, 30 | validate_presence_of_columns, 31 | validate_schema, 32 | ) 33 | from quinn.functions import ( 34 | anti_trim, 35 | approx_equal, 36 | array_choice, 37 | business_days_between, 38 | exists, 39 | forall, 40 | is_false, 41 | is_falsy, 42 | is_not_in, 43 | is_null_or_blank, 44 | is_true, 45 | is_truthy, 46 | multi_equals, 47 | null_between, 48 | remove_all_whitespace, 49 | remove_non_word_characters, 50 | single_space, 51 | uuid5, 52 | week_end_date, 53 | week_start_date, 54 | ) 55 | from quinn.math import rand_laplace, rand_range, randn 56 | from quinn.schema_helpers import print_schema_as_code 57 | from quinn.split_columns import split_col 58 | from quinn.transformations import ( 59 | snake_case_col_names, 60 | sort_columns, 61 | to_snake_case, 62 | with_columns_renamed, 63 | with_some_columns_renamed, 64 | ) 65 | -------------------------------------------------------------------------------- /quinn/append_if_schema_identical.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from pyspark.sql import DataFrame 15 | 16 | 17 | class SchemaMismatchError(ValueError): 18 | """raise this when there's a schema mismatch between source & target schema.""" 19 | 20 | 21 | def append_if_schema_identical(source_df: DataFrame, target_df: DataFrame) -> DataFrame: 22 | """Compare the schema of source & target dataframe. 23 | 24 | :param source_df: Input DataFrame 25 | :type source_df: pyspark.sql.DataFrame 26 | :param target_df: Input DataFrame 27 | :type target_df: pyspark.sql.DataFrame 28 | :return: dataframe 29 | :rtype: pyspark.sql.DataFrame 30 | """ 31 | # Retrieve the schemas of the source and target dataframes 32 | source_schema = source_df.schema 33 | target_schema = target_df.schema 34 | 35 | # Convert the schemas to a list of tuples 36 | source_schema_list = [(field.name, str(field.dataType)) for field in source_schema] 37 | target_schema_list = [(field.name, str(field.dataType)) for field in target_schema] 38 | 39 | unmatched_cols = [ 40 | col for col in source_schema_list if col not in target_schema_list 41 | ] 42 | error_message = ( 43 | f"The schemas of the source and target dataframes are not identical." 
44 | f"From source schema column {unmatched_cols} is missing in target schema" 45 | ) 46 | # Check if the column names in the source and target schemas are the same, regardless of their order 47 | if set(source_schema.fieldNames()) != set(target_schema.fieldNames()): 48 | raise SchemaMismatchError(error_message) 49 | # Check if the column names and data types in the source and target schemas are the same, in the same order 50 | if sorted(source_schema_list) != sorted(target_schema_list): 51 | raise SchemaMismatchError(error_message) 52 | 53 | # Append the dataframes if the schemas are identical 54 | return target_df.unionByName(source_df) 55 | -------------------------------------------------------------------------------- /quinn/dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | from typing import TYPE_CHECKING 17 | 18 | if TYPE_CHECKING: 19 | from pyspark.sql import DataFrame, SparkSession 20 | import sys 21 | import warnings 22 | from typing import Any 23 | 24 | from pyspark.sql.types import StructField, StructType 25 | 26 | 27 | def column_to_list(df: DataFrame, col_name: str) -> list[Any]: 28 | """Collect column to list of values. 29 | 30 | :param df: Input DataFrame 31 | :type df: pyspark.sql.DataFrame 32 | :param col_name: Column to collect 33 | :type col_name: str 34 | :return: List of values 35 | :rtype: List[Any] 36 | """ 37 | if "pyspark" not in sys.modules: 38 | raise ImportError 39 | 40 | # sparksession from df is not available in older versions of pyspark 41 | if sys.modules["pyspark"].__version__ < "3.3.0": 42 | return [row[0] for row in df.select(col_name).collect()] 43 | 44 | spark_session = df.sparkSession.getActiveSession() 45 | if spark_session is None: 46 | return [row[0] for row in df.select(col_name).collect()] 47 | 48 | pyarrow_enabled = ( 49 | spark_session.conf.get( 50 | "spark.sql.execution.arrow.pyspark.enabled", 51 | ) 52 | == "true" 53 | ) 54 | 55 | pyarrow_valid = pyarrow_enabled and sys.modules["pyarrow"].__version__ >= "0.17.0" 56 | 57 | pandas_exists = "pandas" in sys.modules 58 | pandas_valid = pandas_exists and sys.modules["pandas"].__version__ >= "0.24.2" 59 | 60 | if pyarrow_valid and pandas_valid: 61 | return df.select(col_name).toPandas()[col_name].tolist() 62 | 63 | return [row[0] for row in df.select(col_name).collect()] 64 | 65 | 66 | def two_columns_to_dictionary( 67 | df: DataFrame, 68 | key_col_name: str, 69 | value_col_name: str, 70 | ) -> dict[str, Any]: 71 | """Collect two columns as dictionary when first column is key and second is value. 
66 | def two_columns_to_dictionary(
67 |     df: DataFrame,
68 |     key_col_name: str,
69 |     value_col_name: str,
70 | ) -> dict[str, Any]:
71 |     """Collect two columns as a dictionary, where the first column provides keys and the second provides values.
72 | 
73 |     :param df: Input DataFrame
74 |     :type df: pyspark.sql.DataFrame
75 |     :param key_col_name: Key-column
76 |     :type key_col_name: str
77 |     :param value_col_name: Value-column
78 |     :type value_col_name: str
79 |     :return: Dictionary with values
80 |     :rtype: Dict[str, Any]
81 |     """
82 |     k, v = key_col_name, value_col_name
83 |     return {x[k]: x[v] for x in df.select(k, v).collect()}
84 | 
85 | 
86 | def to_list_of_dictionaries(df: DataFrame) -> list[dict[str, Any]]:
87 |     """Convert a Spark DataFrame to a list of dictionaries.
88 | 
89 |     :param df: The Spark DataFrame to convert.
90 |     :type df: :py:class:`pyspark.sql.DataFrame`
91 |     :return: A list of dictionaries representing the rows in the DataFrame.
92 |     :rtype: List[Dict[str, Any]]
93 |     """
94 |     return list(map(lambda r: r.asDict(), df.collect()))  # noqa: C417
95 | 
96 | 
97 | def print_athena_create_table(
98 |     df: DataFrame,
99 |     athena_table_name: str,
100 |     s3location: str,
101 | ) -> None:
102 |     """Generate the Athena create table statement for a given DataFrame.
103 |     :param df: The pyspark.sql.DataFrame to use
104 |     :param athena_table_name: The name of the athena table to generate
105 |     :param s3location: The S3 location of the parquet data
106 |     :return: None.
107 |     """
108 |     warnings.warn(
109 |         "Function print_athena_create_table is deprecated and will be removed in version 1.0",
110 |         category=DeprecationWarning,
111 |         stacklevel=2,
112 |     )
113 | 
114 |     fields = df.schema
115 | 
116 |     print(f"CREATE EXTERNAL TABLE IF NOT EXISTS `{athena_table_name}` ( ")
117 | 
118 |     for field in fields.fieldNames()[:-1]:
119 |         print("\t", f"`{fields[field].name}` {fields[field].dataType.simpleString()}, ")
120 |     last = fields[fields.fieldNames()[-1]]
121 |     print("\t", f"`{last.name}` {last.dataType.simpleString()} ")
122 | 
123 |     print(")")
124 |     print("STORED AS PARQUET")
125 |     print(f"LOCATION '{s3location}'\n")
126 | 
127 | 
128 | def show_output_to_df(show_output: str, spark: SparkSession) -> DataFrame:
129 |     """Show output as spark DataFrame.
130 | 
131 |     :param show_output: String representing output of 'show' command in spark
132 |     :type show_output: str
133 |     :param spark: SparkSession object
134 |     :type spark: SparkSession
135 |     :return: DataFrame object containing output of a show command in spark
136 |     :rtype: Dataframe
137 |     """
138 |     lines = show_output.split("\n")
139 |     ugly_column_names = lines[1]
140 |     pretty_column_names = [i.strip() for i in ugly_column_names[1:-1].split("|")]
141 |     pretty_data = []
142 |     ugly_data = lines[3:-1]
143 |     for row in ugly_data:
144 |         r = [i.strip() for i in row[1:-1].split("|")]
145 |         pretty_data.append(tuple(r))
146 |     return spark.createDataFrame(pretty_data, pretty_column_names)
147 | 
148 | 
149 | def create_df(spark: SparkSession, rows_data, col_specs) -> DataFrame:  # noqa: ANN001
150 |     """Create a new DataFrame from the given data and column specs.
151 | 
152 |     The returned DataFrame is created using the StructType and StructField classes provided by PySpark.
153 | 
154 |     :param spark: SparkSession object
155 |     :type spark: SparkSession
156 |     :param rows_data: the data used to create the DataFrame
157 |     :type rows_data: array-like
158 |     :param col_specs: list of tuples containing the name and type of the field
159 |     :type col_specs: list of tuples
160 |     :return: a new DataFrame
161 |     :rtype: DataFrame
162 |     """
163 |     struct_fields = list(map(lambda x: StructField(*x), col_specs))  # noqa: C417
164 |     return spark.createDataFrame(data=rows_data, schema=StructType(struct_fields))
165 | 
-------------------------------------------------------------------------------- /quinn/dataframe_validator.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import copy
17 | from typing import TYPE_CHECKING, Union
18 | 
19 | if TYPE_CHECKING:
20 |     from pyspark.sql import DataFrame
21 |     from pyspark.sql.types import StructType
22 | 
23 | 
24 | class DataFrameMissingColumnError(ValueError):
25 |     """Raise this when there's a DataFrame column error."""
26 | 
27 | 
28 | class DataFrameMissingStructFieldError(ValueError):
29 |     """Raise this when there's a DataFrame StructField error."""
30 | 
31 | 
32 | class DataFrameProhibitedColumnError(ValueError):
33 |     """Raise this when a DataFrame includes prohibited columns."""
34 | 
35 | 
36 | def validate_presence_of_columns(df: DataFrame, required_col_names: list[str], return_bool: bool = False) -> Union[None, bool]:
37 |     """Validate the presence of column names in a DataFrame.
38 |     :param df: A spark DataFrame.
39 |     :type df: DataFrame
40 |     :param required_col_names: List of the required column names for the DataFrame.
41 |     :type required_col_names: list[str]
42 |     :param return_bool: If True, return a boolean instead of raising an exception.
43 |     :type return_bool: bool
44 |     :return: None if return_bool is False, otherwise a boolean indicating if validation passed.
45 |     :raises DataFrameMissingColumnError: if any of the requested column names are
46 |         not present in the DataFrame and return_bool is False.
47 | """ 48 | all_col_names = df.columns 49 | missing_col_names = [x for x in required_col_names if x not in all_col_names] 50 | 51 | if missing_col_names: 52 | error_message = f"The {missing_col_names} columns are not included in the DataFrame with the following columns {all_col_names}" 53 | if return_bool: 54 | return False 55 | raise DataFrameMissingColumnError(error_message) 56 | 57 | return True if return_bool else None 58 | 59 | 60 | def validate_schema( 61 | df: DataFrame, 62 | required_schema: StructType, 63 | ignore_nullable: bool = False, 64 | return_bool: bool = False, 65 | ) -> Union[None, bool]: 66 | """Function that validate if a given DataFrame has a given StructType as its schema. 67 | :param df: DataFrame to validate 68 | :type df: DataFrame 69 | :param required_schema: StructType required for the DataFrame 70 | :type required_schema: StructType 71 | :param ignore_nullable: (Optional) A flag for if nullable fields should be 72 | ignored during validation 73 | :type ignore_nullable: bool, optional 74 | :param return_bool: If True, return a boolean instead of raising an exception. 75 | :type return_bool: bool 76 | :return: None if return_bool is False, otherwise a boolean indicating if validation passed. 77 | :raises DataFrameMissingStructFieldError: if any StructFields from the required 78 | schema are not included in the DataFrame schema and return_bool is False. 79 | """ 80 | _all_struct_fields = copy.deepcopy(df.schema) 81 | _required_schema = copy.deepcopy(required_schema) 82 | 83 | if ignore_nullable: 84 | for x in _all_struct_fields: 85 | x.nullable = None 86 | 87 | for x in _required_schema: 88 | x.nullable = None 89 | 90 | missing_struct_fields = [x for x in _required_schema if x not in _all_struct_fields] 91 | 92 | if missing_struct_fields: 93 | error_message = ( 94 | f"The {missing_struct_fields} StructFields are not included in the DataFrame with the following StructFields {_all_struct_fields}" 95 | ) 96 | if return_bool: 97 | return False 98 | raise DataFrameMissingStructFieldError(error_message) 99 | 100 | return True if return_bool else None 101 | 102 | 103 | def validate_absence_of_columns(df: DataFrame, prohibited_col_names: list[str], return_bool: bool = False) -> Union[None, bool]: 104 | """Validate that none of the prohibited column names are present among specified DataFrame columns. 105 | :param df: DataFrame containing columns to be checked. 106 | :param prohibited_col_names: List of prohibited column names. 107 | :param return_bool: If True, return a boolean instead of raising an exception. 108 | :type return_bool: bool 109 | :return: None if return_bool is False, otherwise a boolean indicating if validation passed. 110 | :raises DataFrameProhibitedColumnError: If the prohibited column names are 111 | present among the specified DataFrame columns and return_bool is False. 
112 | """ 113 | all_col_names = df.columns 114 | extra_col_names = [x for x in all_col_names if x in prohibited_col_names] 115 | 116 | if extra_col_names: 117 | error_message = f"The {extra_col_names} columns are not allowed to be included in the DataFrame with the following columns {all_col_names}" 118 | if return_bool: 119 | return False 120 | raise DataFrameProhibitedColumnError(error_message) 121 | 122 | return True if return_bool else None 123 | -------------------------------------------------------------------------------- /quinn/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | """Extensions API.""" 15 | 16 | from quinn.extensions.dataframe_ext import _ext_function 17 | from quinn.extensions.spark_session_ext import create_df 18 | -------------------------------------------------------------------------------- /quinn/extensions/dataframe_ext.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import warnings 15 | 16 | from pyspark.sql import SparkSession 17 | from pyspark.sql.dataframe import DataFrame 18 | 19 | 20 | def _ext_function(spark: SparkSession, f: object) -> object: 21 | warnings.warn( 22 | "Extensions may be removed in the future versions of quinn. Please use explicit functions instead", 23 | category=DeprecationWarning, 24 | stacklevel=2, 25 | ) 26 | return f(spark) 27 | 28 | 29 | DataFrame.transform = getattr(DataFrame, "transform", _ext_function) 30 | -------------------------------------------------------------------------------- /quinn/extensions/spark_session_ext.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import warnings
17 | 
18 | from pyspark.sql import DataFrame, SparkSession
19 | from pyspark.sql.types import StructField, StructType
20 | 
21 | 
22 | def create_df(
23 |     spark: SparkSession,
24 |     rows_data: list[tuple],
25 |     col_specs: list[tuple],
26 | ) -> DataFrame:
27 |     """Creates a new DataFrame from the given data and column specs.
28 | 
29 |     The returned DataFrame is created using the StructType and StructField classes provided by PySpark.
30 | 
31 |     :param rows_data: the data used to create the DataFrame
32 |     :type rows_data: array-like
33 |     :param col_specs: list of tuples containing the name and type of the field
34 |     :type col_specs: list of tuples
35 |     :return: a new DataFrame
36 |     :rtype: DataFrame
37 |     """
38 |     warnings.warn(
39 |         "Extensions may be removed in future versions of quinn. Please use `quinn.create_df()` instead",
40 |         category=DeprecationWarning,
41 |         stacklevel=2,
42 |     )
43 | 
44 |     struct_fields = [StructField(*x) for x in col_specs]
45 |     return spark.createDataFrame(data=rows_data, schema=StructType(struct_fields))
46 | 
47 | 
48 | SparkSession.create_df = create_df
49 | 
-------------------------------------------------------------------------------- /quinn/keyword_finder.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import os
17 | from dataclasses import dataclass
18 | from glob import iglob
19 | 
20 | default_keywords = [
21 |     "_jsc",
22 |     "_jconf",
23 |     "_jvm",
24 |     "_jsparkSession",
25 |     "_jreader",
26 |     "_jc",
27 |     "_jseq",
28 |     "_jdf",
29 |     "_jmap",
30 |     "_jco",
31 |     "emptyRDD",
32 |     "range",
33 |     "init_batched_serializer",
34 |     "parallelize",
35 |     "pickleFile",
36 |     "textFile",
37 |     "wholeTextFiles",
38 |     "binaryFiles",
39 |     "binaryRecords",
40 |     "sequenceFile",
41 |     "newAPIHadoopFile",
42 |     "newAPIHadoopRDD",
43 |     "hadoopFile",
44 |     "hadoopRDD",
45 |     "union",
46 |     "runJob",
47 |     "setSystemProperty",
48 |     "uiWebUrl",
49 |     "stop",
50 |     "setJobGroup",
51 |     "setLocalProperty",
52 |     "getCon",
53 |     "rdd",
54 |     "sparkContext",
55 | ]
56 | 
57 | @dataclass
58 | class SearchResult:
59 |     """Class to hold the results of a file search.
60 |     file_path: The path to the file that was searched.
61 |     word_count: A dictionary mapping each keyword to the number of lines in which it was found.
62 |     """
63 | 
64 |     file_path: str
65 |     word_count: dict[str, int]
66 | 
67 | 
68 | def search_file(path: str, keywords: list[str] = default_keywords) -> SearchResult:
69 |     """Searches a file for keywords and prints the line number and line containing the keyword.
70 | 
71 |     :param path: The path to the file to search.
72 |     :type path: str
73 |     :param keywords: The list of keywords to search for.
74 |     :type keywords: list[str]
75 |     :returns: A SearchResult with the file path and the number of lines containing each keyword in `keywords`.
76 |     :rtype: SearchResult
77 | 
78 |     """
79 |     match_results = SearchResult(file_path=path, word_count={keyword: 0 for keyword in keywords})
80 | 
81 |     print(f"\nSearching: {path}")
82 |     with open(path) as f:
83 |         for line_number, line in enumerate(f, 1):
84 |             line_printed = False
85 |             for keyword in keywords:
86 |                 if keyword in line:
87 |                     match_results.word_count[keyword] += 1
88 | 
89 |                     if not line_printed:
90 |                         print(f"{line_number}: {keyword_format(line)}", end="")
91 |                         line_printed = True
92 | 
93 |     return match_results
94 | 
95 | 
96 | def search_files(path: str, keywords: list[str] = default_keywords) -> list[SearchResult]:
97 |     """Searches all files in a directory for keywords.
98 | 
99 |     :param path: The path to the directory to search.
100 |     :type path: str
101 |     :param keywords: The list of keywords to search for.
102 |     :type keywords: list[str]
103 |     :returns: A list of SearchResult objects, one for each file searched.
104 |     :rtype: list[SearchResult]
105 | 
106 |     """
107 |     rootdir_glob = f"{path}/**/*"
108 |     file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f)]
109 |     return [search_file(f, keywords) for f in file_list]
110 | 
111 | 
112 | def keyword_format(input: str, keywords: list[str] = default_keywords) -> str:
113 |     """Formats the input string to highlight the keywords.
114 | 
115 |     :param input: The string to format.
116 |     :type input: str
117 |     :param keywords: The list of keywords to highlight.
118 |     :type keywords: list[str]
119 | 
120 |     """
121 |     nc = "\033[0m"
122 |     red = "\033[31m"
123 |     bold = "\033[1m"
124 |     res = input
125 |     for keyword in keywords:
126 |         res = surround_substring(res, keyword, red + bold, nc)
127 |     return res
128 | 
129 | 
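# Example (illustrative): scan a source tree and report how often a given
# keyword shows up per file:
#
#     for result in search_files("quinn"):
#         print(result.file_path, result.word_count["rdd"])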
130 | def surround_substring(input: str, substring: str, surround_start: str, surround_end: str) -> str:
131 |     """Surrounds a substring with the given start and end strings.
132 | 
133 |     :param input: The string to search.
134 |     :type input: str
135 |     :param substring: The substring to surround.
136 |     :type substring: str
137 |     :param surround_start: The string to start the surrounding with.
138 |     :type surround_start: str
139 |     :param surround_end: The string to end the surrounding with.
140 |     :type surround_end: str
141 |     :returns: The input string with the substring surrounded.
142 |     :rtype: str
143 | 
144 |     """
145 |     return input.replace(
146 |         substring,
147 |         surround_start + substring + surround_end,
148 |     )
149 | 
-------------------------------------------------------------------------------- /quinn/math.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | """Math routines for PySpark."""
15 | from __future__ import annotations
16 | 
17 | from typing import Optional, Union
18 | 
19 | from pyspark.sql import Column
20 | from pyspark.sql import functions as F  # noqa: N812
21 | 
22 | 
23 | def rand_laplace(
24 |     mu: Union[float, Column],
25 |     beta: Union[float, Column],
26 |     seed: Optional[int] = None,
27 | ) -> Column:
28 |     """Generate random numbers from Laplace(mu, beta).
29 | 
30 |     :param mu: mu parameter of Laplace distribution
31 |     :param beta: beta parameter of Laplace distribution
32 |     :param seed: random seed value (optional, default None)
33 |     :returns: column with random numbers
34 |     """
35 |     if not isinstance(mu, Column):
36 |         mu = F.lit(mu)
37 | 
38 |     if not isinstance(beta, Column):
39 |         beta = F.lit(beta)
40 | 
41 |     u = F.rand(seed) - F.lit(0.5)
42 |     return (mu - beta * F.signum(u) * F.log(F.lit(1) - (F.lit(2) * F.abs(u)))).alias(
43 |         "laplace_random",
44 |     )
45 | 
46 | 
47 | def rand_range(
48 |     minimum: Union[int, Column],
49 |     maximum: Union[int, Column],
50 |     seed: Optional[int] = None,
51 | ) -> Column:
52 |     """Generate random numbers uniformly distributed in [`minimum`, `maximum`).
53 | 
54 |     :param minimum: minimum value of the random numbers
55 |     :param maximum: maximum value of the random numbers
56 |     :param seed: random seed value (optional, default None)
57 |     :returns: column with random numbers
58 |     """
59 |     if not isinstance(minimum, Column):
60 |         minimum = F.lit(minimum)
61 | 
62 |     if not isinstance(maximum, Column):
63 |         maximum = F.lit(maximum)
64 | 
65 |     u = F.rand(seed)
66 | 
67 |     return minimum + (maximum - minimum) * u
68 | 
69 | 
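# Example (illustrative, assuming a DataFrame `df` with one row per sample):
#
#     df.select(rand_laplace(0.0, 1.0, seed=42).alias("laplace")).show()
#     df.select(rand_range(5, 10, seed=42).alias("uniform")).show()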
70 | def randn(
71 |     mean: Union[float, Column],
72 |     variance: Union[float, Column],
73 |     seed: Optional[int] = None,
74 | ) -> Column:
75 |     """Generate a column with independent and identically distributed (i.i.d.) samples from
76 |     a normal distribution with the given `mean` and `variance`.
77 | 
78 |     :param mean: Mean of the normal distribution of the random numbers
79 |     :param variance: variance of the normal distribution of the random numbers
80 |     :param seed: random seed value (optional, default None)
81 |     :returns: column with random numbers
82 |     """
83 |     if not isinstance(mean, Column):
84 |         mean = F.lit(mean)
85 | 
86 |     if not isinstance(variance, Column):
87 |         variance = F.lit(variance)
88 | 
89 |     return F.randn(seed) * F.sqrt(variance) + mean
90 | 
91 | 
92 | def div_or_else(
93 |     cola: Column,
94 |     colb: Column,
95 |     default: Union[float, Column] = 0.0,
96 | ) -> Column:
97 |     """Return result of division of cola by colb or default if colb is zero.
98 | 
99 |     :param cola: dividend
100 |     :param colb: divisor
101 |     :param default: default value
102 |     :returns: result of the division, or `default` when colb is zero
103 |     """
104 |     if not isinstance(default, Column):
105 |         default = F.lit(default)
106 | 
107 |     return F.when(colb == F.lit(0.0), default).otherwise(cola / colb)
108 | 
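# Example (illustrative; column names are hypothetical): a safe ratio that
# avoids division-by-zero errors:
#
#     df.withColumn("ratio", div_or_else(F.col("clicks"), F.col("views"), default=0.0))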
-------------------------------------------------------------------------------- /quinn/schema_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import json
17 | from typing import Optional
18 | 
19 | from pyspark.sql import SparkSession
20 | from pyspark.sql import types as T  # noqa: N812
21 | 
22 | 
23 | def print_schema_as_code(dtype: T.DataType) -> str:
24 |     """Represent DataType (including StructType) as valid Python code.
25 | 
26 |     :param dtype: The input DataType or Schema object
27 |     :type dtype: pyspark.sql.types.DataType
28 |     :return: Valid Python code which generates the same schema.
29 |     :rtype: str
30 |     """
31 |     res = []
32 |     if isinstance(dtype, T.StructType):
33 |         res.append("StructType(\n\tfields=[")
34 |         for field in dtype.fields:
35 |             for line in _repr_column(field).split("\n"):
36 |                 res.append("\n\t\t")
37 |                 res.append(line)
38 |             res.append(",")
39 |         res.append("\n\t]\n)")
40 | 
41 |     elif isinstance(dtype, T.ArrayType):
42 |         res.append("ArrayType(")
43 |         res.append(print_schema_as_code(dtype.elementType))
44 |         res.append(")")
45 | 
46 |     elif isinstance(dtype, T.MapType):
47 |         res.append("MapType(")
48 |         res.append(f"\n\t{print_schema_as_code(dtype.keyType)},")
49 |         for line in print_schema_as_code(dtype.valueType).split("\n"):
50 |             res.append("\n\t")
51 |             res.append(line)
52 |         res.append(",")
53 |         res.append(f"\n\t{dtype.valueContainsNull},")
54 |         res.append("\n)")
55 | 
56 |     elif isinstance(dtype, T.DecimalType):
57 |         res.append(f"DecimalType({dtype.precision}, {dtype.scale})")
58 | 
59 |     elif str(dtype).endswith("()"):
60 |         # PySpark 3.3+
61 |         res.append(str(dtype))
62 |     else:
63 |         res.append(f"{dtype}()")
64 | 
65 |     return "".join(res)
66 | 
67 | 
68 | def _repr_column(column: T.StructField) -> str:
69 |     res = []
70 | 
71 |     if isinstance(column.dataType, (T.ArrayType, T.MapType, T.StructType)):
72 |         res.append(f'StructField(\n\t"{column.name}",')
73 |         for line in print_schema_as_code(column.dataType).split("\n"):
74 |             res.append("\n\t")
75 |             res.append(line)
76 |         res.append(",")
77 |         res.append(f"\n\t{column.nullable},")
78 |         res.append("\n)")
79 | 
80 |     else:
81 |         res.append(
82 |             f'StructField("{column.name}", {print_schema_as_code(column.dataType)}, {column.nullable})',
83 |         )
84 | 
85 |     return "".join(res)
86 | 
87 | 
88 | def schema_from_csv(spark: SparkSession, file_path: str) -> T.StructType:  # noqa: C901
89 |     """Return a StructType from a CSV file containing schema configuration.
90 | 
91 |     :param spark: The SparkSession object
92 |     :type spark: pyspark.sql.session.SparkSession
93 | 
94 |     :param file_path: The path to the CSV file containing the schema configuration
95 |     :type file_path: str
96 | 
97 |     :raises ValueError: If the CSV file does not contain the expected columns: name, type, nullable, metadata
98 | 
99 |     :return: A StructType object representing the schema configuration
100 |     :rtype: pyspark.sql.types.StructType
101 |     """
102 | 
103 |     def _validate_json(metadata: Optional[str]) -> dict:
104 |         if metadata is None:
105 |             return {}
106 | 
107 |         try:
108 |             metadata_dict = json.loads(metadata)
109 | 
110 |         except json.JSONDecodeError as exc:
111 |             msg = f"Invalid JSON: {metadata}"
112 |             raise ValueError(msg) from exc
113 | 
114 |         return metadata_dict
115 | 
116 |     def _lookup_type(type_str: str) -> T.DataType:
117 |         type_lookup = {
118 |             "string": T.StringType(),
119 |             "int": T.IntegerType(),
120 |             "float": T.FloatType(),
121 |             "double": T.DoubleType(),
122 |             "boolean": T.BooleanType(),
123 |             "bool": T.BooleanType(),
124 |             "timestamp": T.TimestampType(),
125 |             "date": T.DateType(),
126 |             "binary": T.BinaryType(),
127 |         }
128 | 
129 |         if type_str not in type_lookup:
130 |             msg = f"Invalid type: {type_str}. Expecting one of: {type_lookup.keys()}"
131 |             raise ValueError(msg)
132 | 
133 |         return type_lookup[type_str]
134 | 
135 |     def _convert_nullable(null_str: str) -> bool:
136 |         if null_str is None:
137 |             return True
138 | 
139 |         parsed_val = null_str.lower()
140 |         if parsed_val not in ["true", "false"]:
141 |             msg = f"Invalid nullable value: {null_str}. Expecting True or False."
142 |             raise ValueError(msg)
143 | 
144 |         return parsed_val == "true"
145 | 
146 |     schema_df = spark.read.csv(file_path, header=True)
147 |     possible_columns = ["name", "type", "nullable", "metadata"]
148 |     num_cols = len(schema_df.columns)
149 |     expected_columns = possible_columns[0:num_cols]
150 | 
151 |     # ensure that csv contains the expected columns: name, type, nullable, metadata
152 |     if schema_df.columns != expected_columns:
153 |         msg = f"CSV must contain columns in this order: {expected_columns}"
154 |         raise ValueError(msg)
155 | 
156 |     # create a StructType per field
157 |     fields = []
158 |     for row in schema_df.collect():
159 |         field = T.StructField(
160 |             name=row["name"],
161 |             dataType=_lookup_type(row["type"]),
162 |             nullable=_convert_nullable(row["nullable"]) if "nullable" in row else True,
163 |             metadata=_validate_json(row["metadata"] if "metadata" in row else None),
164 |         )
165 |         fields.append(field)
166 | 
167 |     return T.StructType(fields=fields)
168 | 
169 | 
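# Example (illustrative): given a CSV file "schema.csv" with the rows
#
#     name,type,nullable
#     person,string,false
#     age,int,true
#
# the call `schema_from_csv(spark, "schema.csv")` returns the corresponding
# StructType with a non-nullable `person` string field and a nullable `age`
# integer field.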
170 | def complex_fields(schema: T.StructType) -> dict[str, object]:
171 |     """Return a dictionary of complex field names and their data types from the input schema.
172 | 
173 |     :param schema: The input schema as a StructType.
174 |     :type schema: StructType
175 |     :return: A dictionary with complex field names as keys and their respective data types as values.
176 |     :rtype: Dict[str, object]
177 |     """
178 |     return {
179 |         field.name: field.dataType
180 |         for field in schema.fields
181 |         if isinstance(field.dataType, (T.ArrayType, T.StructType, T.MapType))
182 |     }
183 | 
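# Example (illustrative): for a schema with one array column,
#
#     schema = T.StructType([
#         T.StructField("id", T.StringType()),
#         T.StructField("scores", T.ArrayType(T.DoubleType())),
#     ])
#     complex_fields(schema)  # -> {"scores": ArrayType(DoubleType(), True)}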
34 | 35 | :param df: The input DataFrame 36 | :type df: pyspark.sql.DataFrame 37 | :param col_name: The name of the column to split 38 | :type col_name: str 39 | :param delimiter: The delimiter to split the column on 40 | :type delimiter: str 41 | :param new_col_names: A list of strings for the new column names, one per expected split value 42 | :type new_col_names: List[str] 43 | :param mode: The split mode. Can be "strict" or "permissive". Default is "permissive" 44 | :type mode: str 45 | :param default: The default value assigned to split values that are missing or empty in "permissive" mode 46 | :type default: Optional[str] 47 | :return: The resulting DataFrame with the split columns 48 | :rtype: pyspark.sql.DataFrame 49 | """ 50 | # Check if the column to be split exists in the DataFrame 51 | if col_name not in df.columns: 52 | msg = f"Column '{col_name}' not found in DataFrame." 53 | raise ValueError(msg) 54 | 55 | # Check if the delimiter is a string 56 | if not isinstance(delimiter, str): 57 | msg = "Delimiter must be a string." 58 | raise TypeError(msg) 59 | 60 | # Check if the new column names are a list of strings 61 | if not isinstance(new_col_names, list): 62 | msg = "New column names must be a list of strings." 63 | raise TypeError(msg) 64 | 65 | # Define a UDF to count the occurrences of the delimiter 66 | def _num_delimiter(col_value1: str) -> int: 67 | # Get the count of delimiter and store the result in no_of_delimiter 68 | no_of_delimiter = col_value1.count(delimiter) 69 | # Split col_value based on delimiter and store the result in split_value 70 | split_value = col_value1.split(delimiter) 71 | 72 | # Check if col_value is not None 73 | if col_value1 is not None: 74 | # Check whether the number of delimiters matches the expected column count 75 | if no_of_delimiter != len(new_col_names) - 1: 76 | # If it does not, raise an IndexError mentioning the expected and found number of elements 77 | msg = f"Expected {len(new_col_names)} elements after splitting on delimiter, found {len(split_value)} elements" 78 | raise IndexError( 79 | msg, 80 | ) 81 | 82 | # If the length of split_value matches new_col_names, check if any of the split values is None or an empty string 83 | elif any( # noqa: RET506 84 | x is None or x.strip() == "" for x in split_value[: len(new_col_names)] 85 | ): 86 | msg = "Null or empty values are not accepted for columns in strict mode" 87 | raise ValueError( 88 | msg, 89 | ) 90 | 91 | # If the above checks pass, return the count of delimiter 92 | return int(no_of_delimiter) 93 | 94 | # If col_value is None, return 0 95 | return 0 96 | 97 | num_udf = udf(lambda y: None if y is None else _num_delimiter(y), IntegerType()) 98 | 99 | # Get the column expression for the column to be split 100 | col_expr = df[col_name] 101 | 102 | # Split the column by the delimiter 103 | split_col_expr = split(trim(col_expr), delimiter) 104 | 105 | # Check the split mode 106 | if mode == "strict": 107 | # Create an array of select expressions to create new columns from the split values 108 | select_exprs = [ 109 | when(split_col_expr.getItem(i) != "", split_col_expr.getItem(i)).alias( 110 | new_col_names[i], 111 | ) 112 | for i in range(len(new_col_names)) 113 | ] 114 | 115 | # Select all the columns from the input DataFrame, along with the new split columns 116 | df = df.select("*", *select_exprs) # noqa: PD901 117 | df = df.withColumn("del_length", num_udf(df[col_name])) # noqa: PD901 118 | df.cache() 119 | # Drop the original column if the new columns were created successfully 120 | df = df.select( # noqa: PD901
121 | [c for c in df.columns if c not in {"del_length", col_name}], 122 | ) 123 | 124 | elif mode == "permissive": 125 | # Create an array of select expressions to create new columns from the split values 126 | # Use the default value if a split value is missing or empty 127 | select_exprs = [ 128 | when(length(split_col_expr.getItem(i)) > 0, split_col_expr.getItem(i)) 129 | .otherwise(default) 130 | .alias(new_col_names[i]) 131 | for i in range(len(new_col_names)) 132 | ] 133 | 134 | # Select all the columns from the input DataFrame, along with the new split columns 135 | # Drop the original column if the new columns were created successfully 136 | df = df.select("*", *select_exprs).drop(col_name) # noqa: PD901 137 | df.cache() 138 | 139 | else: 140 | msg = f"Invalid mode: {mode}" 141 | raise ValueError(msg) 142 | 143 | # Return the DataFrame with the split columns 144 | return df 145 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/tests/__init__.py -------------------------------------------------------------------------------- /tests/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/tests/extensions/__init__.py -------------------------------------------------------------------------------- /tests/extensions/dataframe_transformations.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import lit 2 | 3 | 4 | def with_greeting(df): 5 | return df.withColumn("greeting", lit("hi")) 6 | 7 | 8 | def with_something(df, something): 9 | return df.withColumn("something", lit(something)) 10 | 11 | 12 | def with_funny(word): 13 | def inner(df): 14 | return df.withColumn("funny", lit(word)) 15 | 16 | return inner 17 | 18 | 19 | def with_jacket(word, df): 20 | return df.withColumn("jacket", lit(word)) 21 | -------------------------------------------------------------------------------- /tests/extensions/test_dataframe_ext.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import pytest 3 | import pyspark 4 | import chispa 5 | from pyspark.sql.functions import col 6 | 7 | from ..spark import spark 8 | 9 | from .dataframe_transformations import ( 10 | with_greeting, 11 | with_something, 12 | with_funny, 13 | with_jacket, 14 | ) 15 | 16 | 17 | def test_verbose_code_without_transform(): 18 | data = [("jose", 1), ("li", 2), ("liz", 3)] 19 | source_df = spark.createDataFrame(data, ["name", "age"]) 20 | df1 = with_greeting(source_df) 21 | df2 = with_something(df1, "moo") 22 | expected_data = [ 23 | ("jose", 1, "hi", "moo"), 24 | ("li", 2, "hi", "moo"), 25 | ("liz", 3, "hi", "moo"), 26 | ] 27 | expected_df = spark.createDataFrame( 28 | expected_data, ["name", "age", "greeting", "something"] 29 | ) 30 | chispa.assert_df_equality(df2, expected_df, ignore_nullable=True) 31 | 32 | 33 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 34 | def test_transform_with_lambda(): 35 | data = [("jose", 1), ("li", 2), ("liz", 3)] 36 | source_df = spark.createDataFrame(data, ["name", "age"]) 37 | actual_df = source_df.transform( 38 | lambda df: 
df.withColumn("age_times_two", col("age") * 2) 39 | ) 40 | expected_data = [("jose", 1, 2), ("li", 2, 4), ("liz", 3, 6)] 41 | expected_df = spark.createDataFrame(expected_data, ["name", "age", "age_times_two"]) 42 | chispa.assert_df_equality(actual_df, expected_df) 43 | 44 | 45 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 46 | def test_transform_with_no_arg_fun(): 47 | data = [("jose", 1), ("li", 2), ("liz", 3)] 48 | source_df = spark.createDataFrame(data, ["name", "age"]) 49 | actual_df = source_df.transform(lambda df: with_greeting(df)) 50 | expected_data = [("jose", 1, "hi"), ("li", 2, "hi"), ("liz", 3, "hi")] 51 | expected_df = spark.createDataFrame(expected_data, ["name", "age", "greeting"]) 52 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 53 | 54 | 55 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 56 | def test_transform_with_one_arg_fun(): 57 | data = [("jose", 1), ("li", 2), ("liz", 3)] 58 | source_df = spark.createDataFrame(data, ["name", "age"]) 59 | actual_df = source_df.transform(lambda df: with_something(df, "crazy")) 60 | expected_data = [("jose", 1, "crazy"), ("li", 2, "crazy"), ("liz", 3, "crazy")] 61 | expected_df = spark.createDataFrame(expected_data, ["name", "age", "something"]) 62 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 63 | 64 | 65 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 66 | def test_chain_transforms(): 67 | data = [("jose", 1), ("li", 2), ("liz", 3)] 68 | source_df = spark.createDataFrame(data, ["name", "age"]) 69 | actual_df = source_df.transform(with_greeting).transform( 70 | lambda df: with_something(df, "crazy") 71 | ) 72 | expected_data = [ 73 | ("jose", 1, "hi", "crazy"), 74 | ("li", 2, "hi", "crazy"), 75 | ("liz", 3, "hi", "crazy"), 76 | ] 77 | expected_df = spark.createDataFrame( 78 | expected_data, ["name", "age", "greeting", "something"] 79 | ) 80 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 81 | 82 | 83 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 84 | def test_transform_with_closure(): 85 | data = [("jose", 1), ("li", 2), ("liz", 3)] 86 | source_df = spark.createDataFrame(data, ["name", "age"]) 87 | actual_df = source_df.transform(with_greeting).transform( # no lambda required 88 | with_funny("haha") 89 | ) 90 | expected_data = [ 91 | ("jose", 1, "hi", "haha"), 92 | ("li", 2, "hi", "haha"), 93 | ("liz", 3, "hi", "haha"), 94 | ] 95 | expected_df = spark.createDataFrame( 96 | expected_data, ["name", "age", "greeting", "funny"] 97 | ) 98 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 99 | 100 | 101 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 102 | def test_transform_with_functools_partial(): 103 | data = [("jose", 1), ("li", 2), ("liz", 3)] 104 | source_df = spark.createDataFrame(data, ["name", "age"]) 105 | actual_df = source_df.transform( 106 | partial(with_greeting) 107 | ).transform( # partial is optional for transformations that only take a single DataFrame argument 108 | partial(with_jacket, "warm") 109 | ) 110 | expected_data = [ 111 | ("jose", 1, "hi", "warm"), 112 | ("li", 2, "hi", "warm"), 113 | ("liz", 3, "hi", "warm"), 114 | ] 115 | expected_df = spark.createDataFrame( 116 | expected_data, ["name", "age", "greeting", "jacket"] 117 | ) 118 | 
chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 119 | -------------------------------------------------------------------------------- /tests/extensions/test_spark_session_ext.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StructField, StringType 2 | 3 | from ..spark import spark 4 | 5 | import chispa 6 | import quinn 7 | 8 | 9 | def test_create_df(): 10 | schema = StructType( 11 | [ 12 | StructField("name", StringType(), True), 13 | StructField("blah", StringType(), True), 14 | ] 15 | ) 16 | data = [("jose", "a"), ("li", "b"), ("sam", "c")] 17 | actual_df = spark.createDataFrame(data, schema) 18 | 19 | expected_df = quinn.create_df( 20 | spark, 21 | [("jose", "a"), ("li", "b"), ("sam", "c")], 22 | [("name", StringType(), True), ("blah", StringType(), True)], 23 | ) 24 | 25 | chispa.assert_df_equality(expected_df, actual_df) 26 | -------------------------------------------------------------------------------- /tests/spark.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("chispa").getOrCreate() 4 | -------------------------------------------------------------------------------- /tests/test_append_if_schema_identical.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StructField, IntegerType, StringType 2 | import quinn 3 | from .spark import spark 4 | from chispa.schema_comparer import assert_basic_schema_equality 5 | from quinn.append_if_schema_identical import SchemaMismatchError 6 | 7 | 8 | def test_append_if_schema_identical(): 9 | source_data = [(1, "cape town", "Alice"), (2, "delhi", "Bob")] 10 | target_data = [(3, "Charlie", "New York"), (4, "Dave", "Los Angeles")] 11 | bad_data = [(5, "Eve", "London", "extra_column")] 12 | 13 | source_df = spark.createDataFrame( 14 | source_data, 15 | schema=StructType( 16 | [ 17 | StructField("id", IntegerType()), 18 | StructField("city", StringType()), 19 | StructField("name", StringType()), 20 | ] 21 | ), 22 | ) 23 | 24 | target_df = spark.createDataFrame( 25 | target_data, 26 | schema=StructType( 27 | [ 28 | StructField("id", IntegerType()), 29 | StructField("name", StringType()), 30 | StructField("city", StringType()), 31 | ] 32 | ), 33 | ) 34 | 35 | unidentical_df = spark.createDataFrame( 36 | bad_data, 37 | schema=StructType( 38 | [ 39 | StructField("id", IntegerType()), 40 | StructField("name", StringType()), 41 | StructField("city", StringType()), 42 | StructField("extra", StringType()), 43 | ] 44 | ), 45 | ) 46 | 47 | check_if_error_caught = False 48 | expected_names = ["Charlie", "Dave", "Alice", "Bob"] 49 | expected_cities = ["New York", "Los Angeles", "cape town", "delhi"] 50 | 51 | # Call the append_if_schema_identical function 52 | result = quinn.append_if_schema_identical(source_df, target_df) 53 | 54 | # check result content 55 | names = [i.name for i in result.select("name").collect()] 56 | cities = [i.city for i in result.select("city").collect()] 57 | 58 | if result.count() != 4: 59 | raise AssertionError("result should have 4 rows") 60 | 61 | if names != expected_names: 62 | raise AssertionError("result should have the correct names") 63 | 64 | if cities != expected_cities: 65 | raise AssertionError("result should have the correct cities") 66 | 67 | assert_basic_schema_equality(target_df.schema, 
result.schema) 68 | 69 | try: 70 | quinn.append_if_schema_identical(source_df, unidentical_df) 71 | except SchemaMismatchError: 72 | check_if_error_caught = True 73 | 74 | if not check_if_error_caught: 75 | raise AssertionError( 76 | "append_if_schema_identical should raise an error if the schemas are not identical" 77 | ) 78 | -------------------------------------------------------------------------------- /tests/test_dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | import quinn 2 | from .spark import spark 3 | import chispa 4 | from pyspark.sql.types import IntegerType, StringType, StructType, StructField 5 | 6 | 7 | def describe_column_to_list(): 8 | def it_returns_a_list(): 9 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 10 | source_df = spark.createDataFrame(data, ["name", "age"]) 11 | actual = quinn.column_to_list(source_df, "name") 12 | assert ["jose", "li", "luisa"] == actual 13 | 14 | 15 | def describe_two_columns_to_dictionary(): 16 | def it_returns_a_dictionary(): 17 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 18 | source_df = spark.createDataFrame(data, ["name", "age"]) 19 | actual = quinn.two_columns_to_dictionary(source_df, "name", "age") 20 | assert {"jose": 1, "li": 2, "luisa": 3} == actual 21 | 22 | 23 | def describe_to_list_of_dictionaries(): 24 | def returns_a_list_of_dicts(): 25 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 26 | source_df = spark.createDataFrame(data, ["name", "age"]) 27 | actual = quinn.to_list_of_dictionaries(source_df) 28 | expected = [ 29 | {"name": "jose", "age": 1}, 30 | {"name": "li", "age": 2}, 31 | {"name": "luisa", "age": 3}, 32 | ] 33 | assert expected == actual 34 | 35 | 36 | def describe_show_output_to_df(): 37 | def it_converts_a_show_string_to_a_dataframe(): 38 | s = """+----+---+-----------+------+ 39 | |name|age| stuff1|stuff2| 40 | +----+---+-----------+------+ 41 | |jose| 1|nice person| yoyo| 42 | | li| 2|nice person| yoyo| 43 | | liz| 3|nice person| yoyo| 44 | +----+---+-----------+------+""" 45 | actual_df = quinn.show_output_to_df(s, spark) 46 | expected_data = [ 47 | ("jose", "1", "nice person", "yoyo"), 48 | ("li", "2", "nice person", "yoyo"), 49 | ("liz", "3", "nice person", "yoyo"), 50 | ] 51 | expected_df = spark.createDataFrame( 52 | expected_data, ["name", "age", "stuff1", "stuff2"] 53 | ) 54 | chispa.assert_df_equality(expected_df, actual_df) 55 | 56 | 57 | def describe_print_athena_create_table(): 58 | def it_prints_a_create_table_string_for_athena(capsys): 59 | source_df = spark.createDataFrame( 60 | [("jets", "football", 45), ("nacional", "soccer", 10)], 61 | ["team", "sport", "goals_for"], 62 | ) 63 | quinn.print_athena_create_table(source_df, "athena_table", "s3://mock") 64 | out, _ = capsys.readouterr() 65 | assert ( 66 | out 67 | == "CREATE EXTERNAL TABLE IF NOT EXISTS `athena_table` ( \n\t `team` string, \n\t `sport` string, \n\t `goals_for` bigint \n)\nSTORED AS PARQUET\nLOCATION 's3://mock'\n\n" # noqa 68 | ) 69 | 70 | 71 | def test_create_df(): 72 | rows_data = [("jose", 1), ("li", 2), ("luisa", 3)] 73 | col_specs = [("name", StringType()), ("age", IntegerType())] 74 | 75 | expected_schema = StructType( 76 | [ 77 | StructField("name", StringType(), True), 78 | StructField("age", IntegerType(), True), 79 | ] 80 | ) 81 | actual = quinn.create_df(spark, rows_data, col_specs) 82 | expected = spark.createDataFrame(rows_data, expected_schema) 83 | chispa.assert_df_equality(actual, expected) 84 | 
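The tests above double as usage documentation for the DataFrame helpers. A minimal, self-contained sketch pulling them together, assuming a local SparkSession like the one built in tests/spark.py (the "quinn-demo" app name is arbitrary; every quinn call mirrors the tests above):

    # Minimal usage sketch for the helpers exercised above; assumes a local
    # SparkSession like the one built in tests/spark.py.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, StringType

    import quinn

    spark = SparkSession.builder.master("local").appName("quinn-demo").getOrCreate()

    # create_df builds a DataFrame from rows plus (name, type, nullable) column specs.
    df = quinn.create_df(
        spark,
        [("jose", 1), ("li", 2), ("luisa", 3)],
        [("name", StringType(), True), ("age", IntegerType(), True)],
    )

    quinn.column_to_list(df, "name")                    # ['jose', 'li', 'luisa']
    quinn.two_columns_to_dictionary(df, "name", "age")  # {'jose': 1, 'li': 2, 'luisa': 3}
    quinn.to_list_of_dictionaries(df)                   # [{'name': 'jose', 'age': 1}, ...]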
-------------------------------------------------------------------------------- /tests/test_dataframe_validator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.types import StructType, StructField, StringType, LongType 3 | import semver 4 | import quinn 5 | from .spark import spark 6 | 7 | 8 | def describe_validate_presence_of_columns(): 9 | def it_raises_if_a_required_column_is_missing_and_return_bool_is_false(): 10 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 11 | source_df = spark.createDataFrame(data, ["name", "age"]) 12 | with pytest.raises(quinn.DataFrameMissingColumnError) as excinfo: 13 | quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"], False) 14 | assert ( 15 | excinfo.value.args[0] 16 | == "The ['fun'] columns are not included in the DataFrame with the following columns ['name', 'age']" 17 | ) 18 | 19 | def it_does_nothing_if_all_required_columns_are_present_and_return_bool_is_false(): 20 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 21 | source_df = spark.createDataFrame(data, ["name", "age"]) 22 | quinn.validate_presence_of_columns(source_df, ["name"], False) 23 | 24 | def it_returns_false_if_a_required_column_is_missing_and_return_bool_is_true(): 25 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 26 | source_df = spark.createDataFrame(data, ["name", "age"]) 27 | result = quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"], True) 28 | assert result is False 29 | 30 | def it_returns_true_if_all_required_columns_are_present_and_return_bool_is_true(): 31 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 32 | source_df = spark.createDataFrame(data, ["name", "age"]) 33 | result = quinn.validate_presence_of_columns(source_df, ["name"], True) 34 | assert result is True 35 | 36 | 37 | def describe_validate_schema(): 38 | def it_raises_when_struct_field_is_missing_and_return_bool_is_false(): 39 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 40 | source_df = spark.createDataFrame(data, ["name", "age"]) 41 | required_schema = StructType( 42 | [ 43 | StructField("name", StringType(), True), 44 | StructField("city", StringType(), True), 45 | ] 46 | ) 47 | with pytest.raises(quinn.DataFrameMissingStructFieldError) as excinfo: 48 | quinn.validate_schema(source_df, required_schema, return_bool = False) 49 | 50 | current_spark_version = semver.Version.parse(spark.version) 51 | spark_330 = semver.Version.parse("3.3.0") 52 | if semver.Version.compare(current_spark_version, spark_330) >= 0: # Spark 3.3+ 53 | expected_error_message = "The [StructField('city', StringType(), True)] StructFields are not included in the DataFrame with the following StructFields StructType([StructField('name', StringType(), True), StructField('age', LongType(), True)])" # noqa 54 | else: 55 | expected_error_message = "The [StructField(city,StringType,true)] StructFields are not included in the DataFrame with the following StructFields StructType(List(StructField(name,StringType,true),StructField(age,LongType,true)))" # noqa 56 | assert excinfo.value.args[0] == expected_error_message 57 | 58 | def it_does_nothing_when_the_schema_matches_and_return_bool_is_false(): 59 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 60 | source_df = spark.createDataFrame(data, ["name", "age"]) 61 | required_schema = StructType( 62 | [ 63 | StructField("name", StringType(), True), 64 | StructField("age", LongType(), True), 65 | ] 66 | ) 67 | quinn.validate_schema(source_df, required_schema, return_bool = False) 68 | 69 | 
def it_returns_false_when_struct_field_is_missing_and_return_bool_is_true(): 70 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 71 | source_df = spark.createDataFrame(data, ["name", "age"]) 72 | required_schema = StructType( 73 | [ 74 | StructField("name", StringType(), True), 75 | StructField("city", StringType(), True), 76 | ] 77 | ) 78 | result = quinn.validate_schema(source_df, required_schema, return_bool = True) 79 | assert result is False 80 | 81 | def it_returns_true_when_the_schema_matches_and_return_bool_is_true(): 82 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 83 | source_df = spark.createDataFrame(data, ["name", "age"]) 84 | required_schema = StructType( 85 | [ 86 | StructField("name", StringType(), True), 87 | StructField("age", LongType(), True), 88 | ] 89 | ) 90 | result = quinn.validate_schema(source_df, required_schema, return_bool = True) 91 | assert result is True 92 | 93 | def nullable_column_mismatches_are_ignored(): 94 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 95 | source_df = spark.createDataFrame(data, ["name", "age"]) 96 | required_schema = StructType( 97 | [ 98 | StructField("name", StringType(), True), 99 | StructField("age", LongType(), False), 100 | ] 101 | ) 102 | quinn.validate_schema(source_df, required_schema, ignore_nullable=True, return_bool = False) 103 | 104 | 105 | def describe_validate_absence_of_columns(): 106 | def it_raises_when_a_unallowed_column_is_present_and_return_bool_is_false(): 107 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 108 | source_df = spark.createDataFrame(data, ["name", "age"]) 109 | with pytest.raises(quinn.DataFrameProhibitedColumnError) as excinfo: 110 | quinn.validate_absence_of_columns(source_df, ["age", "cool"], False) 111 | assert ( 112 | excinfo.value.args[0] 113 | == "The ['age'] columns are not allowed to be included in the DataFrame with the following columns ['name', 'age']" # noqa 114 | ) 115 | 116 | def it_does_nothing_when_no_unallowed_columns_are_present_and_return_bool_is_false(): 117 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 118 | source_df = spark.createDataFrame(data, ["name", "age"]) 119 | quinn.validate_absence_of_columns(source_df, ["favorite_color"], False) 120 | 121 | def it_returns_false_when_a_unallowed_column_is_present_and_return_bool_is_true(): 122 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 123 | source_df = spark.createDataFrame(data, ["name", "age"]) 124 | result = quinn.validate_absence_of_columns(source_df, ["age", "cool"], True) 125 | assert result is False 126 | 127 | def it_returns_true_when_no_unallowed_columns_are_present_and_return_bool_is_true(): 128 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 129 | source_df = spark.createDataFrame(data, ["name", "age"]) 130 | result = quinn.validate_absence_of_columns(source_df, ["favorite_color"], True) 131 | assert result is True 132 | -------------------------------------------------------------------------------- /tests/test_files/bad_schema.csv: -------------------------------------------------------------------------------- 1 | whatever,type,nullable,metadata 2 | blah,string,false,{"description":"The person's name"} 3 | no,string -------------------------------------------------------------------------------- /tests/test_files/good_schema1.csv: -------------------------------------------------------------------------------- 1 | name,type,nullable,metadata 2 | person,string,false,{"description":"The person's name"} 3 | address,string 4 | phoneNumber,string,TRUE,{"description":"The person's phone number"} 5 | age,int,False 
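good_schema1.csv above is the fixture that schema_from_csv parses. A sketch of the round trip, assuming a running SparkSession named spark; the commented-out StructType mirrors the expected schema in tests/test_schema_helpers.py:

    # Sketch: how schema_from_csv interprets good_schema1.csv above.
    # Assumes a running SparkSession named `spark`.
    from quinn.schema_helpers import schema_from_csv

    schema = schema_from_csv(spark, "tests/test_files/good_schema1.csv")
    # Missing nullable cells default to True and missing metadata cells to {},
    # so the result is equivalent to:
    # StructType([
    #     StructField("person", StringType(), False, {"description": "The person's name"}),
    #     StructField("address", StringType(), True),
    #     StructField("phoneNumber", StringType(), True, {"description": "The person's phone number"}),
    #     StructField("age", IntegerType(), False),
    # ])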
-------------------------------------------------------------------------------- /tests/test_files/good_schema2.csv: -------------------------------------------------------------------------------- 1 | name,type 2 | person,string 3 | address,string 4 | phoneNumber,string 5 | age,int -------------------------------------------------------------------------------- /tests/test_files/some_pyspark.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | 4 | print("hi") 5 | 6 | spark = SparkSession.builder.appName('my_app').getOrCreate() 7 | sparkContext = spark.sparkContext 8 | rdd=sparkContext.parallelize([1,2,3,4,5]) 9 | rddCollect = rdd.collect() 10 | print("Number of Partitions: "+str(rdd.getNumPartitions())) 11 | print("Action: First element: "+str(rdd.first())) 12 | print(rddCollect) 13 | 14 | print("bye") -------------------------------------------------------------------------------- /tests/test_keyword_finder.py: -------------------------------------------------------------------------------- 1 | from quinn.keyword_finder import search_file, search_files, keyword_format, surround_substring 2 | 3 | 4 | def test_search_file(): 5 | file_path = "tests/test_files/some_pyspark.py" 6 | results = search_file(file_path) 7 | 8 | assert results.word_count["rdd"] == 5 9 | assert results.word_count["sparkContext"] == 2 10 | 11 | 12 | def test_search_files(): 13 | results = search_files("tests/test_files") 14 | 15 | pyspark_file = [result for result in results if result.file_path == "tests/test_files/some_pyspark.py"][0] 16 | csv_file = [result for result in results if result.file_path == "tests/test_files/good_schema1.csv"][0] 17 | 18 | assert pyspark_file.word_count["rdd"] == 5 19 | assert pyspark_file.word_count["sparkContext"] == 2 20 | assert csv_file.word_count["rdd"] == 0 21 | 22 | 23 | def test_keyword_format(): 24 | print(keyword_format("spark rdd stuff")) 25 | print(keyword_format("spark rdd stuff with bad _jvm")) 26 | print(keyword_format("nice string")) 27 | print(keyword_format("")) 28 | 29 | 30 | def test_surround_substring(): 31 | 32 | assert "spark **rdd|| stuff" == surround_substring("spark rdd stuff", "rdd", "**", "||") 33 | assert "spark **rdd|| stuff with **rdd||" == surround_substring("spark rdd stuff with rdd", "rdd", "**", "||") 34 | assert "spark **rdd||dd stuff" == surround_substring("spark rdddd stuff", "rdd", "**", "||") 35 | -------------------------------------------------------------------------------- /tests/test_math.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | import pytest 4 | import quinn 5 | import math 6 | from .spark import spark 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "mean, scale", 11 | [ 12 | (1.0, 2.0), 13 | (2.0, 3.0), 14 | (3.0, 4.0), 15 | ], 16 | ) 17 | @pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"]) 18 | def test_rand_laplace(mean: float, scale: float): 19 | stats = ( 20 | spark.range(100000) 21 | .select(quinn.rand_laplace(mean, scale, 42)) 22 | .agg( 23 | F.mean("laplace_random").alias("mean"), 24 | F.stddev("laplace_random").alias("std_dev"), 25 | ) 26 | .first() 27 | ) 28 | 29 | laplace_mean = stats["mean"] 30 | laplace_stddev = stats["std_dev"] 31 | 32 | # A Laplace distribution with the given mean and scale has mean=mean and stddev=scale*sqrt(2) 33 | assert abs(laplace_mean - mean) <= 0.1 34 | assert abs(laplace_stddev - scale * math.sqrt(2.0)) <= 0.1 35 | 36 | 37 | 
@pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"]) 38 | def test_rand_range(): 39 | lower_bound = 5 40 | upper_bound = 10 41 | stats = ( 42 | spark.range(1000) 43 | .select(quinn.rand_range(lower_bound, upper_bound).alias("rand_uniform")) 44 | .agg(F.min("rand_uniform").alias("min"), F.max("rand_uniform").alias("max")) 45 | .first() 46 | ) 47 | 48 | uniform_min = stats["min"] 49 | uniform_max = stats["max"] 50 | 51 | assert lower_bound <= uniform_min <= uniform_max <= upper_bound 52 | 53 | 54 | @pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"]) 55 | def test_randn(): 56 | mean = 1.0 57 | variance = 2.0 58 | stats = ( 59 | spark.range(1000) 60 | .select(quinn.randn(mean, variance).alias("rand_normal")) 61 | .agg( 62 | F.mean("rand_normal").alias("agg_mean"), 63 | F.variance("rand_normal").alias("agg_variance"), 64 | ) 65 | .first() 66 | ) 67 | 68 | agg_mean = stats["agg_mean"] 69 | agg_variance = stats["agg_variance"] 70 | 71 | assert abs(agg_mean - mean) <= 0.1 72 | assert abs(agg_variance - variance) <= 0.2 73 | -------------------------------------------------------------------------------- /tests/test_schema_helpers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pyspark.sql.types import ( 4 | StructType, 5 | IntegerType, 6 | DecimalType, 7 | ArrayType, 8 | FloatType, 9 | MapType, 10 | StringType, 11 | DoubleType, 12 | TimestampType, 13 | StructField, 14 | ) 15 | import pyspark.sql.dataframe 16 | 17 | from quinn.schema_helpers import print_schema_as_code, schema_from_csv, complex_fields 18 | 19 | from chispa.schema_comparer import assert_basic_schema_equality 20 | import pytest 21 | 22 | from .spark import spark 23 | 24 | 25 | def test_print_schema_as_code(): 26 | fields = [] 27 | fields.append(StructField("simple_int", IntegerType())) 28 | fields.append(StructField("decimal_with_nums", DecimalType(19, 8))) 29 | fields.append(StructField("array", ArrayType(FloatType()))) 30 | fields.append(StructField("map", MapType(StringType(), ArrayType(DoubleType())))) 31 | fields.append( 32 | StructField( 33 | "struct", 34 | StructType( 35 | [ 36 | StructField("first", StringType()), 37 | StructField("second", TimestampType()), 38 | ] 39 | ), 40 | ) 41 | ) 42 | 43 | schema = StructType(fields=fields) 44 | 45 | assert_basic_schema_equality(schema, eval(print_schema_as_code(schema))) 46 | 47 | 48 | def test_schema_from_csv_good_schema1(): 49 | expected_schema = StructType( 50 | [ 51 | StructField( 52 | "person", StringType(), False, {"description": "The person's name"} 53 | ), 54 | StructField("address", StringType(), True), 55 | StructField( 56 | "phoneNumber", 57 | StringType(), 58 | True, 59 | {"description": "The person's phone number"}, 60 | ), 61 | StructField("age", IntegerType(), False), 62 | ] 63 | ) 64 | path = "tests/test_files/good_schema1.csv" 65 | assert_basic_schema_equality(expected_schema, schema_from_csv(spark, path)) 66 | 67 | 68 | def test_schema_from_csv_good_schema2(): 69 | expected_schema = StructType( 70 | [ 71 | StructField("person", StringType(), True), 72 | StructField("address", StringType(), True), 73 | StructField("phoneNumber", StringType(), True), 74 | StructField("age", IntegerType(), True), 75 | ] 76 | ) 77 | path = "tests/test_files/good_schema2.csv" 78 | assert_basic_schema_equality(expected_schema, schema_from_csv(spark, path)) 79 | 80 | 81 | def test_schema_from_csv_equality_for_bad_csv(): 82 | path = "tests/test_files/bad_schema.csv" 83 | with 
pytest.raises(ValueError) as excinfo: 84 | schema_from_csv(spark, path) 85 | assert ( 86 | excinfo.value.args[0] 87 | == "CSV must contain columns in this order: ['name', 'type', 'nullable', 'metadata']" 88 | ) 89 | 90 | 91 | def test_complex_fields(): 92 | schema = StructType( 93 | [ 94 | StructField("id", IntegerType(), True), 95 | StructField( 96 | "details", 97 | StructType( 98 | [ 99 | StructField("name", StringType(), True), 100 | StructField("address", StringType(), True), 101 | StructField("age", IntegerType(), True), 102 | ] 103 | ), 104 | True, 105 | ), 106 | ] 107 | ) 108 | expected = { 109 | "details": StructType( 110 | [ 111 | StructField("name", StringType(), True), 112 | StructField("address", StringType(), True), 113 | StructField("age", IntegerType(), True), 114 | ] 115 | ) 116 | } 117 | assert complex_fields(schema) == expected 118 | -------------------------------------------------------------------------------- /tests/test_split_columns.py: -------------------------------------------------------------------------------- 1 | import quinn 2 | import chispa 3 | import pytest 4 | from .spark import spark 5 | 6 | 7 | def test_split_columns(): 8 | data = [ 9 | ("chrisXXmoe", 2025, "bio"), 10 | ("davidXXbb", 2026, "physics"), 11 | (None, 2025, "physics"), 12 | ] 13 | df = spark.createDataFrame(data, ["student_name", "graduation_year", "major"]) 14 | new_df = quinn.split_col( 15 | df, 16 | col_name="student_name", 17 | delimiter="XX", 18 | new_col_names=["student_first_name", "student_last_name"], 19 | mode="permissive", 20 | ) 21 | data = [ 22 | (2025, "bio", "chris", "moe"), 23 | (2026, "physics", "david", "bb"), 24 | (2025, "physics", None, None), 25 | ] 26 | expected = spark.createDataFrame( 27 | data, ["graduation_year", "major", "student_first_name", "student_last_name"] 28 | ) 29 | chispa.assert_df_equality(new_df, expected) 30 | 31 | 32 | def test_split_columns_advanced(): 33 | data = [ 34 | ("chrisXXsomethingXXmoe", 2025, "bio"), 35 | ("davidXXbb", 2026, "physics"), 36 | (None, 2025, "physics"), 37 | ] 38 | df = spark.createDataFrame(data, ["student_name", "graduation_year", "major"]) 39 | new_df = quinn.split_col( 40 | df, 41 | col_name="student_name", 42 | delimiter="XX", 43 | new_col_names=[ 44 | "student_first_name", 45 | "student_middle_name", 46 | "student_last_name", 47 | ], 48 | mode="permissive", 49 | ) 50 | data = [ 51 | (2025, "bio", "chris", "something", "moe"), 52 | (2026, "physics", "david", "bb", None), 53 | (2025, "physics", None, None, None), 54 | ] 55 | expected = spark.createDataFrame( 56 | data, 57 | [ 58 | "graduation_year", 59 | "major", 60 | "student_first_name", 61 | "student_middle_name", 62 | "student_last_name", 63 | ], 64 | ) 65 | chispa.assert_df_equality(new_df, expected) 66 | 67 | 68 | def test_split_columns_strict(): 69 | data = [ 70 | ("chrisXXsomethingXXmoe", 2025, "bio"), 71 | ("davidXXbb", 2026, "physics"), 72 | (None, 2025, "physics"), 73 | ] 74 | df = spark.createDataFrame(data, ["student_name", "graduation_year", "major"]) 75 | df2 = quinn.split_col( 76 | df, 77 | col_name="student_name", 78 | delimiter="XX", 79 | new_col_names=[ 80 | "student_first_name", 81 | "student_middle_name", 82 | "student_last_name", 83 | ], 84 | mode="strict", 85 | default="hi", 86 | ) 87 | with pytest.raises( 88 | Exception 89 | ): # the exact exception type raised differs across PySpark versions 90 | df2.show() 91 | --------------------------------------------------------------------------------
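To round out the split_col tests above, a short usage sketch of the two modes, assuming a running SparkSession named spark; the column names below are illustrative:

    # Sketch of split_col's two modes; assumes a SparkSession named `spark`.
    import quinn

    df = spark.createDataFrame(
        [("chrisXXmoe", 2025), ("davidXXbb", 2026), (None, 2025)],
        ["student_name", "graduation_year"],
    )

    # Permissive mode: missing or empty split values fall back to `default`
    # (None here), so the all-None row simply yields None in both new columns.
    permissive = quinn.split_col(
        df, col_name="student_name", delimiter="XX",
        new_col_names=["first_name", "last_name"], mode="permissive",
    )

    # Strict mode: a UDF counts delimiters per row, so a row with the wrong
    # number of parts raises once the result is evaluated (e.g. by .show()).
    strict = quinn.split_col(
        df, col_name="student_name", delimiter="XX",
        new_col_names=["first_name", "last_name"], mode="strict",
    )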