├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── docs_improvement.yml │ └── feature_request.yml ├── pull_request_template.md └── workflows │ ├── assign-on-comment.yml │ ├── ci.yml │ ├── lint.yaml │ ├── mkdocs.yml │ └── pr.yml ├── .gitignore ├── .licenserc.yaml ├── .pre-commit-config.yaml ├── .python-version ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── __init__.py ├── benchmark_column_performance.py ├── create_benchmark_df.py ├── results │ ├── collectlist_large.json │ ├── collectlist_medium.json │ ├── collectlist_small.json │ ├── collectlist_xsmall.json │ ├── flatmap_large.json │ ├── flatmap_medium.json │ ├── flatmap_small.json │ ├── flatmap_xsmall.json │ ├── localIterator_large.json │ ├── localIterator_medium.json │ ├── localIterator_small.json │ ├── localIterator_xsmall.json │ ├── map_large.json │ ├── map_medium.json │ ├── map_small.json │ ├── map_xsmall.json │ ├── toPandas_large.json │ ├── toPandas_medium.json │ ├── toPandas_small.json │ └── toPandas_xsmall.json └── visualize_benchmarks.py ├── docs ├── examples │ └── index.md ├── gen_ref_pages.py ├── images │ ├── column_to_list_boxplot.svg │ ├── column_to_list_line_plot.svg │ └── quinn.png ├── index.md ├── learn_more │ ├── column_to_list.md │ └── index.md ├── notebooks │ └── schema_as_code.ipynb └── usage.md ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── quinn.iml ├── quinn.png ├── quinn ├── __init__.py ├── append_if_schema_identical.py ├── dataframe_helpers.py ├── dataframe_validator.py ├── extensions │ ├── __init__.py │ ├── dataframe_ext.py │ └── spark_session_ext.py ├── functions.py ├── keyword_finder.py ├── math.py ├── schema_helpers.py ├── split_columns.py └── transformations.py └── tests ├── __init__.py ├── extensions ├── __init__.py ├── dataframe_transformations.py ├── test_dataframe_ext.py └── test_spark_session_ext.py ├── spark.py ├── test_append_if_schema_identical.py ├── test_dataframe_helpers.py ├── test_dataframe_validator.py ├── test_files ├── bad_schema.csv ├── good_schema1.csv ├── good_schema2.csv └── some_pyspark.py ├── test_functions.py ├── test_keyword_finder.py ├── test_math.py ├── test_schema_helpers.py ├── test_split_columns.py └── test_transformations.py /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report incorrect behavior in the quinn library 3 | title: "BUG: " 4 | labels: [Bug] 5 | 6 | body: 7 | - type: checkboxes 8 | id: checks 9 | attributes: 10 | label: Quinn version checks 11 | options: 12 | - label: > 13 | I have checked that this issue has not already been reported. 14 | required: true 15 | - label: > 16 | I have confirmed this bug exists on the 17 | [latest version](https://pypi.org/project/quinn/) of quinn. 18 | required: true 19 | - label: > 20 | I have confirmed this bug exists on the 21 | [main branch](https://github.com/MrPowers/quinn) of quinn. 22 | - type: textarea 23 | id: example 24 | attributes: 25 | label: Reproducible Example 26 | description: > 27 | Please follow [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) on how to 28 | provide a minimal, copy-pastable example. 29 | placeholder: > 30 | import quinn 31 | 32 | 33 | quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"]) 34 | 35 | ... 
36 | render: python 37 | validations: 38 | required: true 39 | - type: textarea 40 | id: problem 41 | attributes: 42 | label: Issue Description 43 | description: > 44 | Please provide a description of the issue shown in the reproducible example. 45 | validations: 46 | required: true 47 | - type: textarea 48 | id: expected-behavior 49 | attributes: 50 | label: Expected Behavior 51 | description: > 52 | Please describe or show a code example of the expected behavior. 53 | validations: 54 | required: true 55 | - type: textarea 56 | id: version 57 | attributes: 58 | label: Installed Versions 59 | description: > 60 | Please paste the output of ``quinn.__version__`` 61 | value: > 62 |
63 | 64 | Replace this line with the output of quinn.__version__ 65 | 66 |
67 | validations: 68 | required: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/docs_improvement.yml: -------------------------------------------------------------------------------- 1 | name: Documentation Improvement 2 | description: Report wrong or missing documentation 3 | title: "DOC: " 4 | labels: [Docs] 5 | 6 | body: 7 | - type: checkboxes 8 | attributes: 9 | label: Quinn version checks 10 | options: 11 | - label: > 12 | I have checked that the issue still exists on the latest versions of the docs 13 | on `main` [here](https://mrpowers.github.io/quinn/) 14 | required: true 15 | - type: textarea 16 | id: location 17 | attributes: 18 | label: Location of the documentation 19 | description: > 20 | Please provide the location of the documentation, e.g. "quinn.validate_schema()" 21 | validations: 22 | required: true 23 | - type: textarea 24 | id: problem 25 | attributes: 26 | label: Documentation problem 27 | description: > 28 | Please provide a description of what documentation you believe needs to be fixed/improved 29 | validations: 30 | required: true 31 | - type: textarea 32 | id: suggested-fix 33 | attributes: 34 | label: Suggested fix for documentation 35 | description: > 36 | Please explain the suggested fix and **why** it's better than the existing documentation 37 | validations: 38 | required: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Suggest an idea for quinn 3 | title: "ENH: " 4 | labels: [Enhancement] 5 | body: 6 | - type: checkboxes 7 | id: checks 8 | attributes: 9 | label: Feature Type 10 | description: Please check what type of feature request you would like to propose. 11 | options: 12 | - label: > 13 | Adding new functionality to quinn 14 | - label: > 15 | Changing existing functionality in quinn 16 | - label: > 17 | Removing existing functionality in quinn 18 | - type: textarea 19 | id: description 20 | attributes: 21 | label: Problem Description 22 | description: > 23 | Please describe what problem the feature would solve, e.g. "I wish I could use quinn to ..." 24 | validations: 25 | required: true 26 | - type: textarea 27 | id: feature 28 | attributes: 29 | label: Feature Description 30 | description: > 31 | Please describe how the new feature would be implemented, using pseudocode if relevant. 32 | validations: 33 | required: true 34 | - type: textarea 35 | id: context 36 | attributes: 37 | label: Additional Context 38 | description: > 39 | Please provide any relevant GitHub issues, code examples or references that help describe and support 40 | the feature request. -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Proposed changes 2 | 3 | Describe the big picture of your changes here to communicate to the maintainers. If it fixes a bug or resolves a feature request, please provide a link to that issue. 4 | 5 | ## Types of changes 6 | 7 | What types of changes does your code introduce to quinn? 
8 | _Put an `x` in the boxes that apply_ 9 | 10 | - [ ] Bugfix (non-breaking change which fixes an issue) 11 | - [ ] New feature (non-breaking change which adds functionality) 12 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 13 | - [ ] Documentation Update (if none of the other choices apply) 14 | 15 | ## Further comments 16 | 17 | If this is a relatively large or complex change, kick off the discussion by explaining why you chose the solution you did and what alternatives you considered, etc. -------------------------------------------------------------------------------- /.github/workflows/assign-on-comment.yml: -------------------------------------------------------------------------------- 1 | # This workflow was inspired by the issue_comments.yml workflow from the delta-io/delta-rs repository. 2 | # Source: https://github.com/delta-io/delta-rs/blob/main/.github/workflows/issue_comments.yml 3 | name: Auto-assign issue on comment 4 | 5 | on: 6 | issue_comment: 7 | types: [created] 8 | 9 | permissions: 10 | issues: write 11 | 12 | jobs: 13 | auto-assign-issue: 14 | runs-on: ubuntu-latest 15 | if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' 16 | concurrency: 17 | # Only run one at a time per user 18 | group: ${{ github.actor }}-auto-assign-issue 19 | steps: 20 | - name: Check if issue can be assigned 21 | id: check-assignee 22 | run: | 23 | RESPONSE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}' -s) 24 | echo "HTTP_CODE=$RESPONSE" >> $GITHUB_ENV 25 | 26 | - name: Assign issue to commenter 27 | if: env.HTTP_CODE == '204' 28 | run: | 29 | echo "Assigning issue #${{ github.event.issue.number }} to @${{ github.event.comment.user.login }}" 30 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees 31 | env: 32 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 33 | 34 | - name: Log failure to assign 35 | if: env.HTTP_CODE != '204' 36 | run: | 37 | echo "Issue #${{ github.event.issue.number }} cannot be assigned to @${{ github.event.comment.user.login }}. 
HTTP response code: ${{ env.HTTP_CODE }}" -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Unit tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | include: 20 | - pyspark-version: 2.4.8 # latest published 2.x version 21 | pip-packages: "pypandoc==1.7 pyspark==2.4.8" # downgrade of pypandoc necessary 22 | - pyspark-version: 3.0.3 23 | pip-packages: "pyspark==3.0.3" 24 | - pyspark-version: 3.1.3 25 | pip-packages: "pyspark==3.1.3" 26 | - pyspark-version: 3.2.4 27 | pip-packages: "pyspark==3.2.4" 28 | - pyspark-version: 3.3.2 29 | pip-packages: "pyspark==3.3.2" 30 | - pyspark-version: 3.4.0 31 | pip-packages: "pyspark==3.4.0" 32 | 33 | steps: 34 | - uses: actions/checkout@v1 35 | with: 36 | fetch-depth: 1 37 | 38 | - name: Setup Java 39 | uses: actions/setup-java@v3 40 | with: 41 | distribution: 'zulu' 42 | java-version: '8' # Supported by Spark 2.x & 3.x 43 | 44 | - name: Get supported Python Version depending on PySpark 45 | uses: haya14busa/action-cond@v1 46 | id: python_version 47 | with: 48 | cond: ${{ startsWith(matrix.pyspark-version, '2.') }} 49 | if_true: '3.7' # latest supported version for PySpark 2.x 50 | if_false: '3.9' # PySpark 3+ 51 | 52 | - name: Set up Python ${{ steps.python_version.outputs.value }} 53 | uses: actions/setup-python@v2 54 | with: 55 | python-version: ${{ steps.python_version.outputs.value }} 56 | 57 | - name: Get supported Poetry version 58 | uses: haya14busa/action-cond@v1 59 | id: poetry_version 60 | with: 61 | cond: ${{ startsWith(matrix.pyspark-version, '2.') }} 62 | if_true: '1.5.1' # latest supported version for PySpark 2.x 63 | if_false: '1.6.1' # PySpark 3+ 64 | 65 | - name: Install Poetry 66 | uses: snok/install-poetry@v1 67 | with: 68 | version: ${{ steps.poetry_version.outputs.value }} 69 | 70 | - name: Cache Poetry virtualenv 71 | uses: actions/cache@v1 72 | id: cache 73 | with: 74 | path: ~/.virtualenvs 75 | key: poetry-${{ hashFiles('**/poetry.lock') }} 76 | restore-keys: | 77 | poetry-${{ hashFiles('**/poetry.lock') }} 78 | 79 | - name: Install dependencies 80 | run: make install_test 81 | if: steps.cache.outputs.cache-hit != 'true' 82 | 83 | - name: Change PySpark to version ${{ matrix.pyspark-version }} 84 | env: 85 | PIP_PACKAGES: ${{ matrix.pip-packages }} 86 | run: poetry run pip install $PIP_PACKAGES # Using pip shouldn't mess up poetry cache 87 | 88 | - name: Run tests with pytest against PySpark ${{ matrix.pyspark-version }} 89 | run: make test 90 | 91 | check-license-headers: 92 | runs-on: ubuntu-latest 93 | steps: 94 | - uses: actions/checkout@v2 95 | with: 96 | fetch-depth: 0 97 | 98 | - name: Check License Header 99 | uses: apache/skywalking-eyes/header@main 100 | with: 101 | log: debug 102 | config: .licenserc.yaml 103 | mode: check 104 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v4 19 | 
with: 20 | python-version: 3.9 21 | - name: Run Ruff 22 | uses: chartboost/ruff-action@v1 23 | with: 24 | version: 0.0.291 25 | -------------------------------------------------------------------------------- /.github/workflows/mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: MKDocs deploy 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.9 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.9 19 | - name: Set up Poetry 20 | uses: abatilo/actions-poetry@v2 21 | with: 22 | poetry-version: 1.4.0 23 | - name: Cache Poetry virtualenv 24 | uses: actions/cache@v1 25 | id: cache 26 | with: 27 | path: ~/.virtualenvs 28 | key: poetry-${{ hashFiles('**/poetry.lock') }} 29 | restore-keys: | 30 | poetry-${{ hashFiles('**/poetry.lock') }} 31 | - name: Install dependencies 32 | run: 33 | make install_deps 34 | if: steps.cache.outputs.cache-hit != 'true' 35 | - name: Setup GH 36 | run: | 37 | sudo apt update && sudo apt install -y git 38 | git config user.name 'github-actions[bot]' 39 | git config user.email 'github-actions[bot]@users.noreply.github.com' 40 | - name: Build and Deploy 41 | run: 42 | poetry run mkdocs gh-deploy --force 43 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: Testing against single PySpark version 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | 11 | detect_code_changes: 12 | runs-on: ubuntu-latest 13 | outputs: 14 | code_changes: ${{ steps.changes.outputs.code_changes }} 15 | steps: 16 | - uses: dorny/paths-filter@v3 17 | id: changes 18 | with: 19 | filters: | 20 | code_changes: 21 | - 'quinn/**' 22 | - 'tests/**' 23 | - 'benchmarks/**' 24 | - '.github/**' 25 | - 'poetry.lock' 26 | - 'pyproject.toml' 27 | 28 | test: 29 | runs-on: ubuntu-latest 30 | needs: [detect_code_changes] 31 | steps: 32 | - uses: actions/checkout@v3 33 | if: needs.detect_code_changes.outputs.code_changes == 'true' 34 | with: 35 | fetch-depth: 1 36 | 37 | - name: Setup Java 38 | uses: actions/setup-java@v3 39 | if: needs.detect_code_changes.outputs.code_changes == 'true' 40 | with: 41 | distribution: 'zulu' 42 | java-version: '8' # Supported by Spark 2.x & 3.x 43 | 44 | - name: Set up Python 3.9 45 | uses: actions/setup-python@v4 46 | if: needs.detect_code_changes.outputs.code_changes == 'true' 47 | with: 48 | python-version: 3.9 49 | 50 | - name: Install Poetry 51 | uses: snok/install-poetry@v1 52 | if: needs.detect_code_changes.outputs.code_changes == 'true' 53 | with: 54 | version: 1.6.1 55 | 56 | - name: Cache Poetry virtualenv 57 | uses: actions/cache@v1 58 | if: needs.detect_code_changes.outputs.code_changes == 'true' 59 | id: cache 60 | with: 61 | path: ~/.virtualenvs 62 | key: poetry-${{ hashFiles('**/poetry.lock') }} 63 | restore-keys: | 64 | poetry-${{ hashFiles('**/poetry.lock') }} 65 | 66 | - name: Install dependencies 67 | if: | 68 | needs.detect_code_changes.outputs.code_changes == 'true' && 69 | steps.cache.outputs.cache-hit != 'true' 70 | run: make install_test 71 | # if: steps.cache.outputs.cache-hit != 'true' 72 | 73 | - name: Run tests with pytest 74 | if: needs.detect_code_changes.outputs.code_changes == 'true' 75 | run: make test 76 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | quinn.egg-info/ 4 | .cache/ 5 | tmp/ 6 | .idea/ 7 | .DS_Store 8 | 9 | .coverage* 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | 15 | .pytest_cache/ 16 | 17 | # PyVenv 18 | .env 19 | .venv 20 | venv 21 | 22 | # Linters cache 23 | .mypy_cache 24 | .ruff_cache 25 | 26 | # MKDocs 27 | site 28 | 29 | # VSCode 30 | .vscode 31 | 32 | # Emacs 33 | .dir_locals.el 34 | 35 | # Jupyter notebooks 36 | .ipynb_checkpoints 37 | 38 | # Benchmarking 39 | *.crc 40 | *.parquet 41 | _SUCCESS -------------------------------------------------------------------------------- /.licenserc.yaml: -------------------------------------------------------------------------------- 1 | license: 2 | type: Apache-2.0 3 | copyright-owner: Apache Software Foundation 4 | header: 5 | content: | 6 | Licensed to the Apache Software Foundation (ASF) under one or more 7 | contributor license agreements. See the NOTICE file distributed with 8 | this work for additional information regarding copyright ownership. 9 | The ASF licenses this file to You under the Apache License, Version 2.0 10 | (the "License"); you may not use this file except in compliance with 11 | the License. You may obtain a copy of the License at 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | paths-ignore: 19 | - 'tests/**' 20 | - 'poetry.lock' 21 | paths: 22 | - "**/*.py" 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/charliermarsh/ruff-pre-commit 3 | # Ruff version. 4 | rev: 'v0.0.291' 5 | hooks: 6 | - id: ruff 7 | - repo: local 8 | hooks: 9 | - id: pytest 10 | name: pytest-check 11 | entry: poetry run pytest 12 | language: system 13 | pass_filenames: false 14 | # Runs only on python files 15 | types: [ python ] 16 | always_run: true 17 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.7.5 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to the Quinn contributing guide 2 | 3 | ## Issues 4 | 5 | ### Create a new issue 6 | 7 | If you spot a problem with the docs, search to see if an issue already exists. If a related issue doesn't exist, you can open a [new issue](https://github.com/MrPowers/quinn/issues/new). 8 | 9 | ### Solve an issue 10 | 11 | Scan through our [existing issues](https://github.com/MrPowers/quinn/issues) to find one that interests you. If you find an issue to work on, make sure that no one else is already working on it, so you can get assigned. After that, you are welcome to open a PR with a fix. 
12 | 13 | ### Good first issue 14 | 15 | You can find a list of [good first issues](https://github.com/MrPowers/quinn/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) which can help you better understand the code base of the project. 16 | 17 | ### Auto-assigning issues 18 | 19 | We have a workflow that automatically assigns issues to users who comment 'take' on an issue. This is configured in the `.github/workflows/assign-on-comment.yml` file. When a user comments `take` on the issue, a GitHub Action will be run to assign the issue to the user if it's not already assigned. 20 | 21 | ## Contributing 22 | 23 | ### Fork the repository 24 | 25 | To start contributing, you should fork this repository and only after that clone your fork. If you accidentally cloned this repository instead of your fork, you can fix the remote at any time with this command: 26 | 27 | ```shell 28 | # for HTTPS access 29 | git remote set-url origin https://github.com/your-github-name/quinn.git 30 | # for SSH (private key) access 31 | git remote set-url origin git@github.com:your-github-name/quinn.git 32 | ``` 33 | 34 | ### Install the project 35 | 36 | #### Installing poetry 37 | 38 | After cloning the project you should install all the dependencies. We are using `poetry` as a build tool. You can install `poetry` by following [these instructions](https://python-poetry.org/docs/#installation). 39 | 40 | #### Installing dependencies 41 | 42 | You can create a virtualenv with `poetry`. The recommended version of Python is `3.9`: 43 | ```shell 44 | poetry env use python3.9 45 | ``` 46 | 47 | After that, install all the dependencies, including the development ones: 48 | ```shell 49 | make install_deps 50 | ``` 51 | 52 | #### Setup Java 53 | 54 | To run the Spark tests you need a properly configured Java installation. Apache Spark mainly supports only Java 8 (1.8). You can find instructions on how to set up Java [here](https://www.java.com/en/download/help/download_options.html). When you are running the Spark tests, the `JAVA_HOME` environment variable should point to your Java 8 installation. 55 | 56 | ### Pre-commit installation and execution 57 | 58 | We use pre-commit hooks to ensure code quality. The configuration for pre-commit hooks is in the `.pre-commit-config.yaml` file. To install pre-commit, run: 59 | ```shell 60 | poetry shell 61 | poetry run pre-commit install 62 | ``` 63 | To run pre-commit hooks manually, use: 64 | ```shell 65 | pre-commit run --all-files 66 | ``` 67 | 68 | ### Running Tests 69 | 70 | This project uses `pytest` and `chispa` for running Spark tests. Please run all the tests before creating a pull request. If you are working on new functionality, you should also add new tests. 71 | You can run the tests as follows: 72 | ```shell 73 | make test 74 | ``` 75 | 76 | ### GitHub Actions local setup using 'act' 77 | 78 | You can run GitHub Actions locally using the `act` tool. The configuration for GitHub Actions is in the `.github/workflows/ci.yml` file. To install `act`, follow the instructions [here](https://github.com/nektos/act#installation). 
To run a specific job, use: 79 | ```shell 80 | act -j <job-name> 81 | ``` 82 | For example, to run the `test` job, use: 83 | ```shell 84 | act -j test 85 | ``` 86 | If you need help with `act`, use: 87 | ```shell 88 | act --help 89 | ``` 90 | For MacBooks with M1 processors, you might have to add the `--container-architecture` flag: 91 | ```shell 92 | act -j <job-name> --container-architecture linux/arm64 93 | ``` 94 | 95 | ### Code style 96 | 97 | This project follows the [PySpark style guide](https://github.com/MrPowers/spark-style-guide/blob/main/PYSPARK_STYLE_GUIDE.md). All public functions and methods should be documented in `README.md` and should also have docstrings in Sphinx format (a complete example is given at the end of this guide): 98 | 99 | ```python 100 | """[Summary] 101 | 102 | :param [ParamName]: [ParamDescription], defaults to [DefaultParamVal] 103 | :type [ParamName]: [ParamType](, optional) 104 | ... 105 | :raises [ErrorType]: [ErrorDescription] 106 | ... 107 | :return: [ReturnDescription] 108 | :rtype: [ReturnType] 109 | """ 110 | ``` 111 | 112 | We are using `isort` and `ruff` as linters. You can find instructions on how to set up and use these tools here: 113 | 114 | 1. [isort](https://pycqa.github.io/isort/) 115 | 2. [ruff](https://github.com/charliermarsh/ruff) 116 | 117 | ### Adding ruff to IDEs 118 | 119 | #### VSCode 120 | 121 | 1. Install the `Ruff` extension by Astral Software from the VSCode marketplace (Extension ID: *charliermarsh.ruff*). 122 | 2. Open the command palette (Ctrl+Shift+P) and select `Preferences: Open Settings (JSON)`. 123 | 3. Add the following configuration to your settings.json file: 124 | 125 | ```json 126 | { 127 | "python.linting.ruffEnabled": true, 128 | "python.linting.enabled": true, 129 | "python.formatting.provider": "none", 130 | "editor.formatOnSave": true 131 | } 132 | ``` 133 | The above settings will enable linting with Ruff and format your code with Ruff on save. 134 | 135 | #### PyCharm 136 | 137 | To set up `Ruff` in PyCharm using `poetry`, follow these steps: 138 | 139 | 1. **Find the path to your `poetry` executable:** 140 | - Open a terminal. 141 | - For macOS/Linux, use the command `which poetry`. 142 | - For Windows, use the command `where poetry`. 143 | - Note down the path returned by the command. 144 | 145 | 2. **Open the `Preferences` window** (Cmd+, on macOS). 146 | 3. **Navigate to `Tools` > `External Tools`.** 147 | 4. **Click the `+` icon** to add a new external tool. 148 | 5. **Fill in the following details:** 149 | - **Name:** `Ruff` 150 | - **Program:** Enter the path to your `poetry` executable that you noted earlier. 151 | - **Arguments:** `run ruff check --fix $FilePathRelativeToProjectRoot$` 152 | - **Working directory:** `$ProjectFileDir$` 153 | 6. **Click `OK`** to save the configuration. 154 | 7. **To run Ruff,** right-click on a file or directory in the project view, select `External Tools`, and then select `Ruff`. 155 | 156 | ### Pull Request 157 | 158 | When you're finished with the changes, create a pull request, also known as a PR. 159 | - Don't forget to link the PR to the issue if you are solving one. 160 | - As you update your PR and apply changes, mark each conversation as resolved. 161 | - If you run into any merge issues, check out this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues. 
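
### Docstring example

For reference, here is a minimal sketch of a docstring written in the Sphinx format described in the Code style section above. The function name, parameters, and body below are hypothetical and only serve to illustrate the docstring layout; they are not part of quinn's actual API:

```python
from pyspark.sql import DataFrame


def collect_column(df: DataFrame, col_name: str) -> list:
    """Collect the values of a single DataFrame column into a Python list.

    :param df: the DataFrame to read the column from
    :type df: DataFrame
    :param col_name: the name of the column to collect
    :type col_name: str
    :raises ValueError: if ``col_name`` is not a column of ``df``
    :return: the values of ``col_name`` as a list
    :rtype: list
    """
    # Fail fast with a clear error instead of an opaque Spark analysis exception.
    if col_name not in df.columns:
        raise ValueError(f"Column {col_name!r} not found in DataFrame")
    # Unwrap the Row objects so the caller gets plain Python values.
    return [row[0] for row in df.select(col_name).collect()]
```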
162 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for describing the origin of the Work and 141 | reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # COMMON CLI COMMANDS FOR DEVELOPMENT 2 | 3 | .PHONY: install_test 4 | install_test: 5 | @poetry install --with=development,testing 6 | 7 | .PHONY: install_deps 8 | install_deps: 9 | @poetry install --with=development,linting,testing,docs 10 | 11 | .PHONY: update_deps 12 | update_deps: 13 | @poetry update --with=development,linting,testing,docs 14 | 15 | .PHONY: test 16 | test: 17 | @poetry run pytest tests 18 | 19 | .PHONY: lint 20 | lint: 21 | @poetry run ruff check --fix quinn 22 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | """Runs benchmarks on quinn functions.""" 15 | -------------------------------------------------------------------------------- /benchmarks/benchmark_column_performance.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | import json 17 | import timeit 18 | from pathlib import Path 19 | 20 | 21 | def auto_timeit( 22 | stmt: str = "pass", 23 | setup: str = "pass", 24 | min_runtime_seconds: int = 2, 25 | ) -> list[float]: 26 | """Automatically determine the number of runs needed to reach a minimum total runtime.""" 27 | min_runs = 5 28 | print(f"Running {stmt} 1 time...") 29 | t = timeit.repeat(stmt, setup, repeat=1, number=1) 30 | 31 | print(f"First run: {t[0]:.2f} seconds") 32 | if t[0] >= min_runtime_seconds: 33 | return t 34 | 35 | expected_runs_needed = int((min_runtime_seconds // t[0]) + 1) 36 | if expected_runs_needed < min_runs: 37 | expected_runs_needed = min_runs 38 | 39 | expected_runtime = t[0] * expected_runs_needed 40 | print(f"Running {stmt} {expected_runs_needed} times.") 41 | print(f"Expected runtime: {expected_runtime:.2f} seconds...") 42 | return timeit.repeat(stmt, setup, repeat=expected_runs_needed, number=1) 43 | 44 | 45 | def get_result( 46 | test_name: str, 47 | dataset: dict, 48 | expr: str, 49 | min_runtime_seconds: int, 50 | ) -> None: 51 | """Run a test and save the results to a file.""" 52 | setup = f"""import timeit 53 | import pyspark.sql.functions as F 54 | from pyspark.sql import DataFrame, SparkSession 55 | builder = ( 56 | SparkSession.builder.appName("MyApp") 57 | .config("spark.executor.memory", "10G") 58 | .config("spark.driver.memory", "25G") 59 | .config("spark.sql.shuffle.partitions", "2") 60 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") 61 | ) 62 | spark = builder.getOrCreate() 63 | {dataset['name']} = spark.read.parquet('benchmarks/data/mvv_{dataset['name']}') 64 | """ 65 | stmt = expr.replace("df", dataset["name"]) 66 | result = auto_timeit(stmt, setup, min_runtime_seconds) 67 | 68 | summary = { 69 | "test_name": test_name, 70 | "dataset": dataset["name"], 71 | "dataset_size": dataset["size"], 72 | "runtimes": result, 73 | } 74 | 75 | result_path = f"results/{test_name}_{dataset['name']}.json" 76 | with Path(__file__).parent.joinpath(result_path).open(mode="w") as f: 77 | json.dump(summary, f, indent=4) 78 | 79 | 80 | config = { 81 | "toPandas": {"expr": "list(df.select('mvv').toPandas()['mvv'])"}, 82 | "flatmap": {"expr": "df.select('mvv').rdd.flatMap(lambda x: x).collect()"}, 83 | "map": {"expr": "df.select('mvv').rdd.map(lambda row : row[0]).collect()"}, 84 | "collectlist": {"expr": "[row[0] for row in df.select('mvv').collect()]"}, 85 | "localIterator": {"expr": "[r[0] for r in df.select('mvv').toLocalIterator()]"}, 86 | } 87 | 88 | 89 | DATASETS = { 90 | "large": {"name": "large", "size": 100_000_000, "min_runtime_seconds": 1200}, 91 | "medium": {"name": "medium", "size": 10_000_000, "min_runtime_seconds": 360}, 92 | "small": {"name": "small", "size": 100_000, "min_runtime_seconds": 20}, 93 | "xsmall": {"name": "xsmall", "size": 1_000, "min_runtime_seconds": 20}, 94 | } 95 | 96 | for test_name, test_config in config.items(): 97 | print(f"======================{test_name}======================") 98 | for dataset_name in DATASETS: 99 | dataset = DATASETS[dataset_name] 100 | 
print(f"TESTING DATASET {dataset['name']} [n={dataset['size']:,}]") 101 | get_result( 102 | test_name=test_name, 103 | dataset=dataset, 104 | expr=test_config["expr"], 105 | min_runtime_seconds=dataset["min_runtime_seconds"], 106 | ) 107 | -------------------------------------------------------------------------------- /benchmarks/create_benchmark_df.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | import random 17 | from typing import TYPE_CHECKING, Optional 18 | 19 | from pyspark.sql import SparkSession 20 | from pyspark.sql import functions as F # noqa: N812 21 | 22 | if TYPE_CHECKING: 23 | from pyspark.sql.dataframe import DataFrame 24 | 25 | 26 | def generate_df(spark: SparkSession, n: int) -> DataFrame: 27 | """Generate a dataframe with a monotonically increasing id column and a random count column.""" 28 | count_vals = [(random.randint(1, 10),) for _ in range(n)] # noqa: S311 29 | output: DataFrame = ( 30 | spark.createDataFrame(count_vals, schema=["count"]) 31 | .withColumn("mvv", F.monotonically_increasing_id()) 32 | .select("mvv", "count") 33 | ) 34 | return output 35 | 36 | 37 | def save_benchmark_df( 38 | spark: SparkSession, 39 | n: int, 40 | data_label: str, 41 | repartition_n: Optional[int] = None, 42 | ) -> None: 43 | """Save a benchmark dataframe to disk.""" 44 | print(f"Generating benchmark df for n={n}") 45 | benchmark_df = generate_df(spark, n) 46 | 47 | if repartition_n is not None: 48 | benchmark_df = benchmark_df.repartition(repartition_n) 49 | 50 | benchmark_df.write.mode("overwrite").parquet(f"benchmarks/data/mvv_{data_label}") 51 | 52 | 53 | if __name__ == "__main__": 54 | xsmall_n = 1_000 55 | small_n = 100_000 56 | medium_n = 10_000_000 57 | large_n = 100_000_000 58 | 59 | builder = ( 60 | SparkSession.builder.appName("MyApp") 61 | .config("spark.executor.memory", "20G") 62 | .config("spark.driver.memory", "25G") 63 | .config("spark.sql.shuffle.partitions", "2") 64 | ) 65 | 66 | spark = builder.getOrCreate() 67 | save_benchmark_df(spark, xsmall_n, "xsmall", 1) 68 | save_benchmark_df(spark, small_n, "small", 1) 69 | save_benchmark_df(spark, medium_n, "medium", 1) 70 | save_benchmark_df(spark, large_n, "large", 4) 71 | -------------------------------------------------------------------------------- /benchmarks/results/collectlist_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "collectlist", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 129.20805395802017, 7 | 126.53530854202108, 8 | 129.99196012501488, 9 | 130.67483216698747, 10 | 126.88453424998443, 11 | 
139.92618966597365, 12 | 141.54181875000359, 13 | 136.65802104197792, 14 | 129.75925845801248 15 | ] 16 | } -------------------------------------------------------------------------------- /benchmarks/results/collectlist_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "collectlist", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 11.525758125004359, 7 | 11.570582416985417, 8 | 11.951778874994488, 9 | 12.054943958006334, 10 | 11.80891958301072, 11 | 11.82376299999305, 12 | 11.762349167023785, 13 | 11.46418624999933, 14 | 11.415677415992832, 15 | 11.75218004200724, 16 | 11.825585749989841, 17 | 11.855922749993624, 18 | 11.871351749985479, 19 | 11.430663749983069, 20 | 11.910512792004738, 21 | 12.044869125005789, 22 | 12.068957833980676, 23 | 11.957036042003892, 24 | 11.966440916992724, 25 | 11.30719208298251, 26 | 11.919239667011425, 27 | 11.903133832995081, 28 | 11.947826708987122, 29 | 11.717349375016056, 30 | 11.447638457990251, 31 | 11.941632540983846, 32 | 11.918223374988884, 33 | 11.805195124994498, 34 | 11.892586542002391, 35 | 12.127137292001862, 36 | 11.547379292023834, 37 | 11.897269000008237 38 | ] 39 | } -------------------------------------------------------------------------------- /benchmarks/results/collectlist_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "collectlist", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.12345570797333494, 7 | 0.1314004999876488, 8 | 0.12502691597910598, 9 | 0.12530479099950753, 10 | 0.12634062499273568, 11 | 0.12584854100714438, 12 | 0.12789558398071676, 13 | 0.12472200000775047, 14 | 0.1238887500076089, 15 | 0.13261420800699852, 16 | 0.12579454202204943, 17 | 0.13280487500014715, 18 | 0.12472591700498015, 19 | 0.12935254198964685, 20 | 0.12733795901294798, 21 | 0.1325765420042444, 22 | 0.12539891598862596, 23 | 0.12588458400568925, 24 | 0.12925195900606923, 25 | 0.12419299999601208, 26 | 0.12389950000215322, 27 | 0.12363229200127535, 28 | 0.13055249999160878, 29 | 0.12383425000007264, 30 | 0.12416162498993799, 31 | 0.12477433300227858, 32 | 0.12346441601403058, 33 | 0.12381883300258778, 34 | 0.12394650001078844, 35 | 0.12412324998877011, 36 | 0.12496170899248682, 37 | 0.12474145801388659, 38 | 0.1277002909919247, 39 | 0.12949495899374597, 40 | 0.13194816702161916, 41 | 0.12390154198510572, 42 | 0.12345825001830235, 43 | 0.1250534169957973, 44 | 0.12404837500071153, 45 | 0.12392250000266358, 46 | 0.12349095800891519, 47 | 0.12369754200335592, 48 | 0.12301004098844714, 49 | 0.12339300001622178, 50 | 0.12399666698183864, 51 | 0.12378454199642874, 52 | 0.12521987498621456, 53 | 0.12401437500375323, 54 | 0.1271002079884056, 55 | 0.12385850001010112, 56 | 0.12461154101765715, 57 | 0.12916650000261143, 58 | 0.1409682499943301, 59 | 0.1362035000056494, 60 | 0.13603016699198633, 61 | 0.13636175001738593, 62 | 0.14432266599033028, 63 | 0.12358908398891799, 64 | 0.12381654200726189, 65 | 0.13095516699831933, 66 | 0.12406137501238845, 67 | 0.12393783399602398, 68 | 0.12295912500121631, 69 | 0.12365083300392143, 70 | 0.12374750000890344, 71 | 0.12418616699869744, 72 | 0.12332833299296908, 73 | 0.12342066699056886, 74 | 0.12364624999463558, 75 | 0.12354191701160744, 76 | 0.12355058299726807, 77 | 0.12401170801604167, 78 | 0.12359929201193154, 79 | 0.12448004202451557, 80 | 0.12446349998936057, 81 | 0.12385987499146722, 82 | 0.1240622499899473, 83 | 
0.12475716599146836, 84 | 0.13379766599973664, 85 | 0.13572154100984335, 86 | 0.13705558300716802, 87 | 0.14468491700245067, 88 | 0.15964958298718557, 89 | 0.12460808298783377, 90 | 0.12353074998827651, 91 | 0.12293012501322664, 92 | 0.12347437502467074, 93 | 0.12478362501133233, 94 | 0.1258309579861816, 95 | 0.12435858300887048, 96 | 0.12403041598736309, 97 | 0.12377791601466015, 98 | 0.12300579200382344, 99 | 0.12372366600902751, 100 | 0.12322549999225885, 101 | 0.12399170798016712, 102 | 0.12390183401294053, 103 | 0.1246394159970805, 104 | 0.12383850000333041, 105 | 0.1230427919945214, 106 | 0.12371399998664856, 107 | 0.12325983299524523, 108 | 0.1240181670000311, 109 | 0.12403816697769798, 110 | 0.12381629200535826, 111 | 0.1249765410029795, 112 | 0.12391612501232885, 113 | 0.12437091598985717, 114 | 0.12400683399755508, 115 | 0.1270715839928016, 116 | 0.13924929199856706, 117 | 0.1370136250043288, 118 | 0.13644420797936618, 119 | 0.14451066602487117, 120 | 0.1584999579936266, 121 | 0.12467087499680929, 122 | 0.12404358299681917, 123 | 0.12400129198795184, 124 | 0.12341158301569521, 125 | 0.12442779101547785, 126 | 0.12422066699946299, 127 | 0.1319297500012908, 128 | 0.12391049999860115, 129 | 0.12305962498066947, 130 | 0.12398437500814907, 131 | 0.1236839999910444, 132 | 0.12356999999610707, 133 | 0.12458725000033155, 134 | 0.12558270900626667, 135 | 0.1299470840021968, 136 | 0.12408704101108015, 137 | 0.12365183301153593, 138 | 0.12331187500967644, 139 | 0.12352816699421965, 140 | 0.12367491700570099, 141 | 0.12497841700678691, 142 | 0.12355091699282639, 143 | 0.12515654202434234, 144 | 0.1254462499928195, 145 | 0.12387974999728613, 146 | 0.1302125419897493, 147 | 0.12347279200912453, 148 | 0.12541570799658075, 149 | 0.12406129197916016, 150 | 0.12351383300847374, 151 | 0.12356224999530241, 152 | 0.12350362501456402, 153 | 0.12993870800710283, 154 | 0.13651637500151992, 155 | 0.13703787501435727, 156 | 0.136697332985932, 157 | 0.13669541699346155, 158 | 0.15247504197759554, 159 | 0.12481941698933952, 160 | 0.12601887501659803, 161 | 0.12487887497991323, 162 | 0.12363970800652169, 163 | 0.12311479201889597, 164 | 0.12400833301944658, 165 | 0.12335287500172853, 166 | 0.12409445899538696 167 | ] 168 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 36.188880041998345, 7 | 35.77120683397516, 8 | 35.89365566600463, 9 | 35.60720691701863, 10 | 35.81423420799547, 11 | 35.66271516599227, 12 | 35.84787024999969, 13 | 35.752700749988435, 14 | 36.1162334579858, 15 | 35.668734874983784, 16 | 35.496447625017026, 17 | 35.78953500001808, 18 | 35.481063749990426, 19 | 35.545604249986354, 20 | 35.45867395901587, 21 | 35.56992366700433, 22 | 35.742496374994516, 23 | 35.539746249996824, 24 | 35.67015320900828, 25 | 35.719724208000116, 26 | 35.8916146249976, 27 | 35.6827434169827, 28 | 35.925275417015655, 29 | 35.92435587500222, 30 | 35.622160916012945, 31 | 35.60375379101606, 32 | 35.69027008401463, 33 | 36.12705849998747, 34 | 36.063100625004154, 35 | 35.65569358400535, 36 | 35.75822524999967, 37 | 35.81311866699252, 38 | 35.969940707989736 39 | ] 40 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_medium.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 12.45473395800218, 7 | 12.484648000012385, 8 | 12.400262917013606, 9 | 12.440737958007958, 10 | 12.452081541006919, 11 | 12.45336887499434, 12 | 12.473071416985476, 13 | 12.459413582982961, 14 | 12.544886957999552, 15 | 12.565210083004786, 16 | 12.474220750009408, 17 | 12.518661708018044, 18 | 12.42703645900474, 19 | 12.512266999983694, 20 | 12.47433920900221, 21 | 12.494368834013585, 22 | 12.473423833027482, 23 | 12.557817583001452, 24 | 12.480229584005428, 25 | 12.469799874990713, 26 | 12.42109241601429, 27 | 12.525904500012984, 28 | 12.395361124974443, 29 | 12.4593050830008, 30 | 12.431161542015616, 31 | 12.513594541989733, 32 | 12.510616583022056, 33 | 12.537003458011895, 34 | 12.404833499982487 35 | ] 36 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.17894395801704377, 7 | 0.18047670801752247, 8 | 0.1778973330219742, 9 | 0.1768727089802269, 10 | 0.18268641698523425, 11 | 0.18213908301549964, 12 | 0.18128341698320583, 13 | 0.1818746250064578, 14 | 0.1774590410059318, 15 | 0.1772789589886088, 16 | 0.17803700000513345, 17 | 0.1780551660049241, 18 | 0.17709937499603257, 19 | 0.17629362500156276, 20 | 0.17730016700807028, 21 | 0.17746695800451562, 22 | 0.18029112499789335, 23 | 0.18132383402553387, 24 | 0.17189087497536093, 25 | 0.18303454198758118, 26 | 0.17968620802275836, 27 | 0.1802715000230819, 28 | 0.17883554100990295, 29 | 0.17680829201708548, 30 | 0.18031212501227856, 31 | 0.1771631249866914, 32 | 0.1782566249894444, 33 | 0.1816232909914106, 34 | 0.1801042500010226, 35 | 0.18101133400341496, 36 | 0.17983466701116413, 37 | 0.17949583401787095, 38 | 0.17818658400210552, 39 | 0.17664745802176185, 40 | 0.1771139999909792, 41 | 0.17674108300707303, 42 | 0.17973291699308902, 43 | 0.1802107500261627, 44 | 0.1762191250163596, 45 | 0.17706849999376573, 46 | 0.17918199999257922, 47 | 0.17786145798163489, 48 | 0.17716412502340972, 49 | 0.1766410409763921, 50 | 0.1768924999923911, 51 | 0.1775729159999173, 52 | 0.17990487499628216, 53 | 0.18075749999843538, 54 | 0.18906725000124425, 55 | 0.17940695802099071, 56 | 0.17809845801093616, 57 | 0.1767636250006035, 58 | 0.1777554590080399, 59 | 0.17755454202415422, 60 | 0.18041850000736304, 61 | 0.1767314170137979, 62 | 0.18011308400309645, 63 | 0.1974650830088649, 64 | 0.17981416700058617, 65 | 0.17755545899854042, 66 | 0.1817649999866262, 67 | 0.18100174999563023, 68 | 0.17874133397708647, 69 | 0.17930370901012793, 70 | 0.1778174999926705, 71 | 0.17907295800978318, 72 | 0.18477404201985337, 73 | 0.17951629200251773, 74 | 0.17831079100142233, 75 | 0.17696441698353738, 76 | 0.17663983299280517, 77 | 0.18076049999217503, 78 | 0.18051687499973923, 79 | 0.1794402500090655, 80 | 0.17824170799576677, 81 | 0.17966912500560284, 82 | 0.18000287501490675, 83 | 0.17883608298143372, 84 | 0.1772286659979727, 85 | 0.17074179102201015, 86 | 0.1788426669954788, 87 | 0.17712083397782408, 88 | 0.17752358398865908, 89 | 0.17786058300407603, 90 | 0.17938104100176133, 91 | 0.17789508399437182, 92 | 0.17982129100710154, 93 | 0.17926004200126044, 94 | 0.17941241699736565, 95 | 0.17706845901557244, 96 | 
0.17756150002242066, 97 | 0.17965420801192522, 98 | 0.17814670799998567, 99 | 0.17619208298856393, 100 | 0.17750854199402966, 101 | 0.17663995901239105, 102 | 0.17970875001628883, 103 | 0.1802513329894282, 104 | 0.17710229099611752, 105 | 0.1773068750044331, 106 | 0.17719570797635242, 107 | 0.1766349590034224, 108 | 0.17752995900809765, 109 | 0.17610691700247116, 110 | 0.17689754100865684, 111 | 0.17640399999800138, 112 | 0.17956375001813285, 113 | 0.18023837500368245, 114 | 0.17701083299471065, 115 | 0.17760500000440516, 116 | 0.17637616698630154, 117 | 0.18168737497762777, 118 | 0.18223700000089593 119 | ] 120 | } -------------------------------------------------------------------------------- /benchmarks/results/flatmap_xsmall.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "flatmap", 3 | "dataset": "xsmall", 4 | "dataset_size": 1000, 5 | "runtimes": [ 6 | 0.06271020899293944, 7 | 0.0635450420086272, 8 | 0.06335712500731461, 9 | 0.06321512500289828, 10 | 0.062088957987725735, 11 | 0.06404470797860995, 12 | 0.0631327080191113, 13 | 0.06216233299346641, 14 | 0.06263370800297707, 15 | 0.06245729100191966, 16 | 0.06296845900942571, 17 | 0.06254270899808034, 18 | 0.06320150001556613, 19 | 0.06327195902122185, 20 | 0.06219645799137652, 21 | 0.06233966600848362, 22 | 0.06254474999150261, 23 | 0.06455795798683539, 24 | 0.06295058300020173, 25 | 0.06280325000989251, 26 | 0.06276320898905396, 27 | 0.06324816701817326, 28 | 0.06330595898907632, 29 | 0.06303054100135341, 30 | 0.06244050001259893, 31 | 0.06298899999819696, 32 | 0.06360708401189186, 33 | 0.06316616598633118, 34 | 0.06316054199123755, 35 | 0.06246474999352358, 36 | 0.06387491698842496, 37 | 0.05970104201696813, 38 | 0.0637037499982398, 39 | 0.05698100000154227, 40 | 0.062209665979025885, 41 | 0.062292166985571384, 42 | 0.06463962499401532, 43 | 0.0631676249904558, 44 | 0.06258800000068732, 45 | 0.07127483398653567, 46 | 0.062426417018286884, 47 | 0.062181249988498166, 48 | 0.0633687500085216, 49 | 0.06301658399752341, 50 | 0.062204457994084805, 51 | 0.0635230419866275, 52 | 0.06324341602157801, 53 | 0.06341391601017676, 54 | 0.06350133300293237, 55 | 0.06179737497586757, 56 | 0.06287929098471068, 57 | 0.06224550001206808, 58 | 0.06272991601144895, 59 | 0.06310937501257285, 60 | 0.061683125008130446, 61 | 0.06274387502344325, 62 | 0.05475016700802371, 63 | 0.06346645799931139, 64 | 0.06321550000575371, 65 | 0.06312870798865333, 66 | 0.06330529201659374, 67 | 0.057836541993310675, 68 | 0.06378749999566935, 69 | 0.062172083009500057, 70 | 0.0622389999916777, 71 | 0.06221112501225434, 72 | 0.06303629197645932, 73 | 0.061823541997000575, 74 | 0.06333087501116097, 75 | 0.06266720799612813, 76 | 0.062348166975425556, 77 | 0.061840707989176735, 78 | 0.06385124998632818, 79 | 0.06369624999933876, 80 | 0.06562579199089669, 81 | 0.05890387500403449, 82 | 0.0646380000107456, 83 | 0.06226920799235813, 84 | 0.06261112500214949, 85 | 0.06252599999425001, 86 | 0.06316274998243898, 87 | 0.06251712498487905, 88 | 0.06276937498478219, 89 | 0.06257433400605805, 90 | 0.0631431249785237, 91 | 0.06309254097868688, 92 | 0.06353920898982324, 93 | 0.06316645900369622, 94 | 0.06292100000428036, 95 | 0.06184179100091569, 96 | 0.06192958299652673, 97 | 0.06376187500427477, 98 | 0.06397637500776909, 99 | 0.060634541005128995, 100 | 0.05874520802171901, 101 | 0.06295916601084173, 102 | 0.06267850002041087, 103 | 0.06178716701106168, 104 | 0.06267579199629836, 105 | 0.06213916698470712, 106 | 
0.06340775001444854, 107 | 0.06389649998163804, 108 | 0.06311183399520814, 109 | 0.06350162500166334, 110 | 0.06177533301524818, 111 | 0.06338916599634103, 112 | 0.06310429199947976, 113 | 0.061465624981792644, 114 | 0.06373420800082386, 115 | 0.06199354201089591, 116 | 0.06215308399987407, 117 | 0.06257812498370185, 118 | 0.0633107080066111, 119 | 0.06273208401398733, 120 | 0.0631805420271121, 121 | 0.06331116700312123, 122 | 0.06246858401573263, 123 | 0.06368912500329316, 124 | 0.06410987497656606, 125 | 0.06336450000526384, 126 | 0.06258158400305547, 127 | 0.06312241600244306, 128 | 0.06379858302534558, 129 | 0.06289729100535624, 130 | 0.06289037500391714, 131 | 0.06203354100580327, 132 | 0.062200749991461635, 133 | 0.06217162500252016, 134 | 0.06172491700272076, 135 | 0.06542712499503978, 136 | 0.06341266701929271, 137 | 0.06175950000761077, 138 | 0.06402004201663658, 139 | 0.06260204099817201, 140 | 0.06249041701084934, 141 | 0.06243433299823664, 142 | 0.0627741249918472, 143 | 0.06282150000333786, 144 | 0.062061250006081536, 145 | 0.06406637499458157, 146 | 0.06280062498990446, 147 | 0.062304459017468616, 148 | 0.06356004200642928, 149 | 0.06283041698043235, 150 | 0.061882417008746415, 151 | 0.06290529097896069, 152 | 0.06289950001519173, 153 | 0.06333416598499753, 154 | 0.0628599580086302, 155 | 0.06355745901237242, 156 | 0.06196416600141674, 157 | 0.06188041699351743, 158 | 0.06376770898350514, 159 | 0.06300670900964178, 160 | 0.06230516699724831, 161 | 0.06191929100896232, 162 | 0.06300550000742078, 163 | 0.06300841600750573, 164 | 0.06200337500195019, 165 | 0.06354029200156219, 166 | 0.0659458750160411, 167 | 0.05894091600202955, 168 | 0.06434062501648441, 169 | 0.0626857919851318, 170 | 0.06448016598005779, 171 | 0.061678500002017245, 172 | 0.06217950000427663, 173 | 0.06420358398463577, 174 | 0.06217670798650943, 175 | 0.06327091698767617, 176 | 0.06339983301586471, 177 | 0.0625319580140058, 178 | 0.06349887500982732, 179 | 0.06309629100724123, 180 | 0.061857416993007064, 181 | 0.06333012497634627, 182 | 0.062418874993454665, 183 | 0.06239558299421333, 184 | 0.06274174997815862, 185 | 0.06306512499577366, 186 | 0.06182762500247918, 187 | 0.06269933300791308, 188 | 0.06308016699040309, 189 | 0.06277629200485535, 190 | 0.06207137499586679, 191 | 0.06244466599309817, 192 | 0.0633198749856092, 193 | 0.06167837500106543, 194 | 0.06364629199379124, 195 | 0.06294808301026933, 196 | 0.05437820800580084, 197 | 0.06281262502307072, 198 | 0.06303454199223779, 199 | 0.06329387499135919, 200 | 0.06553379198885523, 201 | 0.06447225000010803, 202 | 0.0625169170089066, 203 | 0.06290137500036508, 204 | 0.06351570802507922, 205 | 0.06383208397892304, 206 | 0.06378558400319889, 207 | 0.06266166700515896, 208 | 0.06600916699972004, 209 | 0.065376666985685, 210 | 0.06304545799503103, 211 | 0.06368670801748522, 212 | 0.06373416702263057, 213 | 0.06324483299977146, 214 | 0.0623699999996461, 215 | 0.06250420899596065, 216 | 0.0624531659996137, 217 | 0.061555457999929786, 218 | 0.06325995799852535, 219 | 0.06258933400386013, 220 | 0.06306745801703073, 221 | 0.06318608298897743, 222 | 0.06250525001087226, 223 | 0.06293795901001431, 224 | 0.06285270798252895, 225 | 0.06319583300501108, 226 | 0.06153624999569729, 227 | 0.06166912498883903, 228 | 0.06316866699489765, 229 | 0.0639053330232855, 230 | 0.06248512500314973, 231 | 0.06289591599488631, 232 | 0.06300166697474197, 233 | 0.06208249999326654, 234 | 0.06384591700043529, 235 | 0.06281524998485111, 236 | 0.06237158298608847, 237 | 0.062482915993314236, 
238 | 0.06318995798937976, 239 | 0.06263566701090895, 240 | 0.06326208400423639, 241 | 0.06235483300406486, 242 | 0.062092083011521026, 243 | 0.062356249982258305, 244 | 0.06236979100503959, 245 | 0.06236908299615607, 246 | 0.06264866699348204, 247 | 0.0623248330084607, 248 | 0.06337700001313351, 249 | 0.06261341698700562, 250 | 0.06188200000906363, 251 | 0.06209420898812823, 252 | 0.06319358400651254, 253 | 0.06200754200108349, 254 | 0.06214087500120513, 255 | 0.06296975002624094, 256 | 0.06244700000388548, 257 | 0.062316083000041544, 258 | 0.06276762500056066, 259 | 0.06344370800070465, 260 | 0.06250591701245867, 261 | 0.06244683300610632, 262 | 0.062258125020889565, 263 | 0.0626247079926543, 264 | 0.06162374999257736, 265 | 0.06347287498647347, 266 | 0.06327266700100154, 267 | 0.06206795800244436, 268 | 0.06352941697696224, 269 | 0.06299666600534692, 270 | 0.06262033397797495, 271 | 0.0635691249917727, 272 | 0.06253383302828297, 273 | 0.06278875001589768, 274 | 0.055675582989351824, 275 | 0.06303029198898003, 276 | 0.06262429201160558, 277 | 0.06254154100315645, 278 | 0.06339245798881166, 279 | 0.06276429200079292, 280 | 0.06166370800929144, 281 | 0.06227633301750757, 282 | 0.06254345900379121, 283 | 0.06165145800332539, 284 | 0.06242929198197089, 285 | 0.06382783301523887, 286 | 0.06225416698725894, 287 | 0.06241558401961811, 288 | 0.06248187500750646, 289 | 0.06359933299245313, 290 | 0.06395112501922995, 291 | 0.06229025000357069, 292 | 0.06295412499457598, 293 | 0.06498241701046936, 294 | 0.06674179100082256, 295 | 0.07840916700661182, 296 | 0.06708204199094325, 297 | 0.061809292004909366, 298 | 0.06293816599645652, 299 | 0.06328933301847428, 300 | 0.06201166598475538, 301 | 0.06213245901744813, 302 | 0.062167500000214204, 303 | 0.06238087499514222, 304 | 0.06289520900463685, 305 | 0.06334875000175089, 306 | 0.06292608298826963, 307 | 0.06222179200267419, 308 | 0.06339583298540674, 309 | 0.06269570800941437, 310 | 0.06143370800418779, 311 | 0.0618649169919081, 312 | 0.06222004198934883, 313 | 0.06312449998222291, 314 | 0.06340012498549186, 315 | 0.06352550000883639, 316 | 0.06266974998288788, 317 | 0.0615700000198558, 318 | 0.06288187499740161, 319 | 0.06359874998452142, 320 | 0.06129166699247435, 321 | 0.06143262499244884, 322 | 0.06272533399169333 323 | ] 324 | } -------------------------------------------------------------------------------- /benchmarks/results/localIterator_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "localIterator", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 142.63744066699292, 7 | 144.66499787499197, 8 | 144.58708516601473, 9 | 143.8303821659938, 10 | 144.1865681670024, 11 | 142.28104958301992, 12 | 141.77062158400076, 13 | 142.2637243749923, 14 | 142.05179520900128 15 | ] 16 | } -------------------------------------------------------------------------------- /benchmarks/results/localIterator_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "localIterator", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 14.169408250018023, 7 | 14.201851833000546, 8 | 14.226777459000004, 9 | 14.27066791697871, 10 | 14.312426666991087, 11 | 14.300455041025998, 12 | 14.31601262500044, 13 | 14.306134959013434, 14 | 14.316025750013068, 15 | 14.288483624986839, 16 | 14.255477875005454, 17 | 14.252781917020911, 18 | 14.275479709001957, 19 | 14.253912209009286, 20 | 14.302826917002676, 
21 | 14.300289417005843, 22 | 14.241876915999455, 23 | 14.261930708016735, 24 | 14.304426707996754, 25 | 14.276426333002746, 26 | 14.25401162498747, 27 | 14.275975541997468, 28 | 14.250861790991621, 29 | 14.247211042005802, 30 | 14.321850750013255, 31 | 14.335214499995345 32 | ] 33 | } -------------------------------------------------------------------------------- /benchmarks/results/localIterator_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "localIterator", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.1572923339845147, 7 | 0.15610516601009294, 8 | 0.15729875001125038, 9 | 0.15489737497409806, 10 | 0.15635183299309574, 11 | 0.15574870799900964, 12 | 0.15605116699589416, 13 | 0.15766783300205134, 14 | 0.15673399998922832, 15 | 0.156182542006718, 16 | 0.155715875007445, 17 | 0.15624404099071398, 18 | 0.15578624999034218, 19 | 0.1647759170155041, 20 | 0.15573512500850484, 21 | 0.15536620799684897, 22 | 0.15588595799636096, 23 | 0.15540254101506434, 24 | 0.1557722500001546, 25 | 0.15479129197774455, 26 | 0.15707508299965411, 27 | 0.1547249169962015, 28 | 0.1563042080088053, 29 | 0.16178824999951757, 30 | 0.15583224999136291, 31 | 0.15531091700540856, 32 | 0.1602989160164725, 33 | 0.163350624992745, 34 | 0.15613983297953382, 35 | 0.15645695800776593, 36 | 0.15639954200014472, 37 | 0.15600724998512305, 38 | 0.15622070801327936, 39 | 0.1562975829874631, 40 | 0.1560622500255704, 41 | 0.15612291701836511, 42 | 0.1556804169958923, 43 | 0.1590131660050247, 44 | 0.15829995801323093, 45 | 0.15580970799783245, 46 | 0.1557230000034906, 47 | 0.15624499999103136, 48 | 0.15629954199539497, 49 | 0.15596645799814723, 50 | 0.15629758397699334, 51 | 0.156585292017553, 52 | 0.157058666984085, 53 | 0.15553504100535065, 54 | 0.15594116700231098, 55 | 0.15548545902129263, 56 | 0.1660226249950938, 57 | 0.15594999998575076, 58 | 0.1553985000064131, 59 | 0.15553233301034197, 60 | 0.1561224999895785, 61 | 0.1560029999818653, 62 | 0.15547154101659544, 63 | 0.1558478329970967, 64 | 0.15539704100228846, 65 | 0.15595591699820943, 66 | 0.15522774998680688, 67 | 0.1551085830142256, 68 | 0.15504537502420135, 69 | 0.1558237909921445, 70 | 0.15684320899890736, 71 | 0.15544366600806825, 72 | 0.15511745799449272, 73 | 0.15626950000296347, 74 | 0.1555634169781115, 75 | 0.16012516702176072, 76 | 0.15576475000125356, 77 | 0.1555737080052495, 78 | 0.15493974997662008, 79 | 0.15651558301760815, 80 | 0.15583595901262015, 81 | 0.1553781669936143, 82 | 0.1554269159969408, 83 | 0.1567337499873247, 84 | 0.1555281670007389, 85 | 0.15598245899309404, 86 | 0.15549199999077246, 87 | 0.15640129099483602, 88 | 0.15553787499084137, 89 | 0.15569637500448152, 90 | 0.15516895800828934, 91 | 0.15679262499907054, 92 | 0.15679995800019242, 93 | 0.15697504099807702, 94 | 0.16654237499460578, 95 | 0.15597804100252688, 96 | 0.16012933300225995, 97 | 0.15807808400131762, 98 | 0.15614604199072346, 99 | 0.15587529199547134, 100 | 0.15717116600717418, 101 | 0.1560669590253383, 102 | 0.15600720801739953, 103 | 0.15588104201015085, 104 | 0.1548118340142537, 105 | 0.1565489580098074, 106 | 0.15605266598868184, 107 | 0.15582233399618417, 108 | 0.1555045830027666, 109 | 0.15628379202098586, 110 | 0.15611258300486952, 111 | 0.1557881670014467, 112 | 0.15488887502579018, 113 | 0.15548566699726507, 114 | 0.1555823750095442, 115 | 0.15565433399751782, 116 | 0.15696658301749267, 117 | 0.15633958400576375, 118 | 0.15659804100869223, 119 | 0.15663466698606499, 120 | 
0.1551692089997232, 121 | 0.1610780830087606, 122 | 0.15597808299935423, 123 | 0.1559513749962207, 124 | 0.15642320801271126, 125 | 0.15623020901693963, 126 | 0.15619325000443496, 127 | 0.15604158301721327, 128 | 0.15494554198812693, 129 | 0.15633858399814926, 130 | 0.15586795800481923, 131 | 0.15480283298529685, 132 | 0.15543520799838006, 133 | 0.16281329200137407 134 | ] 135 | } -------------------------------------------------------------------------------- /benchmarks/results/map_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 38.044695958000375, 7 | 37.88741087500239, 8 | 37.893524833983975, 9 | 38.120276041998295, 10 | 38.02909308302333, 11 | 37.9263411249849, 12 | 37.68712725001387, 13 | 37.93799850001233, 14 | 38.03957070899196, 15 | 38.126094834005926, 16 | 37.762346417002846, 17 | 38.304923457995756, 18 | 38.108259917004034, 19 | 38.04698508299771, 20 | 37.922059125005035, 21 | 37.88537779197213, 22 | 38.373752999992575, 23 | 37.6934795420093, 24 | 38.058965083007934, 25 | 37.86218554197694, 26 | 38.01557258400135, 27 | 38.16043354201247, 28 | 38.424862250016304, 29 | 38.14441895799246, 30 | 38.030545084009646, 31 | 38.190908041986404, 32 | 37.92010895800195, 33 | 38.539197249978315, 34 | 38.05922462500166, 35 | 38.042172666988336, 36 | 38.06336879200535, 37 | 37.96962983297999 38 | ] 39 | } -------------------------------------------------------------------------------- /benchmarks/results/map_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 6 | 13.511881792015629, 7 | 13.617772792000324, 8 | 13.519385208026506, 9 | 13.490683124982752, 10 | 13.626960124995094, 11 | 13.508057041995926, 12 | 13.502069417008897, 13 | 13.563101709005423, 14 | 13.507099166017724, 15 | 13.552681833010865, 16 | 13.591525916999672, 17 | 13.551621666003484, 18 | 13.518412290984998, 19 | 13.451721041987184, 20 | 13.499396291008452, 21 | 13.614300333021674, 22 | 13.563594542007195, 23 | 13.463782207982149, 24 | 13.588725749985315, 25 | 13.636522250017151, 26 | 13.591557374980766, 27 | 13.512941416993272, 28 | 13.552888249978423, 29 | 13.606033250020118, 30 | 13.565899540990358, 31 | 13.561953833006555, 32 | 13.635592833015835 33 | ] 34 | } -------------------------------------------------------------------------------- /benchmarks/results/map_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "small", 4 | "dataset_size": 100000, 5 | "runtimes": [ 6 | 0.1891134589968715, 7 | 0.19210041698534042, 8 | 0.1904449590074364, 9 | 0.19111270899884403, 10 | 0.19105345799471252, 11 | 0.1876772920077201, 12 | 0.19078433298273012, 13 | 0.18876975000603124, 14 | 0.19013570799143054, 15 | 0.18736258399439976, 16 | 0.1916751669778023, 17 | 0.18777787499129772, 18 | 0.19012708301306702, 19 | 0.18837083299877122, 20 | 0.18670304099214263, 21 | 0.19053699998767115, 22 | 0.18963375000748783, 23 | 0.19170491700060666, 24 | 0.1877893749915529, 25 | 0.18843620899133384, 26 | 0.1901267080102116, 27 | 0.18844208301743492, 28 | 0.1905399159877561, 29 | 0.19096729197190143, 30 | 0.19023104198276997, 31 | 0.18846241701976396, 32 | 0.19002441599150188, 33 | 0.1901898330252152, 34 | 0.19257025001570582, 35 | 0.18755800000508316, 36 | 0.19046029102173634, 
37 | 0.18996654197690077, 38 | 0.19062725000549108, 39 | 0.19637816699105315, 40 | 0.18948379199719056, 41 | 0.19232058300985955, 42 | 0.19094045800738968, 43 | 0.19090891699306667, 44 | 0.18941837499733083, 45 | 0.18869641597848386, 46 | 0.1972927499737125, 47 | 0.19141366600524634, 48 | 0.19254774998989888, 49 | 0.1912544580118265, 50 | 0.18878950001089834, 51 | 0.18837604200234637, 52 | 0.190177834010683, 53 | 0.19098016701173037, 54 | 0.1886746659874916, 55 | 0.18773966701701283, 56 | 0.19070679100695997, 57 | 0.18930591698153876, 58 | 0.1896120419842191, 59 | 0.18997891701292247, 60 | 0.18771916697733104, 61 | 0.19129891600459814, 62 | 0.19054650000180118, 63 | 0.19020145799731836, 64 | 0.19060395800624974, 65 | 0.18762170898844488, 66 | 0.1898857920023147, 67 | 0.19109079102054238, 68 | 0.19092062499839813, 69 | 0.1904410419811029, 70 | 0.18772612500470132, 71 | 0.1871835410129279, 72 | 0.19085220800479874, 73 | 0.19084308302262798, 74 | 0.18706516700331122, 75 | 0.18756166700040922, 76 | 0.18911158401169814, 77 | 0.1875695830094628, 78 | 0.19037070800550282, 79 | 0.19025616699946113, 80 | 0.18603216600604355, 81 | 0.18723708399920724, 82 | 0.18987504197866656, 83 | 0.191273666016059, 84 | 0.18810095801018178, 85 | 0.19217291599488817, 86 | 0.19653324998216704, 87 | 0.1821762080071494, 88 | 0.1908734999888111, 89 | 0.19063633400946856, 90 | 0.19348950000130571, 91 | 0.19051245800801553, 92 | 0.19168770901160315, 93 | 0.19086062497808598, 94 | 0.19057970799622126, 95 | 0.19091025000670925, 96 | 0.19122595799854025, 97 | 0.18856474998756312, 98 | 0.19025908398907632, 99 | 0.1902516660047695, 100 | 0.1871023330022581, 101 | 0.18909866700414568, 102 | 0.19003108300967142, 103 | 0.19090791599592194, 104 | 0.1916295829869341, 105 | 0.19268283300334588, 106 | 0.19054812501417473, 107 | 0.18951087500317954, 108 | 0.1900284580187872, 109 | 0.1899386669974774, 110 | 0.18759595800656825 111 | ] 112 | } -------------------------------------------------------------------------------- /benchmarks/results/map_xsmall.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "map", 3 | "dataset": "xsmall", 4 | "dataset_size": 1000, 5 | "runtimes": [ 6 | 0.06401224998990074, 7 | 0.06418708301498555, 8 | 0.062261334009235725, 9 | 0.06443562501226552, 10 | 0.0632782919856254, 11 | 0.06306379099260084, 12 | 0.06327087499084882, 13 | 0.06286170898238197, 14 | 0.06227070800377987, 15 | 0.06216416700044647, 16 | 0.05693904199870303, 17 | 0.06316337501630187, 18 | 0.06364212499465793, 19 | 0.06387220800388604, 20 | 0.06336487500811927, 21 | 0.06294449997949414, 22 | 0.06370962498476729, 23 | 0.06356845897971652, 24 | 0.0639103330031503, 25 | 0.06442991699441336, 26 | 0.063629915995989, 27 | 0.06369895898387767, 28 | 0.06332191699766554, 29 | 0.0631050419760868, 30 | 0.06261091600754298, 31 | 0.061854958010371774, 32 | 0.06402333299047314, 33 | 0.06473608399392106, 34 | 0.06247470900416374, 35 | 0.062201416003517807, 36 | 0.0645946660079062, 37 | 0.06295474999933504, 38 | 0.06320237502222881, 39 | 0.06425979200867005, 40 | 0.06295583400060423, 41 | 0.06250070800888352, 42 | 0.06336974998703226, 43 | 0.06363258301280439, 44 | 0.06313208301435225, 45 | 0.06288966699503362, 46 | 0.06368470800225623, 47 | 0.06350766698596999, 48 | 0.06388804101152346, 49 | 0.06388683401746675, 50 | 0.06406479200813919, 51 | 0.06303812499390915, 52 | 0.06303600000683218, 53 | 0.06386075000045821, 54 | 0.06234220799524337, 55 | 0.06294887498370372, 56 | 0.0634906250052154, 57 | 
0.0660005829995498, 58 | 0.059778791008284315, 59 | 0.06416095801978372, 60 | 0.0634279579971917, 61 | 0.06308670801809058, 62 | 0.06403333399794064, 63 | 0.06426183399162255, 64 | 0.06455233300221153, 65 | 0.06467220798367634, 66 | 0.0640352499904111, 67 | 0.06321545800892636, 68 | 0.06419095798628405, 69 | 0.0643165830115322, 70 | 0.06453616698854603, 71 | 0.06275262500275858, 72 | 0.06344425000133924, 73 | 0.06365591599023901, 74 | 0.06306587500148453, 75 | 0.06312562499078922, 76 | 0.06393025000579655, 77 | 0.0638675410009455, 78 | 0.06407720799325034, 79 | 0.06418420898262411, 80 | 0.0635341249871999, 81 | 0.063094999990426, 82 | 0.06356812501326203, 83 | 0.06378241602214985, 84 | 0.06378399999812245, 85 | 0.06269395901472308, 86 | 0.064241290994687, 87 | 0.0636746249801945, 88 | 0.06431408401113003, 89 | 0.06347541700233705, 90 | 0.06370437500299886, 91 | 0.0634606670064386, 92 | 0.06362745899241418, 93 | 0.0643161250045523, 94 | 0.061846875003539026, 95 | 0.07015920800040476, 96 | 0.06450620800023898, 97 | 0.06384200000320561, 98 | 0.063679165992653, 99 | 0.06373883300693706, 100 | 0.06338008400052786, 101 | 0.0637870830250904, 102 | 0.06380837497999892, 103 | 0.064563249994535, 104 | 0.06406691600568593, 105 | 0.06349370800307952, 106 | 0.06414991701603867, 107 | 0.06779629099764861, 108 | 0.0652000840054825, 109 | 0.06393645799835213, 110 | 0.06343612502678297, 111 | 0.06305124997743405, 112 | 0.0645338750036899, 113 | 0.06420533399796113, 114 | 0.06327091701678, 115 | 0.06383362499764189, 116 | 0.06330470799002796, 117 | 0.0635799580195453, 118 | 0.06390908299363218, 119 | 0.0642478329828009, 120 | 0.06382416700944304, 121 | 0.06274441699497402, 122 | 0.0648237080022227, 123 | 0.06421766598941758, 124 | 0.06274287501582876, 125 | 0.06418754099286161, 126 | 0.06337112499750219, 127 | 0.06352683299337514, 128 | 0.06372895801905543, 129 | 0.06390520802233368, 130 | 0.06371245800983161, 131 | 0.06338716699974611, 132 | 0.06575258300290443, 133 | 0.06351533398265019, 134 | 0.06342225000844337, 135 | 0.06350208300864324, 136 | 0.06461916698026471, 137 | 0.063936584017938, 138 | 0.06390541599830613, 139 | 0.06407525000395253, 140 | 0.06340216699754819, 141 | 0.06282837499747984, 142 | 0.06462962500518188, 143 | 0.0641327920020558, 144 | 0.06488316599279642, 145 | 0.06303525000112131, 146 | 0.06462529199779965, 147 | 0.06373324999003671, 148 | 0.06337058398639783, 149 | 0.06313145800959319, 150 | 0.06430174998240545, 151 | 0.06479545799084008, 152 | 0.0637904159957543, 153 | 0.06566670801839791, 154 | 0.05965949999517761, 155 | 0.06499429099494591, 156 | 0.0647090419952292, 157 | 0.0642154160013888, 158 | 0.06312849998357706, 159 | 0.06434266699943691, 160 | 0.06338929201592691, 161 | 0.0636489580210764, 162 | 0.064640500000678, 163 | 0.06383749999804422, 164 | 0.06494691700208932, 165 | 0.06376024999190122, 166 | 0.06398558302316815, 167 | 0.06361199999810196, 168 | 0.06372262499644421, 169 | 0.0642287080117967, 170 | 0.06415083300089464, 171 | 0.06340141699183732, 172 | 0.06467233298462816, 173 | 0.06443112500710413, 174 | 0.06474558298941702, 175 | 0.06307387497508898, 176 | 0.06402662498294376, 177 | 0.06344970798818395, 178 | 0.06343229199410416, 179 | 0.06341170900850557, 180 | 0.06379575000028126, 181 | 0.06319029198493809, 182 | 0.06499416701262817, 183 | 0.06463583299773745, 184 | 0.06370729199261405, 185 | 0.06316495800274424, 186 | 0.06409333299961872, 187 | 0.06415633400320075, 188 | 0.06386249998467974, 189 | 0.0635423339845147, 190 | 0.06403995800064877, 191 | 
0.06380958401132375, 192 | 0.06409708299906924, 193 | 0.06439775001490489, 194 | 0.06419975002063438, 195 | 0.06400912502431311, 196 | 0.06388608302222565, 197 | 0.06453558398061432, 198 | 0.06436237500747666, 199 | 0.0635280410060659, 200 | 0.06348933299886994, 201 | 0.06393875001231208, 202 | 0.06437704098061658, 203 | 0.06505916701280512, 204 | 0.06457829201826826, 205 | 0.062883333011996, 206 | 0.06464354100171477, 207 | 0.06355437499587424, 208 | 0.06430345799890347, 209 | 0.06429766697692685, 210 | 0.06383433402515948, 211 | 0.06387799998628907, 212 | 0.0643800419929903, 213 | 0.06438437500037253, 214 | 0.0638653339992743, 215 | 0.06345554100698791, 216 | 0.06449558300664648, 217 | 0.06388316600350663, 218 | 0.06479329202556983, 219 | 0.06254474999150261, 220 | 0.06371562500135042, 221 | 0.06407887500245124, 222 | 0.06431366599281318, 223 | 0.0644370420195628, 224 | 0.06472729201777838, 225 | 0.06416037501185201, 226 | 0.06474170801811852, 227 | 0.06363495800178498, 228 | 0.0645637080015149, 229 | 0.05807462500524707, 230 | 0.0649033329973463, 231 | 0.06361041698255576, 232 | 0.06451291698613204, 233 | 0.06469370899139903, 234 | 0.0640999170136638, 235 | 0.0660214580129832, 236 | 0.0646264590031933, 237 | 0.06487620898406021, 238 | 0.06401458298205398, 239 | 0.06325124998693354, 240 | 0.06438924997928552, 241 | 0.0642430419975426, 242 | 0.0638723750016652, 243 | 0.0642105000151787, 244 | 0.0652872080099769, 245 | 0.06383195900707506, 246 | 0.06463574999361299, 247 | 0.06503337499452755, 248 | 0.06614325000555255, 249 | 0.06922112501342781, 250 | 0.06523287500021979, 251 | 0.06401958299102262, 252 | 0.06478358301683329, 253 | 0.0634410829807166, 254 | 0.06447337500867434, 255 | 0.06433170801028609, 256 | 0.06458100001327693, 257 | 0.06401320800068788, 258 | 0.06439433299237862, 259 | 0.06442100001731887, 260 | 0.064551624993328, 261 | 0.06353462499100715, 262 | 0.06405608300701715, 263 | 0.06532541700289585, 264 | 0.06410345798940398, 265 | 0.06398383298073895, 266 | 0.0647152500168886, 267 | 0.06400387501344085, 268 | 0.06353183399187401, 269 | 0.06437620898941532, 270 | 0.06458066700724885, 271 | 0.06377395801246166, 272 | 0.06508633299381472, 273 | 0.06455716700293124, 274 | 0.06426795799052343, 275 | 0.06406862498261034, 276 | 0.06429037501220591, 277 | 0.06486216600751504, 278 | 0.06356041599065065, 279 | 0.06424241600325331, 280 | 0.06479370797751471, 281 | 0.06473379101953469, 282 | 0.06486316601512954, 283 | 0.06432462498196401, 284 | 0.06498587501118891, 285 | 0.0635620410030242, 286 | 0.0642983750149142, 287 | 0.06517600000370294, 288 | 0.06468945799861103, 289 | 0.06470412499038503, 290 | 0.06376245801220648, 291 | 0.064886917010881, 292 | 0.06441520800581202, 293 | 0.06461637502070516, 294 | 0.0650263330026064, 295 | 0.06352854199940339, 296 | 0.06553766599972732, 297 | 0.06482400000095367, 298 | 0.06408287500380538, 299 | 0.06470920800347812, 300 | 0.06507024998427369, 301 | 0.05939904198748991, 302 | 0.05856808397220448, 303 | 0.05792379201739095, 304 | 0.05875624998589046, 305 | 0.05821258298237808, 306 | 0.05797862500185147, 307 | 0.0582444169849623, 308 | 0.05888770800083876, 309 | 0.057990708999568596, 310 | 0.058774624980287626, 311 | 0.05769141699420288, 312 | 0.06280350001179613, 313 | 0.05891133300610818, 314 | 0.05840287497267127, 315 | 0.057056749996263534, 316 | 0.057819000008748844, 317 | 0.056933666026452556, 318 | 0.055553833983140066, 319 | 0.05672291701193899, 320 | 0.05713329097488895, 321 | 0.05706799999461509, 322 | 0.05758858399349265, 323 | 
0.05769229101133533, 324 | 0.0572446660080459, 325 | 0.05748454199056141 326 | ] 327 | } -------------------------------------------------------------------------------- /benchmarks/results/toPandas_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "toPandas", 3 | "dataset": "large", 4 | "dataset_size": 100000000, 5 | "runtimes": [ 6 | 8.862778999988222, 7 | 8.811171500012279, 8 | 8.938347457995405, 9 | 8.947406374994898, 10 | 8.88868116599042, 11 | 8.990357999980915, 12 | 8.73990975000197, 13 | 8.638437292014714, 14 | 9.160425375011982, 15 | 9.038150041975314, 16 | 8.591716666996945, 17 | 9.168473375000758, 18 | 8.798064542002976, 19 | 8.936836874985602, 20 | 8.671541833988158, 21 | 8.662482666986762, 22 | 8.708136500004912, 23 | 8.692952374985907, 24 | 8.592529084009584, 25 | 8.740214041987201, 26 | 9.146632749994751, 27 | 8.8302964589966, 28 | 9.15225395897869, 29 | 9.106577541009756, 30 | 8.817999457998667, 31 | 8.631971499999054, 32 | 8.868299333000323, 33 | 8.840884500008542, 34 | 8.621281041996554, 35 | 8.586707083013607, 36 | 8.629861416004132, 37 | 8.58383437502198, 38 | 8.67459566600155, 39 | 8.966120708006201, 40 | 9.302168708003592, 41 | 8.56661416697898, 42 | 8.576364625012502, 43 | 9.334656874998473, 44 | 8.738957708003, 45 | 8.569964958005585, 46 | 9.004718665994005, 47 | 8.58318062502076, 48 | 8.604225666000275, 49 | 8.54163133300608, 50 | 8.606262207991676, 51 | 8.530463000002783, 52 | 8.523315916012507, 53 | 8.498393665999174, 54 | 8.456541958003072, 55 | 8.534131916996557, 56 | 8.56562666699756, 57 | 9.39233074997901, 58 | 9.234180207975442, 59 | 8.49574904202018, 60 | 8.959661500004586, 61 | 8.539121125009842, 62 | 8.487174874986522, 63 | 8.591410583001561, 64 | 8.695382541976869, 65 | 8.435281415993813, 66 | 8.502639499987708, 67 | 8.930086000007577, 68 | 8.565875666012289, 69 | 8.536115042021265, 70 | 8.796861499984516, 71 | 8.54752500000177, 72 | 8.55864475000999, 73 | 8.484635584027274, 74 | 8.522846042003948, 75 | 8.59690987499198, 76 | 9.052915917010978, 77 | 8.575967915996443, 78 | 8.669178208016092, 79 | 8.714108874992235, 80 | 9.386535500001628, 81 | 8.646265166986268, 82 | 8.482657792017562, 83 | 8.864741375000449, 84 | 8.946433333010646, 85 | 8.905033792019822, 86 | 8.449145749997115, 87 | 8.460354208014905, 88 | 8.46207508299267, 89 | 8.486035125009948, 90 | 8.531593207997503, 91 | 8.815072375000454, 92 | 8.717701290996047, 93 | 8.582990959024755, 94 | 8.68444795900723, 95 | 8.602480875008041, 96 | 9.267432209016988, 97 | 9.377239374996861, 98 | 8.727259999985108, 99 | 8.711020249989815, 100 | 8.912161042011576, 101 | 8.738440042012371, 102 | 8.81438700002036, 103 | 8.762875250016805, 104 | 8.676942000020063, 105 | 8.701102665974759, 106 | 8.623225249990355, 107 | 8.937765667011263, 108 | 8.805998249998083, 109 | 9.867303541017463, 110 | 8.744416041998193, 111 | 8.637591749982676, 112 | 9.320447209000122, 113 | 9.01535137501196, 114 | 9.542240958980983, 115 | 8.659673166985158, 116 | 8.71328033300233, 117 | 8.699457167007495, 118 | 8.678966707986547, 119 | 8.621516874991357, 120 | 8.565221416996792, 121 | 8.622395917016547, 122 | 9.136514207988512, 123 | 8.58931437501451 124 | ] 125 | } -------------------------------------------------------------------------------- /benchmarks/results/toPandas_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "toPandas", 3 | "dataset": "medium", 4 | "dataset_size": 10000000, 5 | "runtimes": [ 
6 | 1.2885572499944828, 7 | 1.2571123329980765, 8 | 1.256014916987624, 9 | 1.2624951250036247, 10 | 1.2701949999900535, 11 | 1.27012212498812, 12 | 1.2953377089870628, 13 | 1.28163666598266, 14 | 1.2605684580048546, 15 | 1.2856867499940563, 16 | 1.2515628750261385, 17 | 1.2650770000182092, 18 | 1.2583320830017328, 19 | 1.2575984589930158, 20 | 1.2693923329934478, 21 | 1.256740249984432, 22 | 1.2621407080150675, 23 | 1.2525152499729302, 24 | 1.27661829101271, 25 | 1.258097041019937, 26 | 1.2528983329830226, 27 | 1.254257416992914, 28 | 1.2604653750022408, 29 | 1.2601165829983074, 30 | 1.2579469589982182, 31 | 1.258330750017194, 32 | 1.264852874999633, 33 | 1.2719748750096187, 34 | 1.2666882920020726, 35 | 1.2630691660160664, 36 | 1.2763231249991804, 37 | 1.260669625014998, 38 | 1.259572000009939, 39 | 1.2746881250059232, 40 | 1.2601215410104487, 41 | 1.2655172920203768, 42 | 1.2657782919995952, 43 | 1.2550521249941085, 44 | 1.2478350419842172, 45 | 1.2497141670028213, 46 | 1.2692700409970712, 47 | 1.2729937089898158, 48 | 1.2649799170030747, 49 | 1.2598057499853894, 50 | 1.2757172500132583, 51 | 1.2751642079965677, 52 | 1.2607573750137817, 53 | 1.258303791983053, 54 | 1.2484310830186587, 55 | 1.2535902920062654, 56 | 1.2511154170206282, 57 | 1.2532207920157816, 58 | 1.256906374997925, 59 | 1.2576233340078034, 60 | 1.2554536249954253, 61 | 1.264078125008382, 62 | 1.2596141249814536, 63 | 1.2676740419992711, 64 | 1.2511124169977847, 65 | 1.2469390420010313, 66 | 1.248246333008865, 67 | 1.2562179579981603, 68 | 1.259968582977308, 69 | 1.2633217920083553, 70 | 1.2496773750171997, 71 | 1.255484167020768, 72 | 1.250518374989042, 73 | 1.253819665987976, 74 | 1.2618275830172934, 75 | 1.2681392919912469, 76 | 1.2453019999957178, 77 | 1.260975375014823, 78 | 1.271160583011806, 79 | 1.2457151250273455, 80 | 1.2655820829968434, 81 | 1.2527838750102092, 82 | 1.2574351250077598, 83 | 1.2535599590046331, 84 | 1.2713026250130497, 85 | 1.2475648749968968, 86 | 1.2485032090044115, 87 | 1.2522275419905782, 88 | 1.2653647499973886, 89 | 1.2641535000002477, 90 | 1.2570255419996101, 91 | 1.2574704999860842, 92 | 1.2512661659857258, 93 | 1.2690267080033664, 94 | 1.2580981670180336, 95 | 1.2658240419987123, 96 | 1.2544514170149341, 97 | 1.2501862089848146, 98 | 1.2534734169894364, 99 | 1.2411465829936787, 100 | 1.2681619999930263, 101 | 1.2595267920114566, 102 | 1.2521268330165185, 103 | 1.2558963330229744, 104 | 1.2550300410075579, 105 | 1.2594273750146385, 106 | 1.2700898330076598, 107 | 1.2613907079794444, 108 | 1.2615968750033062, 109 | 1.256476125010522, 110 | 1.2549062500183936, 111 | 1.2425632910162676, 112 | 1.2587947080028243, 113 | 1.2492519579827785, 114 | 1.2572470830054954, 115 | 1.257936542009702, 116 | 1.269242457987275, 117 | 1.2409304580069147, 118 | 1.2584901249792892, 119 | 1.251469167007599, 120 | 1.2582818329974543, 121 | 1.2683968750061467, 122 | 1.250196707987925, 123 | 1.2636364579957444, 124 | 1.2618374169978779, 125 | 1.2372403330227826, 126 | 1.2556332079984713, 127 | 1.273715458024526, 128 | 1.2490043340076227, 129 | 1.2452241249848157, 130 | 1.2518945840129163, 131 | 1.2659609169932082, 132 | 1.2602919999917503, 133 | 1.2622803749982268, 134 | 1.2564580829930492, 135 | 1.2583414999826346, 136 | 1.255592000001343, 137 | 1.2519122080120724, 138 | 1.2525597079948056, 139 | 1.2449430830019992, 140 | 1.258767541992711, 141 | 1.2542946659959853, 142 | 1.2580878750013653, 143 | 1.2641330419864971, 144 | 1.2617088750121184, 145 | 1.2378346659825183, 146 | 1.2531650409800932, 147 | 
1.2640607500215992, 148 | 1.2595061250030994, 149 | 1.2388757499866188, 150 | 1.2576246660028119, 151 | 1.2584112079930492, 152 | 1.2468822920054663, 153 | 1.24673616598011, 154 | 1.254638749989681, 155 | 1.2438010000041686, 156 | 1.2463356249791104, 157 | 1.2494282499828842, 158 | 1.2595032919780351, 159 | 1.24727687498671, 160 | 1.2564306669810321, 161 | 1.2541845410014503, 162 | 1.2410721249761991, 163 | 1.2458839580067433, 164 | 1.2591591250093188, 165 | 1.2470217079971917, 166 | 1.254125416977331, 167 | 1.2584732499963138, 168 | 1.2589741249976214, 169 | 1.2620728749898262, 170 | 1.2665299999935087, 171 | 1.261897999997018, 172 | 1.2440591669874266, 173 | 1.2591024589783046, 174 | 1.2497527500090655, 175 | 1.2537597499904223, 176 | 1.250720125011867, 177 | 1.2480132080090698, 178 | 1.235797332978109, 179 | 1.2646380409714766, 180 | 1.2634682499920018, 181 | 1.242793207988143, 182 | 1.2498649170156568, 183 | 1.2549589590053074, 184 | 1.2531464160128962, 185 | 1.245336749998387, 186 | 1.2464906670211349, 187 | 1.2613908749772236, 188 | 1.2601894999970682, 189 | 1.2590537079959176, 190 | 1.248518416978186, 191 | 1.2670163750008214, 192 | 1.2589331250055693, 193 | 1.2547621669946238, 194 | 1.2601012089871801, 195 | 1.2606227079813834, 196 | 1.2661379160126671, 197 | 1.2611060409981292, 198 | 1.2527715420001186, 199 | 1.2650721249810886, 200 | 1.254195499990601, 201 | 1.2650972080009524, 202 | 1.2625275420141406, 203 | 1.2661464999837335, 204 | 1.2615302499907557, 205 | 1.2513069160049781, 206 | 1.2482542910147458, 207 | 1.2654491249995772, 208 | 1.2621469580044504, 209 | 1.2478159999882337, 210 | 1.255306584003847, 211 | 1.2464843330089934, 212 | 1.3007019170036074, 213 | 1.266680291009834, 214 | 1.2509966670186259, 215 | 1.2605993750039488, 216 | 1.250382541998988, 217 | 1.3018390409997664, 218 | 1.2543151670251973, 219 | 1.2627636669785716, 220 | 1.2599722500017378, 221 | 1.2687087090162095, 222 | 1.2725013339950237, 223 | 1.2627896670019254, 224 | 1.2572203340241686, 225 | 1.2694404170033522, 226 | 1.2506123329803813, 227 | 1.2539891250198707, 228 | 1.2666916250018403, 229 | 1.2588019579998218, 230 | 1.2651899160118774, 231 | 1.2561217090114951, 232 | 1.2594041250122245, 233 | 1.2495557079964783, 234 | 1.271390917012468, 235 | 1.24894195899833, 236 | 1.2602929170243442, 237 | 1.2597828749858309, 238 | 1.2529933750047348, 239 | 1.2619111250096466, 240 | 1.2601427090121433, 241 | 1.254465791018447, 242 | 1.268157540995162, 243 | 1.2684888750081882, 244 | 1.2474725829961244, 245 | 1.259887416003039, 246 | 1.2599989169975743, 247 | 1.2568535419995897, 248 | 1.2597891249752138, 249 | 1.2698568330088165, 250 | 1.2522419590095524, 251 | 1.270519375015283, 252 | 1.2659674169844948, 253 | 1.2543618329800665, 254 | 1.2571934580046218, 255 | 1.2723624169884715, 256 | 1.259315791015979, 257 | 1.2684716249932535, 258 | 1.2472508329956327, 259 | 1.2557019999949262, 260 | 1.261543167027412, 261 | 1.2660564169927966, 262 | 1.2707085419970099, 263 | 1.2638022909814026, 264 | 1.2595980420010164, 265 | 1.2619648329855409, 266 | 1.2532179999980144, 267 | 1.2595204589888453, 268 | 1.264111166994553, 269 | 1.2625636250013486, 270 | 1.2478563750046305, 271 | 1.26558091698098, 272 | 1.2610336250218097, 273 | 1.2406471249996684, 274 | 1.2575986250012647, 275 | 1.2457927499781363, 276 | 1.2491438330034725, 277 | 1.2553590000024997, 278 | 1.2700152920151595, 279 | 1.2501201249833684, 280 | 1.2646489159960765, 281 | 1.2433127920085099, 282 | 1.2392549159994815, 283 | 1.2527907500043511, 284 | 1.2585749159916304, 
285 | 1.2368915829865728, 286 | 1.2542310419958085 287 | ] 288 | } -------------------------------------------------------------------------------- /benchmarks/visualize_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | from datetime import datetime as dt 17 | from pathlib import Path 18 | 19 | import pandas as pd 20 | import plotly.express as px 21 | import pyspark.sql.functions as F # noqa: N812 22 | import pytz 23 | from pyspark.sql import SparkSession 24 | 25 | 26 | def parse_results(spark: SparkSession) -> tuple[pd.DataFrame, pd.DataFrame, str]: 27 | """Parse benchmark results into a Pandas DataFrame.""" 28 | result_df = ( 29 | spark.read.json("benchmarks/results/*.json", multiLine=True) 30 | .select( 31 | "test_name", 32 | "dataset", 33 | "dataset_size", 34 | F.explode("runtimes").alias("runtime"), 35 | ) 36 | .withColumnRenamed("dataset", "dataset_name") 37 | .withColumn( 38 | "dataset_size_formatted", 39 | F.concat(F.lit("n="), F.format_number(F.col("dataset_size"), 0)), 40 | ) 41 | .withColumn( 42 | "dataset", 43 | F.concat( 44 | F.col("dataset_name"), 45 | F.lit(" ("), 46 | F.col("dataset_size_formatted"), 47 | F.lit(")"), 48 | ), 49 | ) 50 | .toPandas() 51 | ) 52 | 53 | if not isinstance(result_df, pd.DataFrame): 54 | raise TypeError 55 | 56 | result_df["dataset_name"] = pd.Categorical( 57 | result_df["dataset_name"], 58 | ["xsmall", "small", "medium", "large"], 59 | ) 60 | 61 | average_df = ( 62 | result_df[["test_name", "dataset_size", "runtime"]] 63 | .groupby(["test_name", "dataset_size"], observed=False) 64 | .mean() 65 | .reset_index() 66 | ) 67 | 68 | benchmark_date = get_benchmark_date(benchmark_path="benchmarks/results/") 69 | return result_df, average_df, benchmark_date 70 | 71 | 72 | def save_boxplot(df: pd.DataFrame, benchmark_date: str) -> None: 73 | """Displays faceted boxplot of benchmark results.""" 74 | machine_config = "Python 3.12.0, Spark 3.5, Pandas 2.1.3, M1 Macbook Pro 32GB RAM" 75 | subtitle = f"{benchmark_date} | {machine_config}" 76 | 77 | fig = px.box( 78 | df, 79 | x="dataset_size_formatted", 80 | y="runtime", 81 | color="test_name", 82 | facet_col="dataset_name", 83 | points="all", 84 | title=f"Column to List Benchmark Results
<br><sup>{subtitle}</sup>
", 85 | labels={"runtime": "Runtime (seconds)"}, 86 | category_orders={ 87 | "dataset_name": ["xsmall", "small", "medium", "large"], 88 | "test_name": [ 89 | "localIterator", 90 | "collectlist", 91 | "map", 92 | "flatmap", 93 | "toPandas", 94 | ], 95 | }, 96 | color_discrete_map={ 97 | "collectlist": "#636EFA", 98 | "localIterator": "#EF553B", 99 | "toPandas": "#00CC96", 100 | "map": "#AB63FA", 101 | "flatmap": "#FFA15A", 102 | }, 103 | ) 104 | fig.update_yaxes(matches=None) 105 | fig.update_yaxes({"tickfont": {"size": 9}}) 106 | fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True)) 107 | fig.update_xaxes(matches=None, title=None) 108 | fig.update_layout(legend_title_text="") 109 | 110 | fig.write_image( 111 | "benchmarks/images/column_to_list_boxplot.svg", 112 | width=1000, 113 | height=700, 114 | ) 115 | 116 | 117 | def save_line_plot(df: pd.DataFrame, benchmark_date: str) -> None: 118 | """Displays line plot of average benchmark results.""" 119 | machine_config = "Python 3.12.0, Spark 3.5, Pandas 2.1.3, M1 Macbook Pro 32GB RAM" 120 | subtitle = f"{benchmark_date} | {machine_config}" 121 | fig = px.line( 122 | df, 123 | x="dataset_size", 124 | y="runtime", 125 | log_x=True, 126 | color="test_name", 127 | title=f"Column to List Benchmark Results
<br><sup>{subtitle}</sup>
", 128 | labels={"runtime": "Runtime (seconds)", "dataset_size": "Number of Rows"}, 129 | category_orders={ 130 | "test_name": [ 131 | "localIterator", 132 | "collectlist", 133 | "map", 134 | "flatmap", 135 | "toPandas", 136 | ], 137 | }, 138 | color_discrete_map={ 139 | "collectlist": "#636EFA", 140 | "localIterator": "#EF553B", 141 | "toPandas": "#00CC96", 142 | "map": "#AB63FA", 143 | "flatmap": "#FFA15A", 144 | }, 145 | ) 146 | fig.update_traces(mode="markers+lines") 147 | fig.update_traces(marker={"size": 12}) 148 | fig.update_layout(legend_title_text="") 149 | 150 | fig.write_image( 151 | "benchmarks/images/column_to_list_line_plot.svg", 152 | width=900, 153 | height=450, 154 | ) 155 | 156 | 157 | def get_benchmark_date(benchmark_path: str) -> str: 158 | """Returns the date of the benchmark results.""" 159 | path = Path(benchmark_path) 160 | benchmark_ts = path.stat().st_mtime 161 | return dt.fromtimestamp( 162 | benchmark_ts, 163 | tz=pytz.timezone("US/Eastern"), 164 | ).strftime("%Y-%m-%d") 165 | 166 | 167 | if __name__ == "__main__": 168 | spark = ( 169 | SparkSession.builder.appName("MyApp") # type: ignore # noqa: PGH003 170 | .config("spark.executor.memory", "10G") 171 | .config("spark.driver.memory", "25G") 172 | .config("spark.sql.shuffle.partitions", "2") 173 | .getOrCreate() 174 | ) 175 | 176 | result_df, average_df, benchmark_date = parse_results(spark) 177 | save_boxplot(result_df, benchmark_date) 178 | save_line_plot(average_df, benchmark_date) 179 | -------------------------------------------------------------------------------- /docs/examples/index.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Example Quinn code snippets 4 | 5 | - [Schema as Code](../notebooks/schema_as_code.ipynb) -------------------------------------------------------------------------------- /docs/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | """Generate the code reference pages and navigation. 
15 | 16 | Script was taken from 17 | https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages 18 | """ 19 | 20 | from pathlib import Path 21 | 22 | import mkdocs_gen_files 23 | 24 | nav = mkdocs_gen_files.Nav() 25 | 26 | for path in sorted(Path(".").rglob("quinn/**/*.py")): 27 | module_path = path.relative_to(".").with_suffix("") 28 | doc_path = path.relative_to(".").with_suffix(".md") 29 | full_doc_path = Path("reference", doc_path) 30 | 31 | parts = tuple(module_path.parts) 32 | 33 | if parts[-1] == "__init__": 34 | parts = parts[:-1] 35 | doc_path = doc_path.with_name("index.md") 36 | full_doc_path = full_doc_path.with_name("index.md") 37 | elif parts[-1] == "__main__": 38 | continue 39 | 40 | nav[parts] = doc_path.as_posix() # 41 | 42 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 43 | ident = ".".join(parts) 44 | fd.write(f"::: {ident}") 45 | 46 | mkdocs_gen_files.set_edit_path(full_doc_path, path) 47 | 48 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: 49 | nav_file.writelines(nav.build_literate_nav()) 50 | -------------------------------------------------------------------------------- /docs/images/quinn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/docs/images/quinn.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Quinn 2 | 3 | ![quinn logo](images/quinn.png) 4 | 5 | Quinn contains PySpark helper methods that will make you more productive. 6 | 7 | Quinn is also a great way to learn about PySpark best practices like how to organize and unit test your code. 8 | 9 | ## Contributing 10 | 11 | We have a solid group of maintainers, hold contributor meetings regularly, and eagerly accept contributions from other members. 12 | 13 | We want to help the world write beautiful PySpark and give everyone a wonderful developer experience. 14 | 15 | ### Code Style 16 | 17 | We follow the [PySpark code-style](https://github.com/MrPowers/spark-style-guide/blob/main/PYSPARK_STYLE_GUIDE.md) guide and use `sphinx` as the docstring format. For more details about the `sphinx` format, see [this tutorial](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html). A short example of a `sphinx`-formatted docstring is shown below: 18 | 19 | ```python 20 | """[Summary] 21 | 22 | :param [ParamName]: [ParamDescription], defaults to [DefaultParamVal] 23 | :type [ParamName]: [ParamType](, optional) 24 | ... 25 | :raises [ErrorType]: [ErrorDescription] 26 | ... 27 | :return: [ReturnDescription] 28 | :rtype: [ReturnType] 29 | """ 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/learn_more/column_to_list.md: -------------------------------------------------------------------------------- 1 | # Column to list performance 2 | 3 | In PySpark, there are many approaches to accomplish the same task. Given a test DataFrame containing two columns - mvv and count, here are five methods to produce an identical list of mvv values using base PySpark functionality. 
4 | 5 | --- 6 | 7 | ## Setup 8 | 9 | ```python 10 | import pyspark.sql.functions as F 11 | from pyspark.sql import SparkSession 12 | ``` 13 | 14 | ```python 15 | spark = SparkSession.builder.getOrCreate() 16 | vals = [(0, 5), (1, 10), (2, 4), (3, 2), (4, 1)] 17 | df = spark.createDataFrame(vals, schema="mvv int, count int") 18 | ``` 19 | 20 | --- 21 | 22 | ## Approaches 23 | 24 | ### 1. toPandas() 25 | 26 | ```python 27 | list(df.select("mvv").toPandas()["mvv"]) 28 | # [0, 1, 2, 3, 4] 29 | ``` 30 | 31 | ### 2. flatMap 32 | 33 | ```python 34 | df.select("mvv").rdd.flatMap(lambda x: x).collect() 35 | # [0, 1, 2, 3, 4] 36 | ``` 37 | 38 | ### 3. map 39 | 40 | ```python 41 | df.select("mvv").rdd.map(lambda row: row[0]).collect() 42 | # [0, 1, 2, 3, 4] 43 | ``` 44 | 45 | ### 4. collect list comprehension 46 | 47 | ```python 48 | [row[0] for row in df.select("mvv").collect()] 49 | # [0, 1, 2, 3, 4] 50 | ``` 51 | 52 | ### 5. toLocalIterator() list comprehension 53 | 54 | ```python 55 | [row[0] for row in df.select("mvv").toLocalIterator()] 56 | # [0, 1, 2, 3, 4] 57 | ``` 58 | 59 | --- 60 | 61 | ## Benchmark Results 62 | 63 | Substantial runtime differences were observed on the medium and large datasets: 64 | 65 | ![box plot](../images/column_to_list_boxplot.svg) 66 | 67 | ![line plot](../images/column_to_list_line_plot.svg) 68 | 69 | All approaches perform similarly at 1K and 100K rows. `toPandas()` is consistently the fastest method across the tested dataset sizes, and exhibits the least variance in runtime. However, `pyarrow` and `pandas` are not required dependencies of Quinn, so this method only works when those packages are available. For typical Spark workloads, the `flatMap` approach is the next best option to use by default. 70 | 71 | --- 72 | 73 | ## Quinn Implementation 74 | 75 | [:material-api: `quinn.column_to_list`](https://mrpowers.github.io/quinn/reference/quinn/dataframe_helpers) 76 | 77 | To address these performance results, we updated `quinn.column_to_list()` to check the runtime environment and use the fastest method. If `pandas` and `pyarrow` are available, `toPandas()` is used. Otherwise, `flatMap` is used. 78 | 79 | --- 80 | 81 | ## More Information 82 | 83 | ### Datasets 84 | 85 | Four datasets were used for this benchmark. Each dataset contains two columns - mvv and count. The mvv column is a monotonically increasing integer and the count column is a random integer between 1 and 10. The datasets were created using the `create_benchmark_df.py` script in `quinn/benchmarks`. 86 | 87 | | Dataset name | Number of rows | Number of files | Size on disk (mb) | 88 | | ------------ | -------------- | --------------- | ----------------- | 89 | | mvv_xsmall | 1,000 | 1 | 0.005 | 90 | | mvv_small | 100,000 | 1 | 0.45 | 91 | | mvv_medium | 10,000,000 | 1 | 45 | 92 | | mvv_large | 100,000,000 | 4 | 566 | 93 | 94 | --- 95 | 96 | ### Validation 97 | 98 | The code and results from this test are available in the `/benchmarks` directory of Quinn. To run this benchmark yourself: 99 | 100 | #### 1. Install the required dependencies 101 | 102 | ```bash 103 | poetry install --with docs 104 | ``` 105 | 106 | #### 2. Create the datasets 107 | 108 | ```bash 109 | poetry run python benchmarks/create_benchmark_df.py 110 | ``` 111 | 112 | #### 3. Run the benchmark 113 | 114 | ```bash 115 | poetry run python benchmarks/benchmark_column_performance.py 116 | ``` 117 | 118 | Results will be stored in the `benchmarks/results` directory. 
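Each results file uses the layout shown in `benchmarks/results` above: a `test_name`, a `dataset`, a `dataset_size`, and an array of `runtimes`. For a quick look at a single result without running the full visualization script, a minimal sketch (the file name here is just an example):

```python
import json
from pathlib import Path
from statistics import mean

# Load one benchmark result file and summarize its runtimes.
result = json.loads(Path("benchmarks/results/toPandas_large.json").read_text())
print(result["test_name"], result["dataset"], f"n={result['dataset_size']:,}")
print(f"mean runtime: {mean(result['runtimes']):.3f}s over {len(result['runtimes'])} runs")
```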
119 | By default each implementation will run for the following durations: 120 | 121 | | Dataset name | Duration (seconds) | 122 | | ------------ | ------------------ | 123 | | mvv_xsmall | 20 | 124 | | mvv_small | 20 | 125 | | mvv_medium | 360 | 126 | | mvv_large | 1200 | 127 | 128 | These can be adjusted in `benchmarks/benchmark_column_performance.py` if a shorter or longer duration is desired. 129 | 130 | #### 4. Visualize the results 131 | 132 | ```bash 133 | poetry run python benchmarks/visualize_benchmarks.py 134 | ``` 135 | 136 | The `.svg` files will be saved in the `benchmarks/images` directory. 137 | -------------------------------------------------------------------------------- /docs/learn_more/index.md: -------------------------------------------------------------------------------- 1 | # Learn More 2 | 3 | Deeper explanations of design decisions and use cases for Quinn 4 | 5 | - [Convert PySpark DataFrame Columns to a Python List](column_to_list.md) 6 | -------------------------------------------------------------------------------- /docs/notebooks/schema_as_code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "57a1c914-7244-4759-8abc-9e27060eef7f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Print SCHEMA as code\n", 9 | "\n", 10 | "A function that takes a `pyspark.sql.types.StructType` and prints valid `Python` code." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "2505c5c1-15cc-47ea-b71d-d53472ae67ae", 17 | "metadata": { 18 | "tags": [] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "from quinn import print_schema_as_code" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "04338b8d-f604-4b59-9904-afa2fa7c4e4d", 29 | "metadata": { 30 | "tags": [] 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from pyspark.sql import types as T" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "id": "68a70047-e805-4be8-be52-08b168a0363b", 41 | "metadata": { 42 | "tags": [] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "schema = T.StructType(\n", 47 | " [\n", 48 | " T.StructField(\"string_field\", T.StringType()),\n", 49 | " T.StructField(\"decimal_38_10_field\", T.DecimalType(38, 10)),\n", 50 | " T.StructField(\"decimal_10_2_field\", T.DecimalType(10, 2)),\n", 51 | " T.StructField(\"array_of_double\", T.ArrayType(elementType=T.DoubleType())),\n", 52 | " T.StructField(\"map_type\", T.MapType(keyType=T.StringType(), valueType=T.ShortType())),\n", 53 | " T.StructField(\"struct_type\", T.StructType([T.StructField(\"t1\", T.StringType()), T.StructField(\"t2\", T.BooleanType())])),\n", 54 | " ]\n", 55 | ")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "id": "61a487be-765a-46bf-881c-cc08b292e951", 62 | "metadata": { 63 | "tags": [] 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "StructType(\n", 71 | "\tfields=[\n", 72 | "\t\tStructField(\"string_field\", StringType(), True),\n", 73 | "\t\tStructField(\"decimal_38_10_field\", DecimalType(38, 10), True),\n", 74 | "\t\tStructField(\"decimal_10_2_field\", DecimalType(10, 2), True),\n", 75 | "\t\tStructField(\n", 76 | "\t\t\t\"array_of_double\",\n", 77 | "\t\t\tArrayType(DoubleType()),\n", 78 | "\t\t\tTrue,\n", 79 | "\t\t),\n", 80 | "\t\tStructField(\n", 81 | "\t\t\t\"map_type\",\n", 82 | "\t\t\tMapType(\n", 83 | "\t\t\t\tStringType(),\n", 84 | 
"\t\t\t\tShortType(),\n", 85 | "\t\t\t\tTrue,\n", 86 | "\t\t\t),\n", 87 | "\t\t\tTrue,\n", 88 | "\t\t),\n", 89 | "\t\tStructField(\n", 90 | "\t\t\t\"struct_type\",\n", 91 | "\t\t\tStructType(\n", 92 | "\t\t\t\tfields=[\n", 93 | "\t\t\t\t\tStructField(\"t1\", StringType(), True),\n", 94 | "\t\t\t\t\tStructField(\"t2\", BooleanType(), True),\n", 95 | "\t\t\t\t]\n", 96 | "\t\t\t),\n", 97 | "\t\t\tTrue,\n", 98 | "\t\t),\n", 99 | "\t]\n", 100 | ")\n" 101 | ] 102 | }, 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "StructType([StructField('string_field', StringType(), True), StructField('decimal_38_10_field', DecimalType(38,10), True), StructField('decimal_10_2_field', DecimalType(10,2), True), StructField('array_of_double', ArrayType(DoubleType(), True), True), StructField('map_type', MapType(StringType(), ShortType(), True), True), StructField('struct_type', StructType([StructField('t1', StringType(), True), StructField('t2', BooleanType(), True)]), True)])" 107 | ] 108 | }, 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "print(print_schema_as_code(schema))\n", 116 | "\n", 117 | "# Create a dictionary of PySpark SQL types to provide context to 'eval()' \n", 118 | "spark_type_dict = {k: getattr(T, k) for k in dir(T) if isinstance(getattr(T, k), type)}\n", 119 | "eval(print_schema_as_code(schema), {\"__builtins__\": None}, spark_type_dict)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "6fb30b81", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3 (ipykernel)", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.10.12" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 5 152 | } 153 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | ## Quinn Helper Functions 2 | 3 | ```python 4 | import quinn 5 | ``` 6 | 7 | ### DataFrame Validations 8 | 9 | **validate_presence_of_columns()** 10 | 11 | ```python 12 | quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"]) 13 | ``` 14 | 15 | Raises an exception unless `source_df` contains the `name`, `age`, and `fun` columns. 16 | 17 | **validate_schema()** 18 | 19 | ```python 20 | quinn.validate_schema(source_df, required_schema) 21 | ``` 22 | 23 | Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`. 24 | 25 | **validate_absence_of_columns()** 26 | 27 | ```python 28 | quinn.validate_absence_of_columns(source_df, ["age", "cool"]) 29 | ``` 30 | 31 | Raises an exception if `source_df` contains `age` or `cool` columns. 32 | 33 | ### Functions 34 | 35 | **single_space()** 36 | 37 | ```python 38 | actual_df = source_df.withColumn( 39 | "words_single_spaced", 40 | quinn.single_space(col("words")) 41 | ) 42 | ``` 43 | 44 | Replaces all multispaces with single spaces (e.g. changes `"this   has     some"` to `"this has some"`). 
45 | 
46 | **remove_all_whitespace()**
47 | 
48 | ```python
49 | actual_df = source_df.withColumn(
50 |     "words_without_whitespace",
51 |     quinn.remove_all_whitespace(col("words"))
52 | )
53 | ```
54 | 
55 | Removes all whitespace in a string (e.g. changes `"this has some"` to `"thishassome"`).
56 | 
57 | **anti_trim()**
58 | 
59 | ```python
60 | actual_df = source_df.withColumn(
61 |     "words_anti_trimmed",
62 |     quinn.anti_trim(col("words"))
63 | )
64 | ```
65 | 
66 | Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes `" this has some "` to `" thishassome "`).
67 | 
68 | **remove_non_word_characters()**
69 | 
70 | ```python
71 | actual_df = source_df.withColumn(
72 |     "words_without_nonword_chars",
73 |     quinn.remove_non_word_characters(col("words"))
74 | )
75 | ```
76 | 
77 | Removes all non-word characters from a string (e.g. changes `"si%$#@!#$!@#mpsons"` to `"simpsons"`).
78 | 
79 | **multi_equals()**
80 | 
81 | ```python
82 | source_df.withColumn(
83 |     "are_s1_and_s2_cat",
84 |     quinn.multi_equals("cat")(col("s1"), col("s2"))
85 | )
86 | ```
87 | 
88 | `multi_equals` returns true if `s1` and `s2` are both equal to `"cat"`.
89 | 
90 | **approx_equal()**
91 | 
92 | This function takes three arguments: two PySpark Columns and a float threshold. It returns a Boolean column indicating whether the values in the two columns differ by less than the threshold.
93 | 
94 | ```
95 | let the columns be
96 | col1 = [1.2, 2.5, 3.1, 4.0, 5.5]
97 | col2 = [1.3, 2.3, 3.0, 3.9, 5.6]
98 | threshold = 0.2
99 | 
100 | result = approx_equal(col("col1"), col("col2"), threshold)
101 | result.show()
102 | 
103 | +-----+
104 | |value|
105 | +-----+
106 | | true|
107 | |false|
108 | | true|
109 | | true|
110 | | true|
111 | +-----+
112 | ```
113 | 
114 | **array_choice()**
115 | 
116 | This function takes a Column as a parameter and returns a PySpark Column containing a random value drawn from the input Column.
117 | 
118 | ```
119 | df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["values"])
120 | result = df.select(array_choice(col("values")))
121 | 
122 | The output is:
123 | +--------------+
124 | |array_choice()|
125 | +--------------+
126 | |             2|
127 | +--------------+
128 | 
129 | ```
130 | 
131 | **regexp_extract_all()**
132 | 
133 | The regexp_extract_all function takes two parameters: a string `s` and a regular expression `regexp`. It returns all the substrings of `s` that match the regular expression.
134 | 
135 | ```
136 | print(regexp_extract_all("this is a example text message for testing application", r"\b\w*a\w*\b"))
137 | 
138 | The output is:
139 | ['a', 'example', 'message', 'application']
140 | 
141 | ```
142 | 
143 | Here the pattern `r"\b\w*a\w*\b"` matches words containing the letter `a`.
144 | 
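Since `regexp_extract_all` here operates on plain Python strings, it behaves like the standard library's `re.findall`; a rough equivalent sketch (the function name is illustrative, not part of quinn):

```python
import re

def regexp_extract_all_sketch(s: str, regexp: str) -> list:
    """Return every non-overlapping match of `regexp` in `s` (illustrative stand-in)."""
    return re.findall(regexp, s)

print(regexp_extract_all_sketch("this is a example text message for testing application", r"\b\w*a\w*\b"))
# ['a', 'example', 'message', 'application']
```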
145 | **week_start_date()**
146 | 
147 | It takes two parameters: a column and week_start_day. It returns a Spark DataFrame column containing the start date of the week. By default, week_start_day is set to "Sun".
148 | 
149 | For input `["2023-03-05", "2023-03-07", "2023-03-08"]` the output is:
150 | 
151 | ```
152 | result = df.select("date", week_start_date(col("date"), "Sun"))
153 | result.show()
154 | +----------+---------------+
155 | |      date|week_start_date|
156 | +----------+---------------+
157 | |2023-03-05|     2023-03-05|
158 | |2023-03-07|     2023-03-05|
159 | |2023-03-08|     2023-03-05|
160 | +----------+---------------+
161 | ```
162 | 
163 | **week_end_date()**
164 | 
165 | It also takes two parameters: a column and week_end_day. It returns a DataFrame column containing the end date of the week. By default, week_end_day is set to "Sat".
166 | 
167 | ```
168 | +----------+-------------+
169 | |      date|week_end_date|
170 | +----------+-------------+
171 | |2023-03-05|   2023-03-05|
172 | |2023-03-07|   2023-03-12|
173 | |2023-03-08|   2023-03-12|
174 | +----------+-------------+
175 | 
176 | ```
177 | 
178 | **uuid5()**
179 | 
180 | This function generates a UUIDv5 string from the passed column, an optional namespace, and an optional extra salt.
181 | By default, the namespace is the NAMESPACE_DNS UUID and no extra string is used to reduce hash collisions.
182 | 
183 | ```
184 | 
185 | df = spark.createDataFrame([("lorem",), ("ipsum",)], ["values"])
186 | result = df.select(quinn.uuid5(F.col("values")).alias("uuid5"))
187 | result.show(truncate=False)
188 | 
189 | The output is:
190 | +------------------------------------+
191 | |uuid5                               |
192 | +------------------------------------+
193 | |35482fda-c10a-5076-8da2-dc7bf22d6be4|
194 | |51b79c1d-d06c-5b30-a5c6-1fadcd3b2103|
195 | +------------------------------------+
196 | 
197 | ```
198 | 
199 | ### Transformations
200 | 
201 | **snake_case_col_names()**
202 | 
203 | ```python
204 | quinn.snake_case_col_names(source_df)
205 | ```
206 | 
207 | Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased.
208 | 
209 | **sort_columns()**
210 | 
211 | ```python
212 | quinn.sort_columns(df=source_df, sort_order="asc", sort_nested=True)
213 | ```
214 | 
215 | Sorts the DataFrame columns in alphabetical order, including nested columns if sort_nested is set to True. Wide DataFrames are easier to navigate when they're sorted alphabetically.
216 | 
217 | ### DataFrame Helpers
218 | 
219 | **column_to_list()**
220 | 
221 | ```python
222 | quinn.column_to_list(source_df, "name")
223 | ```
224 | 
225 | Converts a column in a DataFrame to a list of values.
226 | 
227 | **two_columns_to_dictionary()**
228 | 
229 | ```python
230 | quinn.two_columns_to_dictionary(source_df, "name", "age")
231 | ```
232 | 
233 | Converts two columns of a DataFrame into a dictionary. In this example, `name` is the key and `age` is the value.
234 | 
235 | **to_list_of_dictionaries()**
236 | 
237 | ```python
238 | quinn.to_list_of_dictionaries(source_df)
239 | ```
240 | 
241 | Converts an entire DataFrame into a list of dictionaries.
242 | 
243 | **show_output_to_df()**
244 | 
245 | ```python
246 | quinn.show_output_to_df(output_str, spark)
247 | ```
248 | 
249 | Parses a spark DataFrame output string into a spark DataFrame. Useful for quickly pulling data from a log into a DataFrame. In this example, output_str is a string of the form:
250 | 
251 | ```
252 | +----+---+-----------+------+
253 | |name|age|     stuff1|stuff2|
254 | +----+---+-----------+------+
255 | |jose|  1|nice person|  yoyo|
256 | |  li|  2|nice person|  yoyo|
257 | | liz|  3|nice person|  yoyo|
258 | +----+---+-----------+------+
259 | ```
260 | 
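A minimal round-trip sketch (illustrative; assumes an active `SparkSession` named `spark`). Note that all parsed values come back as strings:

```python
import quinn

output_str = """+----+---+
|name|age|
+----+---+
|jose|  1|
|  li|  2|
+----+---+"""

df = quinn.show_output_to_df(output_str, spark)
df.show()
```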
261 | ### Schema Helpers
262 | 
263 | **schema_from_csv()**
264 | 
265 | ```python
266 | quinn.schema_from_csv(spark, "schema.csv")
267 | ```
268 | 
269 | Converts a CSV file into a PySpark schema (aka `StructType`). The CSV must contain the column name and type. The nullable and metadata columns are optional.
270 | 
271 | Here's an example CSV file:
272 | 
273 | ```
274 | name,type
275 | person,string
276 | address,string
277 | phoneNumber,string
278 | age,int
279 | ```
280 | 
281 | Here's how to convert that CSV file to a PySpark schema:
282 | 
283 | ```python
284 | schema = schema_from_csv(spark, "some_file.csv")
285 | 
286 | StructType([
287 |     StructField("person", StringType(), True),
288 |     StructField("address", StringType(), True),
289 |     StructField("phoneNumber", StringType(), True),
290 |     StructField("age", IntegerType(), True),
291 | ])
292 | ```
293 | 
294 | Here's a more complex CSV file:
295 | 
296 | ```
297 | name,type,nullable,metadata
298 | person,string,false,{"description":"The person's name"}
299 | address,string
300 | phoneNumber,string,TRUE,{"description":"The person's phone number"}
301 | age,int,False
302 | ```
303 | 
304 | Here's how to read this CSV file into a PySpark schema:
305 | 
306 | ```python
307 | another_schema = schema_from_csv(spark, "some_file.csv")
308 | 
309 | StructType([
310 |     StructField("person", StringType(), False, {"description": "The person's name"}),
311 |     StructField("address", StringType(), True),
312 |     StructField("phoneNumber", StringType(), True, {"description": "The person's phone number"}),
313 |     StructField("age", IntegerType(), False),
314 | ])
315 | ```
316 | 
317 | **print_schema_as_code()**
318 | 
319 | ```python
320 | fields = [
321 |     StructField("simple_int", IntegerType()),
322 |     StructField("decimal_with_nums", DecimalType(19, 8)),
323 |     StructField("array", ArrayType(FloatType()))
324 | ]
325 | schema = StructType(fields)
326 | printable_schema: str = quinn.print_schema_as_code(schema)
327 | ```
328 | 
329 | Converts a Spark `DataType` to a string of Python code that can be evaluated as code using eval(). If the `DataType` is a `StructType`, this can be used to print an existing schema in a format that can be copy-pasted into a Python script, logged to a file, etc.
330 | 
331 | For example:
332 | ```python
333 | print(printable_schema)
334 | ```
335 | 
336 | ```
337 | StructType(
338 | 	fields=[
339 | 		StructField("simple_int", IntegerType(), True),
340 | 		StructField("decimal_with_nums", DecimalType(19, 8), True),
341 | 		StructField(
342 | 			"array",
343 | 			ArrayType(FloatType()),
344 | 			True,
345 | 		),
346 | 	]
347 | )
348 | ```
349 | 
350 | Once evaluated, the printable schema is a valid schema that can be used in dataframe creation, validation, etc.
351 | 
352 | ```python
353 | from chispa.schema_comparer import assert_basic_schema_equality
354 | 
355 | parsed_schema = eval(printable_schema)
356 | assert_basic_schema_equality(parsed_schema, schema) # passes
357 | ```
358 | 
359 | 
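Since `eval()` executes arbitrary code, a more defensive option (a sketch mirroring the schema-as-code notebook) is to evaluate the string with restricted globals:

```python
from pyspark.sql import types as T

# Only expose the PySpark type classes to eval(), and disable builtins
spark_type_dict = {k: getattr(T, k) for k in dir(T) if isinstance(getattr(T, k), type)}
parsed_schema = eval(printable_schema, {"__builtins__": None}, spark_type_dict)
```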
360 | `print_schema_as_code()` can also be used to print other `DataType` objects.
361 | 
362 | `ArrayType`
363 | ```python
364 | array_type = ArrayType(FloatType())
365 | printable_type: str = quinn.print_schema_as_code(array_type)
366 | print(printable_type)
367 | ```
368 | 
369 | ```
370 | ArrayType(FloatType())
371 | ```
372 | 
373 | `MapType`
374 | ```python
375 | map_type = MapType(StringType(), FloatType())
376 | printable_type: str = quinn.print_schema_as_code(map_type)
377 | print(printable_type)
378 | ```
379 | 
380 | ```
381 | MapType(
382 | 	StringType(),
383 | 	FloatType(),
384 | 	True,
385 | )
386 | ```
387 | 
388 | `IntegerType`, `StringType` etc.
389 | ```python
390 | integer_type = IntegerType()
391 | printable_type: str = quinn.print_schema_as_code(integer_type)
392 | print(printable_type)
393 | ```
394 | 
395 | ```
396 | IntegerType()
397 | ```
398 | 
399 | ## Pyspark Core Class Extensions
400 | 
401 | ```python
402 | import pyspark.sql.functions as F
403 | import quinn
404 | ```
405 | 
406 | ### Column Extensions
407 | 
408 | **isFalsy()**
409 | 
410 | ```python
411 | source_df.withColumn("is_stuff_falsy", quinn.is_falsy(F.col("has_stuff")))
412 | ```
413 | 
414 | Returns `True` if `has_stuff` is `None` or `False`.
415 | 
416 | **isTruthy()**
417 | 
418 | ```python
419 | source_df.withColumn("is_stuff_truthy", quinn.is_truthy(F.col("has_stuff")))
420 | ```
421 | 
422 | Returns `True` unless `has_stuff` is `None` or `False`.
423 | 
424 | **isNullOrBlank()**
425 | 
426 | ```python
427 | source_df.withColumn("is_blah_null_or_blank", quinn.is_null_or_blank(F.col("blah")))
428 | ```
429 | 
430 | Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace).
431 | 
432 | **isNotIn()**
433 | 
434 | ```python
435 | source_df.withColumn("is_not_bobs_hobby", quinn.is_not_in(F.col("fun_thing"), bobs_hobbies))
436 | ```
437 | 
438 | Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list.
439 | 
440 | **nullBetween()**
441 | 
442 | ```python
443 | source_df.withColumn("is_between", quinn.null_between(F.col("age"), F.col("lower_age"), F.col("upper_age")))
444 | ```
445 | 
446 | Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populated, it will return `True` if `age` is lower than or equal to `upper_age`.
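
For example, a minimal sketch with hypothetical data (assumes an active `SparkSession` named `spark`):

```python
import pyspark.sql.functions as F

import quinn

source_df = spark.createDataFrame(
    [(17, 10, 20), (25, 18, None), (5, None, 10)],
    ["age", "lower_age", "upper_age"],
)
source_df.withColumn(
    "is_between",
    quinn.null_between(F.col("age"), F.col("lower_age"), F.col("upper_age")),
).show()
# Per the semantics described above, all three rows should evaluate to true.
```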
-------------------------------------------------------------------------------- /mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Quinn
2 | site_url: "https://mrpowers.github.io/quinn/"
3 | repo_url: "https://github.com/MrPowers/quinn"
4 | repo_name: "MrPowers/quinn"
5 | 
6 | theme:
7 |   name: material
8 |   palette:
9 |     - media: "(prefers-color-scheme: light)"
10 |       scheme: default
11 |       toggle:
12 |         icon: material/brightness-7
13 |         name: Switch to dark mode
14 |     - media: "(prefers-color-scheme: dark)"
15 |       scheme: slate
16 |       toggle:
17 |         icon: material/brightness-4
18 |         name: Switch to light mode
19 |   features:
20 |     - navigation.tracking
21 |     - navigation.instant
22 |     - navigation.tabs
23 |     - navigation.tabs.sticky
24 |     - navigation.footer
25 |     - navigation.indexes
26 |     - navigation.expand
27 |     - content.tabs.link
28 |     - content.code.copy
29 |     - content.code.select
30 | 
31 | plugins:
32 |   - search
33 |   - gen-files:
34 |       scripts:
35 |         - docs/gen_ref_pages.py
36 |   - section-index
37 |   - mkdocstrings:
38 |       default_handler: python
39 |       handlers:
40 |         python:
41 |           options:
42 |             docstring_style: sphinx
43 |             docstring_options:
44 |               show_if_no_docstring: true
45 |             show_source: true
46 |   - mkdocs-jupyter
47 |   - markdown-exec
48 | 
49 | nav:
50 |   - Home: index.md
51 |   - Usage: usage.md
52 |   - API Reference: reference/SUMMARY.md
53 |   - Examples:
54 |       - "examples/index.md"
55 |       - "notebooks/schema_as_code.ipynb"
56 |   - Learn more:
57 |       - learn_more/index.md
58 |       - learn_more/column_to_list.md
59 | 
60 | 
61 | markdown_extensions:
62 |   - markdown_include.include:
63 |       base_path: docs
64 |   - attr_list
65 |   - pymdownx.emoji:
66 |       emoji_index: !!python/name:materialx.emoji.twemoji
67 |       emoji_generator: !!python/name:materialx.emoji.to_svg
-------------------------------------------------------------------------------- /pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "quinn"
3 | version = "0.10.3"
4 | description = "Pyspark helper methods to maximize developer efficiency"
5 | authors = ["MrPowers "]
6 | 
7 | # Maintainers of the project
8 | maintainers = [
9 |     "SemyonSinchenko "
10 | ]
11 | 
12 | readme = "README.md"
13 | homepage = "https://github.com/MrPowers/quinn/"
14 | keywords = ['apachespark', 'spark', 'pyspark']
15 | 
16 | [build-system]
17 | requires = ["poetry>=0.12"]
18 | build-backend = "poetry.masonry.api"
19 | 
20 | ###########################################################################
21 | # MAIN DEPENDENCIES
22 | ###########################################################################
23 | 
24 | [tool.poetry.dependencies]
25 | python = ">=3.7,<4.0"
26 | 
27 | 
28 | ###########################################################################
29 | # DEPENDENCY GROUPS
30 | ###########################################################################
31 | 
32 | [tool.poetry.group.development]
33 | optional = true
34 | 
35 | [tool.poetry.group.docs]
36 | optional = true
37 | 
38 | [tool.poetry.group.testing]
39 | optional = true
40 | 
41 | [tool.poetry.group.linting]
42 | optional = true
43 | 
44 | [tool.poetry.group.development.dependencies]
45 | pyspark = ">2"
46 | semver = "^3"
47 | 
48 | [tool.poetry.group.testing.dependencies]
49 | pytest = "^7"
50 | pytest-rerunfailures = "^13"
51 | chispa = "0.9.4"
52 | pytest-describe = "^2"
53 | pyspark = ">2"
54 | semver = "^3"
55 | 
56 | [tool.poetry.group.linting.dependencies]
57 | ruff = "^0.0.291"
58 | 
59 | [tool.poetry.group.docs.dependencies]
60 | mkdocstrings-python = "^0.8.3"
61 | mkdocs-gen-files = "^0.4.0"
62 | mkdocs-literate-nav = "^0.6.0"
63 | mkdocs-section-index = "^0.3.5"
64 | markdown-include = "^0.8.1"
65 | mkdocs = "^1"
66 | jupyterlab = "*"
67 | mkdocs-jupyter = "*"
68 | mkdocs-material = "*"
69 | pymdown-extensions = "*"
70 | mkdocs-macros-plugin = "*"
71 | mkdocs-material-extensions = "*"
72 | markdown-exec = "*"
73 | ###########################################################################
74 | # LINTING CONFIGURATION
75 | ###########################################################################
76 | 
77 | [tool.ruff]
78 | select = ["ALL"]
79 | line-length = 150
80 | ignore = [
81 |     "D100",
82 |     "D203",    # Ignore blank line before summary of class
83 |     "D213",    # Ignore multiline summary second line
84 |     "T201",    # Allow print() in code.
85 |     "D401",    # Docstrings should be in imperative mood
86 |     "D404",    # Boring thing about how to write docstrings
87 |     "FBT001",  # Boolean positional arg is OK
88 |     "FBT002",  # Boolean default arg value is OK
89 |     "D205",    # It is broken
90 |     "TCH003",  # I have no idea what it's about
91 |     "PLC1901", # Strange thing
92 |     "UP007",   # Not supported in py3.6
93 |     "UP038",   # Not supported in all py versions
94 |     "SIM108",  # Don't create long ternary operators
95 |     "PTH123",  # Don't force use of Pathlib
96 |     "PTH207",  # Don't force use of Pathlib
97 |     "PTH113",  # Don't force use of Pathlib
98 | ]
99 | extend-exclude = ["tests", "docs"]
100 | 
101 | [tool.ruff.per-file-ignores]
102 | "quinn/extensions/column_ext.py" = ["FBT003", "N802"]
103 | "quinn/extensions/__init__.py" = ["F401", "F403"]
104 | "quinn/__init__.py" = ["F401", "F403"]
105 | "quinn/functions.py" = ["FBT003"]
106 | "quinn/keyword_finder.py" = ["A002"]
-------------------------------------------------------------------------------- /quinn.iml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 
-------------------------------------------------------------------------------- /quinn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/quinn.png
-------------------------------------------------------------------------------- /quinn/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 14 | """quinn API.""" 15 | 16 | from quinn.append_if_schema_identical import append_if_schema_identical 17 | from quinn.dataframe_helpers import ( 18 | column_to_list, 19 | create_df, 20 | print_athena_create_table, 21 | show_output_to_df, 22 | to_list_of_dictionaries, 23 | two_columns_to_dictionary, 24 | ) 25 | from quinn.dataframe_validator import ( 26 | DataFrameMissingColumnError, 27 | DataFrameMissingStructFieldError, 28 | DataFrameProhibitedColumnError, 29 | validate_absence_of_columns, 30 | validate_presence_of_columns, 31 | validate_schema, 32 | ) 33 | from quinn.functions import ( 34 | anti_trim, 35 | approx_equal, 36 | array_choice, 37 | business_days_between, 38 | exists, 39 | forall, 40 | is_false, 41 | is_falsy, 42 | is_not_in, 43 | is_null_or_blank, 44 | is_true, 45 | is_truthy, 46 | multi_equals, 47 | null_between, 48 | remove_all_whitespace, 49 | remove_non_word_characters, 50 | single_space, 51 | uuid5, 52 | week_end_date, 53 | week_start_date, 54 | ) 55 | from quinn.math import rand_laplace, rand_range, randn 56 | from quinn.schema_helpers import print_schema_as_code 57 | from quinn.split_columns import split_col 58 | from quinn.transformations import ( 59 | snake_case_col_names, 60 | sort_columns, 61 | to_snake_case, 62 | with_columns_renamed, 63 | with_some_columns_renamed, 64 | ) 65 | -------------------------------------------------------------------------------- /quinn/append_if_schema_identical.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from pyspark.sql import DataFrame 15 | 16 | 17 | class SchemaMismatchError(ValueError): 18 | """raise this when there's a schema mismatch between source & target schema.""" 19 | 20 | 21 | def append_if_schema_identical(source_df: DataFrame, target_df: DataFrame) -> DataFrame: 22 | """Compare the schema of source & target dataframe. 23 | 24 | :param source_df: Input DataFrame 25 | :type source_df: pyspark.sql.DataFrame 26 | :param target_df: Input DataFrame 27 | :type target_df: pyspark.sql.DataFrame 28 | :return: dataframe 29 | :rtype: pyspark.sql.DataFrame 30 | """ 31 | # Retrieve the schemas of the source and target dataframes 32 | source_schema = source_df.schema 33 | target_schema = target_df.schema 34 | 35 | # Convert the schemas to a list of tuples 36 | source_schema_list = [(field.name, str(field.dataType)) for field in source_schema] 37 | target_schema_list = [(field.name, str(field.dataType)) for field in target_schema] 38 | 39 | unmatched_cols = [ 40 | col for col in source_schema_list if col not in target_schema_list 41 | ] 42 | error_message = ( 43 | f"The schemas of the source and target dataframes are not identical." 
44 | f"From source schema column {unmatched_cols} is missing in target schema" 45 | ) 46 | # Check if the column names in the source and target schemas are the same, regardless of their order 47 | if set(source_schema.fieldNames()) != set(target_schema.fieldNames()): 48 | raise SchemaMismatchError(error_message) 49 | # Check if the column names and data types in the source and target schemas are the same, in the same order 50 | if sorted(source_schema_list) != sorted(target_schema_list): 51 | raise SchemaMismatchError(error_message) 52 | 53 | # Append the dataframes if the schemas are identical 54 | return target_df.unionByName(source_df) 55 | -------------------------------------------------------------------------------- /quinn/dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from __future__ import annotations 15 | 16 | from typing import TYPE_CHECKING 17 | 18 | if TYPE_CHECKING: 19 | from pyspark.sql import DataFrame, SparkSession 20 | import sys 21 | import warnings 22 | from typing import Any 23 | 24 | from pyspark.sql.types import StructField, StructType 25 | 26 | 27 | def column_to_list(df: DataFrame, col_name: str) -> list[Any]: 28 | """Collect column to list of values. 29 | 30 | :param df: Input DataFrame 31 | :type df: pyspark.sql.DataFrame 32 | :param col_name: Column to collect 33 | :type col_name: str 34 | :return: List of values 35 | :rtype: List[Any] 36 | """ 37 | if "pyspark" not in sys.modules: 38 | raise ImportError 39 | 40 | # sparksession from df is not available in older versions of pyspark 41 | if sys.modules["pyspark"].__version__ < "3.3.0": 42 | return [row[0] for row in df.select(col_name).collect()] 43 | 44 | spark_session = df.sparkSession.getActiveSession() 45 | if spark_session is None: 46 | return [row[0] for row in df.select(col_name).collect()] 47 | 48 | pyarrow_enabled = ( 49 | spark_session.conf.get( 50 | "spark.sql.execution.arrow.pyspark.enabled", 51 | ) 52 | == "true" 53 | ) 54 | 55 | pyarrow_valid = pyarrow_enabled and sys.modules["pyarrow"].__version__ >= "0.17.0" 56 | 57 | pandas_exists = "pandas" in sys.modules 58 | pandas_valid = pandas_exists and sys.modules["pandas"].__version__ >= "0.24.2" 59 | 60 | if pyarrow_valid and pandas_valid: 61 | return df.select(col_name).toPandas()[col_name].tolist() 62 | 63 | return [row[0] for row in df.select(col_name).collect()] 64 | 65 | 66 | def two_columns_to_dictionary( 67 | df: DataFrame, 68 | key_col_name: str, 69 | value_col_name: str, 70 | ) -> dict[str, Any]: 71 | """Collect two columns as dictionary when first column is key and second is value. 
66 | def two_columns_to_dictionary(
67 |     df: DataFrame,
68 |     key_col_name: str,
69 |     value_col_name: str,
70 | ) -> dict[str, Any]:
71 |     """Collect two columns as a dictionary, where the first column provides keys and the second provides values.
72 | 
73 |     :param df: Input DataFrame
74 |     :type df: pyspark.sql.DataFrame
75 |     :param key_col_name: Key-column
76 |     :type key_col_name: str
77 |     :param value_col_name: Value-column
78 |     :type value_col_name: str
79 |     :return: Dictionary with values
80 |     :rtype: Dict[str, Any]
81 |     """
82 |     k, v = key_col_name, value_col_name
83 |     return {x[k]: x[v] for x in df.select(k, v).collect()}
84 | 
85 | 
86 | def to_list_of_dictionaries(df: DataFrame) -> list[dict[str, Any]]:
87 |     """Convert a Spark DataFrame to a list of dictionaries.
88 | 
89 |     :param df: The Spark DataFrame to convert.
90 |     :type df: :py:class:`pyspark.sql.DataFrame`
91 |     :return: A list of dictionaries representing the rows in the DataFrame.
92 |     :rtype: List[Dict[str, Any]]
93 |     """
94 |     return list(map(lambda r: r.asDict(), df.collect()))  # noqa: C417
95 | 
96 | 
97 | def print_athena_create_table(
98 |     df: DataFrame,
99 |     athena_table_name: str,
100 |     s3location: str,
101 | ) -> None:
102 |     """Generate the Athena create table statement for a given DataFrame.
103 |     :param df: The pyspark.sql.DataFrame to use
104 |     :param athena_table_name: The name of the athena table to generate
105 |     :param s3location: The S3 location of the parquet data
106 |     :return: None.
107 |     """
108 |     warnings.warn(
109 |         "Function print_athena_create_table is deprecated and will be removed in version 1.0",
110 |         category=DeprecationWarning,
111 |         stacklevel=2,
112 |     )
113 | 
114 |     fields = df.schema
115 | 
116 |     print(f"CREATE EXTERNAL TABLE IF NOT EXISTS `{athena_table_name}` ( ")
117 | 
118 |     for field in fields.fieldNames()[:-1]:
119 |         print("\t", f"`{fields[field].name}` {fields[field].dataType.simpleString()}, ")
120 |     last = fields[fields.fieldNames()[-1]]
121 |     print("\t", f"`{last.name}` {last.dataType.simpleString()} ")
122 | 
123 |     print(")")
124 |     print("STORED AS PARQUET")
125 |     print(f"LOCATION '{s3location}'\n")
126 | 
127 | 
128 | def show_output_to_df(show_output: str, spark: SparkSession) -> DataFrame:
129 |     """Show output as spark DataFrame.
130 | 
131 |     :param show_output: String representing output of 'show' command in spark
132 |     :type show_output: str
133 |     :param spark: SparkSession object
134 |     :type spark: SparkSession
135 |     :return: DataFrame object containing output of a show command in spark
136 |     :rtype: Dataframe
137 |     """
138 |     lines = show_output.split("\n")
139 |     ugly_column_names = lines[1]
140 |     pretty_column_names = [i.strip() for i in ugly_column_names[1:-1].split("|")]
141 |     pretty_data = []
142 |     ugly_data = lines[3:-1]
143 |     for row in ugly_data:
144 |         r = [i.strip() for i in row[1:-1].split("|")]
145 |         pretty_data.append(tuple(r))
146 |     return spark.createDataFrame(pretty_data, pretty_column_names)
147 | 
148 | 
149 | def create_df(spark: SparkSession, rows_data, col_specs) -> DataFrame:  # noqa: ANN001
150 |     """Create a new DataFrame from the given data and column specs.
151 | 
152 |     The returned DataFrame is created using the StructType and StructField classes provided by PySpark.
153 | 
154 |     :param spark: SparkSession object
155 |     :type spark: SparkSession
156 |     :param rows_data: the data used to create the DataFrame
157 |     :type rows_data: array-like
158 |     :param col_specs: list of tuples containing the name and type of the field
159 |     :type col_specs: list of tuples
160 |     :return: a new DataFrame
161 |     :rtype: DataFrame
162 |     """
163 |     struct_fields = list(map(lambda x: StructField(*x), col_specs))  # noqa: C417
164 |     return spark.createDataFrame(data=rows_data, schema=StructType(struct_fields))
165 | 
-------------------------------------------------------------------------------- /quinn/dataframe_validator.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import copy
17 | from typing import TYPE_CHECKING, Union
18 | 
19 | if TYPE_CHECKING:
20 |     from pyspark.sql import DataFrame
21 |     from pyspark.sql.types import StructType
22 | 
23 | 
24 | class DataFrameMissingColumnError(ValueError):
25 |     """Raise this when there's a DataFrame column error."""
26 | 
27 | 
28 | class DataFrameMissingStructFieldError(ValueError):
29 |     """Raise this when there's a DataFrame StructField error."""
30 | 
31 | 
32 | class DataFrameProhibitedColumnError(ValueError):
33 |     """Raise this when a DataFrame includes prohibited columns."""
34 | 
35 | 
36 | def validate_presence_of_columns(df: DataFrame, required_col_names: list[str], return_bool: bool = False) -> Union[None, bool]:
37 |     """Validate the presence of column names in a DataFrame.
38 |     :param df: A spark DataFrame.
39 |     :type df: DataFrame
40 |     :param required_col_names: List of the required column names for the DataFrame.
41 |     :type required_col_names: list[str]
42 |     :param return_bool: If True, return a boolean instead of raising an exception.
43 |     :type return_bool: bool
44 |     :return: None if return_bool is False, otherwise a boolean indicating if validation passed.
45 |     :raises DataFrameMissingColumnError: if any of the requested column names are
46 |         not present in the DataFrame and return_bool is False.
47 | """ 48 | all_col_names = df.columns 49 | missing_col_names = [x for x in required_col_names if x not in all_col_names] 50 | 51 | if missing_col_names: 52 | error_message = f"The {missing_col_names} columns are not included in the DataFrame with the following columns {all_col_names}" 53 | if return_bool: 54 | return False 55 | raise DataFrameMissingColumnError(error_message) 56 | 57 | return True if return_bool else None 58 | 59 | 60 | def validate_schema( 61 | df: DataFrame, 62 | required_schema: StructType, 63 | ignore_nullable: bool = False, 64 | return_bool: bool = False, 65 | ) -> Union[None, bool]: 66 | """Function that validate if a given DataFrame has a given StructType as its schema. 67 | :param df: DataFrame to validate 68 | :type df: DataFrame 69 | :param required_schema: StructType required for the DataFrame 70 | :type required_schema: StructType 71 | :param ignore_nullable: (Optional) A flag for if nullable fields should be 72 | ignored during validation 73 | :type ignore_nullable: bool, optional 74 | :param return_bool: If True, return a boolean instead of raising an exception. 75 | :type return_bool: bool 76 | :return: None if return_bool is False, otherwise a boolean indicating if validation passed. 77 | :raises DataFrameMissingStructFieldError: if any StructFields from the required 78 | schema are not included in the DataFrame schema and return_bool is False. 79 | """ 80 | _all_struct_fields = copy.deepcopy(df.schema) 81 | _required_schema = copy.deepcopy(required_schema) 82 | 83 | if ignore_nullable: 84 | for x in _all_struct_fields: 85 | x.nullable = None 86 | 87 | for x in _required_schema: 88 | x.nullable = None 89 | 90 | missing_struct_fields = [x for x in _required_schema if x not in _all_struct_fields] 91 | 92 | if missing_struct_fields: 93 | error_message = ( 94 | f"The {missing_struct_fields} StructFields are not included in the DataFrame with the following StructFields {_all_struct_fields}" 95 | ) 96 | if return_bool: 97 | return False 98 | raise DataFrameMissingStructFieldError(error_message) 99 | 100 | return True if return_bool else None 101 | 102 | 103 | def validate_absence_of_columns(df: DataFrame, prohibited_col_names: list[str], return_bool: bool = False) -> Union[None, bool]: 104 | """Validate that none of the prohibited column names are present among specified DataFrame columns. 105 | :param df: DataFrame containing columns to be checked. 106 | :param prohibited_col_names: List of prohibited column names. 107 | :param return_bool: If True, return a boolean instead of raising an exception. 108 | :type return_bool: bool 109 | :return: None if return_bool is False, otherwise a boolean indicating if validation passed. 110 | :raises DataFrameProhibitedColumnError: If the prohibited column names are 111 | present among the specified DataFrame columns and return_bool is False. 
112 | """ 113 | all_col_names = df.columns 114 | extra_col_names = [x for x in all_col_names if x in prohibited_col_names] 115 | 116 | if extra_col_names: 117 | error_message = f"The {extra_col_names} columns are not allowed to be included in the DataFrame with the following columns {all_col_names}" 118 | if return_bool: 119 | return False 120 | raise DataFrameProhibitedColumnError(error_message) 121 | 122 | return True if return_bool else None 123 | -------------------------------------------------------------------------------- /quinn/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | """Extensions API.""" 15 | 16 | from quinn.extensions.dataframe_ext import _ext_function 17 | from quinn.extensions.spark_session_ext import create_df 18 | -------------------------------------------------------------------------------- /quinn/extensions/dataframe_ext.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import warnings 15 | 16 | from pyspark.sql import SparkSession 17 | from pyspark.sql.dataframe import DataFrame 18 | 19 | 20 | def _ext_function(spark: SparkSession, f: object) -> object: 21 | warnings.warn( 22 | "Extensions may be removed in the future versions of quinn. Please use explicit functions instead", 23 | category=DeprecationWarning, 24 | stacklevel=2, 25 | ) 26 | return f(spark) 27 | 28 | 29 | DataFrame.transform = getattr(DataFrame, "transform", _ext_function) 30 | -------------------------------------------------------------------------------- /quinn/extensions/spark_session_ext.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import warnings
17 | 
18 | from pyspark.sql import DataFrame, SparkSession
19 | from pyspark.sql.types import StructField, StructType
20 | 
21 | 
22 | def create_df(
23 |     spark: SparkSession,
24 |     rows_data: list[tuple],
25 |     col_specs: list[tuple],
26 | ) -> DataFrame:
27 |     """Creates a new DataFrame from the given data and column specs.
28 | 
29 |     The returned DataFrame is created using the StructType and StructField classes provided by PySpark.
30 | 
31 |     :param rows_data: the data used to create the DataFrame
32 |     :type rows_data: array-like
33 |     :param col_specs: list of tuples containing the name and type of the field
34 |     :type col_specs: list of tuples
35 |     :return: a new DataFrame
36 |     :rtype: DataFrame
37 |     """
38 |     warnings.warn(
39 |         "Extensions may be removed in future versions of quinn. Please use `quinn.create_df()` instead",
40 |         category=DeprecationWarning,
41 |         stacklevel=2,
42 |     )
43 | 
44 |     struct_fields = [StructField(*x) for x in col_specs]
45 |     return spark.createDataFrame(data=rows_data, schema=StructType(struct_fields))
46 | 
47 | 
48 | SparkSession.create_df = create_df
49 | 
-------------------------------------------------------------------------------- /quinn/keyword_finder.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import os
17 | from dataclasses import dataclass
18 | from glob import iglob
19 | 
20 | default_keywords = [
21 |     "_jsc",
22 |     "_jconf",
23 |     "_jvm",
24 |     "_jsparkSession",
25 |     "_jreader",
26 |     "_jc",
27 |     "_jseq",
28 |     "_jdf",
29 |     "_jmap",
30 |     "_jco",
31 |     "emptyRDD",
32 |     "range",
33 |     "init_batched_serializer",
34 |     "parallelize",
35 |     "pickleFile",
36 |     "textFile",
37 |     "wholeTextFiles",
38 |     "binaryFiles",
39 |     "binaryRecords",
40 |     "sequenceFile",
41 |     "newAPIHadoopFile",
42 |     "newAPIHadoopRDD",
43 |     "hadoopFile",
44 |     "hadoopRDD",
45 |     "union",
46 |     "runJob",
47 |     "setSystemProperty",
48 |     "uiWebUrl",
49 |     "stop",
50 |     "setJobGroup",
51 |     "setLocalProperty",
52 |     "getCon",
53 |     "rdd",
54 |     "sparkContext",
55 | ]
56 | 
57 | @dataclass
58 | class SearchResult:
59 |     """Class to hold the results of a file search.
60 |     file_path: The path to the file that was searched.
61 |     word_count: A dictionary mapping each keyword to the number of lines in which it was found.
62 |     """
63 | 
64 |     file_path: str
65 |     word_count: dict[str, int]
66 | 
67 | 
68 | def search_file(path: str, keywords: list[str] = default_keywords) -> SearchResult:
69 |     """Searches a file for keywords and prints the line number and line containing the keyword.
70 | 
71 |     :param path: The path to the file to search.
72 |     :type path: str
73 |     :param keywords: The list of keywords to search for.
74 |     :type keywords: list[str]
75 |     :returns: A SearchResult with the file path and the number of lines containing each keyword in `keywords`.
76 |     :rtype: SearchResult
77 | 
78 |     """
79 |     match_results = SearchResult(file_path=path, word_count={keyword: 0 for keyword in keywords})
80 | 
81 |     print(f"\nSearching: {path}")
82 |     with open(path) as f:
83 |         for line_number, line in enumerate(f, 1):
84 |             line_printed = False
85 |             for keyword in keywords:
86 |                 if keyword in line:
87 |                     match_results.word_count[keyword] += 1
88 | 
89 |                     if not line_printed:
90 |                         print(f"{line_number}: {keyword_format(line)}", end="")
91 |                         line_printed = True
92 | 
93 |     return match_results
94 | 
95 | 
96 | def search_files(path: str, keywords: list[str] = default_keywords) -> list[SearchResult]:
97 |     """Searches all files in a directory for keywords.
98 | 
99 |     :param path: The path to the directory to search.
100 |     :type path: str
101 |     :param keywords: The list of keywords to search for.
102 |     :type keywords: list[str]
103 |     :returns: A list of SearchResult objects, one for each file searched.
104 |     :rtype: list[SearchResult]
105 | 
106 |     """
107 |     rootdir_glob = f"{path}/**/*"
108 |     file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f)]
109 |     return [search_file(f, keywords) for f in file_list]
110 | 
111 | 
112 | def keyword_format(input: str, keywords: list[str] = default_keywords) -> str:
113 |     """Formats the input string to highlight the keywords.
114 | 
115 |     :param input: The string to format.
116 |     :type input: str
117 |     :param keywords: The list of keywords to highlight.
118 |     :type keywords: list[str]
119 | 
120 |     """
121 |     nc = "\033[0m"
122 |     red = "\033[31m"
123 |     bold = "\033[1m"
124 |     res = input
125 |     for keyword in keywords:
126 |         res = surround_substring(res, keyword, red + bold, nc)
127 |     return res
128 | 
129 | 
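# Example (illustrative): scan a source tree and report how often a given
# keyword shows up per file:
#
#     for result in search_files("quinn"):
#         print(result.file_path, result.word_count["rdd"])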
130 | def surround_substring(input: str, substring: str, surround_start: str, surround_end: str) -> str:
131 |     """Surrounds a substring with the given start and end strings.
132 | 
133 |     :param input: The string to search.
134 |     :type input: str
135 |     :param substring: The substring to surround.
136 |     :type substring: str
137 |     :param surround_start: The string to start the surrounding with.
138 |     :type surround_start: str
139 |     :param surround_end: The string to end the surrounding with.
140 |     :type surround_end: str
141 |     :returns: The input string with the substring surrounded.
142 |     :rtype: str
143 | 
144 |     """
145 |     return input.replace(
146 |         substring,
147 |         surround_start + substring + surround_end,
148 |     )
149 | 
-------------------------------------------------------------------------------- /quinn/math.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | """Math routines for PySpark."""
15 | from __future__ import annotations
16 | 
17 | from typing import Optional, Union
18 | 
19 | from pyspark.sql import Column
20 | from pyspark.sql import functions as F  # noqa: N812
21 | 
22 | 
23 | def rand_laplace(
24 |     mu: Union[float, Column],
25 |     beta: Union[float, Column],
26 |     seed: Optional[int] = None,
27 | ) -> Column:
28 |     """Generate random numbers from Laplace(mu, beta).
29 | 
30 |     :param mu: mu parameter of Laplace distribution
31 |     :param beta: beta parameter of Laplace distribution
32 |     :param seed: random seed value (optional, default None)
33 |     :returns: column with random numbers
34 |     """
35 |     if not isinstance(mu, Column):
36 |         mu = F.lit(mu)
37 | 
38 |     if not isinstance(beta, Column):
39 |         beta = F.lit(beta)
40 | 
41 |     u = F.rand(seed) - F.lit(0.5)
42 |     return (mu - beta * F.signum(u) * F.log(F.lit(1) - (F.lit(2) * F.abs(u)))).alias(
43 |         "laplace_random",
44 |     )
45 | 
46 | 
47 | def rand_range(
48 |     minimum: Union[int, Column],
49 |     maximum: Union[int, Column],
50 |     seed: Optional[int] = None,
51 | ) -> Column:
52 |     """Generate random numbers uniformly distributed in [`minimum`, `maximum`).
53 | 
54 |     :param minimum: minimum value of the random numbers
55 |     :param maximum: maximum value of the random numbers
56 |     :param seed: random seed value (optional, default None)
57 |     :returns: column with random numbers
58 |     """
59 |     if not isinstance(minimum, Column):
60 |         minimum = F.lit(minimum)
61 | 
62 |     if not isinstance(maximum, Column):
63 |         maximum = F.lit(maximum)
64 | 
65 |     u = F.rand(seed)
66 | 
67 |     return minimum + (maximum - minimum) * u
68 | 
69 | 
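# Example (illustrative, assuming a DataFrame `df` with one row per sample):
#
#     df.select(rand_laplace(0.0, 1.0, seed=42).alias("laplace")).show()
#     df.select(rand_range(5, 10, seed=42).alias("uniform")).show()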
70 | def randn(
71 |     mean: Union[float, Column],
72 |     variance: Union[float, Column],
73 |     seed: Optional[int] = None,
74 | ) -> Column:
75 |     """Generate a column with independent and identically distributed (i.i.d.) samples from
76 |     a normal distribution with the given `mean` and `variance`.
77 | 
78 |     :param mean: Mean of the normal distribution of the random numbers
79 |     :param variance: variance of the normal distribution of the random numbers
80 |     :param seed: random seed value (optional, default None)
81 |     :returns: column with random numbers
82 |     """
83 |     if not isinstance(mean, Column):
84 |         mean = F.lit(mean)
85 | 
86 |     if not isinstance(variance, Column):
87 |         variance = F.lit(variance)
88 | 
89 |     return F.randn(seed) * F.sqrt(variance) + mean
90 | 
91 | 
92 | def div_or_else(
93 |     cola: Column,
94 |     colb: Column,
95 |     default: Union[float, Column] = 0.0,
96 | ) -> Column:
97 |     """Return result of division of cola by colb or default if colb is zero.
98 | 
99 |     :param cola: dividend
100 |     :param colb: divisor
101 |     :param default: default value
102 |     :returns: result of the division, or `default` when colb is zero
103 |     """
104 |     if not isinstance(default, Column):
105 |         default = F.lit(default)
106 | 
107 |     return F.when(colb == F.lit(0.0), default).otherwise(cola / colb)
108 | 
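# Example (illustrative; column names are hypothetical): a safe ratio that
# avoids division-by-zero errors:
#
#     df.withColumn("ratio", div_or_else(F.col("clicks"), F.col("views"), default=0.0))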
-------------------------------------------------------------------------------- /quinn/schema_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from __future__ import annotations
15 | 
16 | import json
17 | from typing import Optional
18 | 
19 | from pyspark.sql import SparkSession
20 | from pyspark.sql import types as T  # noqa: N812
21 | 
22 | 
23 | def print_schema_as_code(dtype: T.DataType) -> str:
24 |     """Represent DataType (including StructType) as valid Python code.
25 | 
26 |     :param dtype: The input DataType or Schema object
27 |     :type dtype: pyspark.sql.types.DataType
28 |     :return: Valid Python code which generates the same schema.
29 |     :rtype: str
30 |     """
31 |     res = []
32 |     if isinstance(dtype, T.StructType):
33 |         res.append("StructType(\n\tfields=[")
34 |         for field in dtype.fields:
35 |             for line in _repr_column(field).split("\n"):
36 |                 res.append("\n\t\t")
37 |                 res.append(line)
38 |             res.append(",")
39 |         res.append("\n\t]\n)")
40 | 
41 |     elif isinstance(dtype, T.ArrayType):
42 |         res.append("ArrayType(")
43 |         res.append(print_schema_as_code(dtype.elementType))
44 |         res.append(")")
45 | 
46 |     elif isinstance(dtype, T.MapType):
47 |         res.append("MapType(")
48 |         res.append(f"\n\t{print_schema_as_code(dtype.keyType)},")
49 |         for line in print_schema_as_code(dtype.valueType).split("\n"):
50 |             res.append("\n\t")
51 |             res.append(line)
52 |         res.append(",")
53 |         res.append(f"\n\t{dtype.valueContainsNull},")
54 |         res.append("\n)")
55 | 
56 |     elif isinstance(dtype, T.DecimalType):
57 |         res.append(f"DecimalType({dtype.precision}, {dtype.scale})")
58 | 
59 |     elif str(dtype).endswith("()"):
60 |         # PySpark 3.3+
61 |         res.append(str(dtype))
62 |     else:
63 |         res.append(f"{dtype}()")
64 | 
65 |     return "".join(res)
66 | 
67 | 
68 | def _repr_column(column: T.StructField) -> str:
69 |     res = []
70 | 
71 |     if isinstance(column.dataType, (T.ArrayType, T.MapType, T.StructType)):
72 |         res.append(f'StructField(\n\t"{column.name}",')
73 |         for line in print_schema_as_code(column.dataType).split("\n"):
74 |             res.append("\n\t")
75 |             res.append(line)
76 |         res.append(",")
77 |         res.append(f"\n\t{column.nullable},")
78 |         res.append("\n)")
79 | 
80 |     else:
81 |         res.append(
82 |             f'StructField("{column.name}", {print_schema_as_code(column.dataType)}, {column.nullable})',
83 |         )
84 | 
85 |     return "".join(res)
86 | 
87 | 
88 | def schema_from_csv(spark: SparkSession, file_path: str) -> T.StructType:  # noqa: C901
89 |     """Return a StructType from a CSV file containing schema configuration.
90 | 
91 |     :param spark: The SparkSession object
92 |     :type spark: pyspark.sql.session.SparkSession
93 | 
94 |     :param file_path: The path to the CSV file containing the schema configuration
95 |     :type file_path: str
96 | 
97 |     :raises ValueError: If the CSV file does not contain the expected columns: name, type, nullable, metadata
98 | 
99 |     :return: A StructType object representing the schema configuration
100 |     :rtype: pyspark.sql.types.StructType
101 |     """
102 | 
103 |     def _validate_json(metadata: Optional[str]) -> dict:
104 |         if metadata is None:
105 |             return {}
106 | 
107 |         try:
108 |             metadata_dict = json.loads(metadata)
109 | 
110 |         except json.JSONDecodeError as exc:
111 |             msg = f"Invalid JSON: {metadata}"
112 |             raise ValueError(msg) from exc
113 | 
114 |         return metadata_dict
115 | 
116 |     def _lookup_type(type_str: str) -> T.DataType:
117 |         type_lookup = {
118 |             "string": T.StringType(),
119 |             "int": T.IntegerType(),
120 |             "float": T.FloatType(),
121 |             "double": T.DoubleType(),
122 |             "boolean": T.BooleanType(),
123 |             "bool": T.BooleanType(),
124 |             "timestamp": T.TimestampType(),
125 |             "date": T.DateType(),
126 |             "binary": T.BinaryType(),
127 |         }
128 | 
129 |         if type_str not in type_lookup:
130 |             msg = f"Invalid type: {type_str}. Expecting one of: {type_lookup.keys()}"
131 |             raise ValueError(msg)
132 | 
133 |         return type_lookup[type_str]
134 | 
135 |     def _convert_nullable(null_str: str) -> bool:
136 |         if null_str is None:
137 |             return True
138 | 
139 |         parsed_val = null_str.lower()
140 |         if parsed_val not in ["true", "false"]:
141 |             msg = f"Invalid nullable value: {null_str}. Expecting True or False."
142 |             raise ValueError(msg)
143 | 
144 |         return parsed_val == "true"
145 | 
146 |     schema_df = spark.read.csv(file_path, header=True)
147 |     possible_columns = ["name", "type", "nullable", "metadata"]
148 |     num_cols = len(schema_df.columns)
149 |     expected_columns = possible_columns[0:num_cols]
150 | 
151 |     # ensure that csv contains the expected columns: name, type, nullable, metadata
152 |     if schema_df.columns != expected_columns:
153 |         msg = f"CSV must contain columns in this order: {expected_columns}"
154 |         raise ValueError(msg)
155 | 
156 |     # create a StructType per field
157 |     fields = []
158 |     for row in schema_df.collect():
159 |         field = T.StructField(
160 |             name=row["name"],
161 |             dataType=_lookup_type(row["type"]),
162 |             nullable=_convert_nullable(row["nullable"]) if "nullable" in row else True,
163 |             metadata=_validate_json(row["metadata"] if "metadata" in row else None),
164 |         )
165 |         fields.append(field)
166 | 
167 |     return T.StructType(fields=fields)
168 | 
169 | 
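# Example (illustrative): given a CSV file "schema.csv" with the rows
#
#     name,type,nullable
#     person,string,false
#     age,int,true
#
# the call `schema_from_csv(spark, "schema.csv")` returns the corresponding
# StructType with a non-nullable `person` string field and a nullable `age`
# integer field.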
170 | def complex_fields(schema: T.StructType) -> dict[str, object]:
171 |     """Return a dictionary of complex field names and their data types from the input schema.
172 | 
173 |     :param schema: The input schema as a StructType.
174 |     :type schema: StructType
175 |     :return: A dictionary with complex field names as keys and their respective data types as values.
176 |     :rtype: Dict[str, object]
177 |     """
178 |     return {
179 |         field.name: field.dataType
180 |         for field in schema.fields
181 |         if isinstance(field.dataType, (T.ArrayType, T.StructType, T.MapType))
182 |     }
183 | 
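# Example (illustrative): for a schema with one array column,
#
#     schema = T.StructType([
#         T.StructField("id", T.StringType()),
#         T.StructField("scores", T.ArrayType(T.DoubleType())),
#     ])
#     complex_fields(schema)  # -> {"scores": ArrayType(DoubleType(), True)}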
34 | 35 | :param df: The input DataFrame 36 | :type df: pyspark.sql.DataFrame 37 | :param col_name: The name of the column to split 38 | :type col_name: str 39 | :param delimiter: The delimiter to split the column on 40 | :type delimiter: str 41 | :param new_col_names: A list of strings for the new column names, one per expected split value 42 | :type new_col_names: List[str] 43 | :param mode: The split mode. Can be "strict" or "permissive". Default is "permissive" 44 | :type mode: str 45 | :param default: The default value assigned to split values that are missing or empty in "permissive" mode 46 | :type default: Optional[str] 47 | :return: The resulting DataFrame with the split columns 48 | :rtype: pyspark.sql.DataFrame 49 | """ 50 | # Check if the column to be split exists in the DataFrame 51 | if col_name not in df.columns: 52 | msg = f"Column '{col_name}' not found in DataFrame." 53 | raise ValueError(msg) 54 | 55 | # Check if the delimiter is a string 56 | if not isinstance(delimiter, str): 57 | msg = "Delimiter must be a string." 58 | raise TypeError(msg) 59 | 60 | # Check if the new column names are a list of strings 61 | if not isinstance(new_col_names, list): 62 | msg = "New column names must be a list of strings." 63 | raise TypeError(msg) 64 | 65 | # Define a UDF to count the occurrences of the delimiter 66 | def _num_delimiter(col_value1: str) -> int: 67 | # Get the count of delimiter and store the result in no_of_delimiter 68 | no_of_delimiter = col_value1.count(delimiter) 69 | # Split col_value based on delimiter and store the result in split_value 70 | split_value = col_value1.split(delimiter) 71 | 72 | # Check if col_value is not None 73 | if col_value1 is not None: 74 | # Check whether the number of delimiters matches the expected column count 75 | if no_of_delimiter != len(new_col_names) - 1: 76 | # If it does not, raise an IndexError mentioning the expected and found number of elements 77 | msg = f"Expected {len(new_col_names)} elements after splitting on delimiter, found {len(split_value)} elements" 78 | raise IndexError( 79 | msg, 80 | ) 81 | 82 | # If the length of split_value matches new_col_names, check if any of the split values is None or an empty string 83 | elif any( # noqa: RET506 84 | x is None or x.strip() == "" for x in split_value[: len(new_col_names)] 85 | ): 86 | msg = "Null or empty values are not accepted for columns in strict mode" 87 | raise ValueError( 88 | msg, 89 | ) 90 | 91 | # If the above checks pass, return the count of delimiter 92 | return int(no_of_delimiter) 93 | 94 | # If col_value is None, return 0 95 | return 0 96 | 97 | num_udf = udf(lambda y: None if y is None else _num_delimiter(y), IntegerType()) 98 | 99 | # Get the column expression for the column to be split 100 | col_expr = df[col_name] 101 | 102 | # Split the column by the delimiter 103 | split_col_expr = split(trim(col_expr), delimiter) 104 | 105 | # Check the split mode 106 | if mode == "strict": 107 | # Create an array of select expressions to create new columns from the split values 108 | select_exprs = [ 109 | when(split_col_expr.getItem(i) != "", split_col_expr.getItem(i)).alias( 110 | new_col_names[i], 111 | ) 112 | for i in range(len(new_col_names)) 113 | ] 114 | 115 | # Select all the columns from the input DataFrame, along with the new split columns 116 | df = df.select("*", *select_exprs) # noqa: PD901 117 | df = df.withColumn("del_length", num_udf(df[col_name])) # noqa: PD901 118 | df.cache() 119 | # Drop the original column if the new columns were created successfully 120 | df = df.select( # noqa: PD901
121 | [c for c in df.columns if c not in {"del_length", col_name}], 122 | ) 123 | 124 | elif mode == "permissive": 125 | # Create an array of select expressions to create new columns from the split values 126 | # Use the default value if a split value is missing or empty 127 | select_exprs = [ 128 | when(length(split_col_expr.getItem(i)) > 0, split_col_expr.getItem(i)) 129 | .otherwise(default) 130 | .alias(new_col_names[i]) 131 | for i in range(len(new_col_names)) 132 | ] 133 | 134 | # Select all the columns from the input DataFrame, along with the new split columns 135 | # Drop the original column if the new columns were created successfully 136 | df = df.select("*", *select_exprs).drop(col_name) # noqa: PD901 137 | df.cache() 138 | 139 | else: 140 | msg = f"Invalid mode: {mode}" 141 | raise ValueError(msg) 142 | 143 | # Return the DataFrame with the split columns 144 | return df 145 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/tests/__init__.py -------------------------------------------------------------------------------- /tests/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrpowers-io/quinn/20156582034c5d25a52223b3c4ca992d37c656fa/tests/extensions/__init__.py -------------------------------------------------------------------------------- /tests/extensions/dataframe_transformations.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import lit 2 | 3 | 4 | def with_greeting(df): 5 | return df.withColumn("greeting", lit("hi")) 6 | 7 | 8 | def with_something(df, something): 9 | return df.withColumn("something", lit(something)) 10 | 11 | 12 | def with_funny(word): 13 | def inner(df): 14 | return df.withColumn("funny", lit(word)) 15 | 16 | return inner 17 | 18 | 19 | def with_jacket(word, df): 20 | return df.withColumn("jacket", lit(word)) 21 | -------------------------------------------------------------------------------- /tests/extensions/test_dataframe_ext.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import pytest 3 | import pyspark 4 | import chispa 5 | from pyspark.sql.functions import col 6 | 7 | from ..spark import spark 8 | 9 | from .dataframe_transformations import ( 10 | with_greeting, 11 | with_something, 12 | with_funny, 13 | with_jacket, 14 | ) 15 | 16 | 17 | def test_verbose_code_without_transform(): 18 | data = [("jose", 1), ("li", 2), ("liz", 3)] 19 | source_df = spark.createDataFrame(data, ["name", "age"]) 20 | df1 = with_greeting(source_df) 21 | df2 = with_something(df1, "moo") 22 | expected_data = [ 23 | ("jose", 1, "hi", "moo"), 24 | ("li", 2, "hi", "moo"), 25 | ("liz", 3, "hi", "moo"), 26 | ] 27 | expected_df = spark.createDataFrame( 28 | expected_data, ["name", "age", "greeting", "something"] 29 | ) 30 | chispa.assert_df_equality(df2, expected_df, ignore_nullable=True) 31 | 32 | 33 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 34 | def test_transform_with_lambda(): 35 | data = [("jose", 1), ("li", 2), ("liz", 3)] 36 | source_df = spark.createDataFrame(data, ["name", "age"]) 37 | actual_df = source_df.transform( 38 | lambda df: 
df.withColumn("age_times_two", col("age") * 2) 39 | ) 40 | expected_data = [("jose", 1, 2), ("li", 2, 4), ("liz", 3, 6)] 41 | expected_df = spark.createDataFrame(expected_data, ["name", "age", "age_times_two"]) 42 | chispa.assert_df_equality(actual_df, expected_df) 43 | 44 | 45 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 46 | def test_transform_with_no_arg_fun(): 47 | data = [("jose", 1), ("li", 2), ("liz", 3)] 48 | source_df = spark.createDataFrame(data, ["name", "age"]) 49 | actual_df = source_df.transform(lambda df: with_greeting(df)) 50 | expected_data = [("jose", 1, "hi"), ("li", 2, "hi"), ("liz", 3, "hi")] 51 | expected_df = spark.createDataFrame(expected_data, ["name", "age", "greeting"]) 52 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 53 | 54 | 55 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 56 | def test_transform_with_one_arg_fun(): 57 | data = [("jose", 1), ("li", 2), ("liz", 3)] 58 | source_df = spark.createDataFrame(data, ["name", "age"]) 59 | actual_df = source_df.transform(lambda df: with_something(df, "crazy")) 60 | expected_data = [("jose", 1, "crazy"), ("li", 2, "crazy"), ("liz", 3, "crazy")] 61 | expected_df = spark.createDataFrame(expected_data, ["name", "age", "something"]) 62 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 63 | 64 | 65 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 66 | def test_chain_transforms(): 67 | data = [("jose", 1), ("li", 2), ("liz", 3)] 68 | source_df = spark.createDataFrame(data, ["name", "age"]) 69 | actual_df = source_df.transform(with_greeting).transform( 70 | lambda df: with_something(df, "crazy") 71 | ) 72 | expected_data = [ 73 | ("jose", 1, "hi", "crazy"), 74 | ("li", 2, "hi", "crazy"), 75 | ("liz", 3, "hi", "crazy"), 76 | ] 77 | expected_df = spark.createDataFrame( 78 | expected_data, ["name", "age", "greeting", "something"] 79 | ) 80 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 81 | 82 | 83 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 84 | def test_transform_with_closure(): 85 | data = [("jose", 1), ("li", 2), ("liz", 3)] 86 | source_df = spark.createDataFrame(data, ["name", "age"]) 87 | actual_df = source_df.transform(with_greeting).transform( # no lambda required 88 | with_funny("haha") 89 | ) 90 | expected_data = [ 91 | ("jose", 1, "hi", "haha"), 92 | ("li", 2, "hi", "haha"), 93 | ("liz", 3, "hi", "haha"), 94 | ] 95 | expected_df = spark.createDataFrame( 96 | expected_data, ["name", "age", "greeting", "funny"] 97 | ) 98 | chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 99 | 100 | 101 | @pytest.mark.skipif(pyspark.__version__ < '3.0', reason="df.transform not available for Spark<3.0") 102 | def test_transform_with_functools_partial(): 103 | data = [("jose", 1), ("li", 2), ("liz", 3)] 104 | source_df = spark.createDataFrame(data, ["name", "age"]) 105 | actual_df = source_df.transform( 106 | partial(with_greeting) 107 | ).transform( # partial is optional for transformations that only take a single DataFrame argument 108 | partial(with_jacket, "warm") 109 | ) 110 | expected_data = [ 111 | ("jose", 1, "hi", "warm"), 112 | ("li", 2, "hi", "warm"), 113 | ("liz", 3, "hi", "warm"), 114 | ] 115 | expected_df = spark.createDataFrame( 116 | expected_data, ["name", "age", "greeting", "jacket"] 117 | ) 118 | 
chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True) 119 | -------------------------------------------------------------------------------- /tests/extensions/test_spark_session_ext.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StructField, StringType 2 | 3 | from ..spark import spark 4 | 5 | import chispa 6 | import quinn 7 | 8 | 9 | def test_create_df(): 10 | schema = StructType( 11 | [ 12 | StructField("name", StringType(), True), 13 | StructField("blah", StringType(), True), 14 | ] 15 | ) 16 | data = [("jose", "a"), ("li", "b"), ("sam", "c")] 17 | actual_df = spark.createDataFrame(data, schema) 18 | 19 | expected_df = quinn.create_df( 20 | spark, 21 | [("jose", "a"), ("li", "b"), ("sam", "c")], 22 | [("name", StringType(), True), ("blah", StringType(), True)], 23 | ) 24 | 25 | chispa.assert_df_equality(expected_df, actual_df) 26 | -------------------------------------------------------------------------------- /tests/spark.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("chispa").getOrCreate() 4 | -------------------------------------------------------------------------------- /tests/test_append_if_schema_identical.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StructField, IntegerType, StringType 2 | import quinn 3 | from .spark import spark 4 | from chispa.schema_comparer import assert_basic_schema_equality 5 | from quinn.append_if_schema_identical import SchemaMismatchError 6 | 7 | 8 | def test_append_if_schema_identical(): 9 | source_data = [(1, "cape town", "Alice"), (2, "delhi", "Bob")] 10 | target_data = [(3, "Charlie", "New York"), (4, "Dave", "Los Angeles")] 11 | bad_data = [(5, "Eve", "London", "extra_column")] 12 | 13 | source_df = spark.createDataFrame( 14 | source_data, 15 | schema=StructType( 16 | [ 17 | StructField("id", IntegerType()), 18 | StructField("city", StringType()), 19 | StructField("name", StringType()), 20 | ] 21 | ), 22 | ) 23 | 24 | target_df = spark.createDataFrame( 25 | target_data, 26 | schema=StructType( 27 | [ 28 | StructField("id", IntegerType()), 29 | StructField("name", StringType()), 30 | StructField("city", StringType()), 31 | ] 32 | ), 33 | ) 34 | 35 | unidentical_df = spark.createDataFrame( 36 | bad_data, 37 | schema=StructType( 38 | [ 39 | StructField("id", IntegerType()), 40 | StructField("name", StringType()), 41 | StructField("city", StringType()), 42 | StructField("extra", StringType()), 43 | ] 44 | ), 45 | ) 46 | 47 | check_if_error_caught = False 48 | expected_names = ["Charlie", "Dave", "Alice", "Bob"] 49 | expected_cities = ["New York", "Los Angeles", "cape town", "delhi"] 50 | 51 | # Call the append_if_schema_identical function 52 | result = quinn.append_if_schema_identical(source_df, target_df) 53 | 54 | # check result content 55 | names = [i.name for i in result.select("name").collect()] 56 | cities = [i.city for i in result.select("city").collect()] 57 | 58 | if result.count() != 4: 59 | raise AssertionError("result should have 4 rows") 60 | 61 | if names != expected_names: 62 | raise AssertionError("result should have the correct names") 63 | 64 | if cities != expected_cities: 65 | raise AssertionError("result should have the correct cities") 66 | 67 | assert_basic_schema_equality(target_df.schema, 
result.schema) 68 | 69 | try: 70 | quinn.append_if_schema_identical(source_df, unidentical_df) 71 | except SchemaMismatchError: 72 | check_if_error_caught = True 73 | 74 | if not check_if_error_caught: 75 | raise AssertionError( 76 | "append_if_schema_identical should raise an error if the schemas are not identical" 77 | ) 78 | -------------------------------------------------------------------------------- /tests/test_dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | import quinn 2 | from .spark import spark 3 | import chispa 4 | from pyspark.sql.types import IntegerType, StringType, StructType, StructField 5 | 6 | 7 | def describe_column_to_list(): 8 | def it_returns_a_list(): 9 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 10 | source_df = spark.createDataFrame(data, ["name", "age"]) 11 | actual = quinn.column_to_list(source_df, "name") 12 | assert ["jose", "li", "luisa"] == actual 13 | 14 | 15 | def describe_two_columns_to_dictionary(): 16 | def it_returns_a_dictionary(): 17 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 18 | source_df = spark.createDataFrame(data, ["name", "age"]) 19 | actual = quinn.two_columns_to_dictionary(source_df, "name", "age") 20 | assert {"jose": 1, "li": 2, "luisa": 3} == actual 21 | 22 | 23 | def describe_to_list_of_dictionaries(): 24 | def returns_a_list_of_dicts(): 25 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 26 | source_df = spark.createDataFrame(data, ["name", "age"]) 27 | actual = quinn.to_list_of_dictionaries(source_df) 28 | expected = [ 29 | {"name": "jose", "age": 1}, 30 | {"name": "li", "age": 2}, 31 | {"name": "luisa", "age": 3}, 32 | ] 33 | assert expected == actual 34 | 35 | 36 | def describe_show_output_to_df(): 37 | def it_converts_a_show_string_to_a_dataframe(): 38 | s = """+----+---+-----------+------+ 39 | |name|age| stuff1|stuff2| 40 | +----+---+-----------+------+ 41 | |jose| 1|nice person| yoyo| 42 | | li| 2|nice person| yoyo| 43 | | liz| 3|nice person| yoyo| 44 | +----+---+-----------+------+""" 45 | actual_df = quinn.show_output_to_df(s, spark) 46 | expected_data = [ 47 | ("jose", "1", "nice person", "yoyo"), 48 | ("li", "2", "nice person", "yoyo"), 49 | ("liz", "3", "nice person", "yoyo"), 50 | ] 51 | expected_df = spark.createDataFrame( 52 | expected_data, ["name", "age", "stuff1", "stuff2"] 53 | ) 54 | chispa.assert_df_equality(expected_df, actual_df) 55 | 56 | 57 | def describe_print_athena_create_table(): 58 | def it_prints_a_create_table_string_for_athena(capsys): 59 | source_df = spark.createDataFrame( 60 | [("jets", "football", 45), ("nacional", "soccer", 10)], 61 | ["team", "sport", "goals_for"], 62 | ) 63 | quinn.print_athena_create_table(source_df, "athena_table", "s3://mock") 64 | out, _ = capsys.readouterr() 65 | assert ( 66 | out 67 | == "CREATE EXTERNAL TABLE IF NOT EXISTS `athena_table` ( \n\t `team` string, \n\t `sport` string, \n\t `goals_for` bigint \n)\nSTORED AS PARQUET\nLOCATION 's3://mock'\n\n" # noqa 68 | ) 69 | 70 | 71 | def test_create_df(): 72 | rows_data = [("jose", 1), ("li", 2), ("luisa", 3)] 73 | col_specs = [("name", StringType()), ("age", IntegerType())] 74 | 75 | expected_schema = StructType( 76 | [ 77 | StructField("name", StringType(), True), 78 | StructField("age", IntegerType(), True), 79 | ] 80 | ) 81 | actual = quinn.create_df(spark, rows_data, col_specs) 82 | expected = spark.createDataFrame(rows_data, expected_schema) 83 | chispa.assert_df_equality(actual, expected) 84 | 
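The tests above double as usage documentation for the DataFrame helpers. A minimal, self-contained sketch pulling them together, assuming a local SparkSession like the one built in tests/spark.py (the "quinn-demo" app name is arbitrary; every quinn call mirrors the tests above):

    # Minimal usage sketch for the helpers exercised above; assumes a local
    # SparkSession like the one built in tests/spark.py.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, StringType

    import quinn

    spark = SparkSession.builder.master("local").appName("quinn-demo").getOrCreate()

    # create_df builds a DataFrame from rows plus (name, type, nullable) column specs.
    df = quinn.create_df(
        spark,
        [("jose", 1), ("li", 2), ("luisa", 3)],
        [("name", StringType(), True), ("age", IntegerType(), True)],
    )

    quinn.column_to_list(df, "name")                    # ['jose', 'li', 'luisa']
    quinn.two_columns_to_dictionary(df, "name", "age")  # {'jose': 1, 'li': 2, 'luisa': 3}
    quinn.to_list_of_dictionaries(df)                   # [{'name': 'jose', 'age': 1}, ...]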
-------------------------------------------------------------------------------- /tests/test_dataframe_validator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.types import StructType, StructField, StringType, LongType 3 | import semver 4 | import quinn 5 | from .spark import spark 6 | 7 | 8 | def describe_validate_presence_of_columns(): 9 | def it_raises_if_a_required_column_is_missing_and_return_bool_is_false(): 10 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 11 | source_df = spark.createDataFrame(data, ["name", "age"]) 12 | with pytest.raises(quinn.DataFrameMissingColumnError) as excinfo: 13 | quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"], False) 14 | assert ( 15 | excinfo.value.args[0] 16 | == "The ['fun'] columns are not included in the DataFrame with the following columns ['name', 'age']" 17 | ) 18 | 19 | def it_does_nothing_if_all_required_columns_are_present_and_return_bool_is_false(): 20 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 21 | source_df = spark.createDataFrame(data, ["name", "age"]) 22 | quinn.validate_presence_of_columns(source_df, ["name"], False) 23 | 24 | def it_returns_false_if_a_required_column_is_missing_and_return_bool_is_true(): 25 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 26 | source_df = spark.createDataFrame(data, ["name", "age"]) 27 | result = quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"], True) 28 | assert result is False 29 | 30 | def it_returns_true_if_all_required_columns_are_present_and_return_bool_is_true(): 31 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 32 | source_df = spark.createDataFrame(data, ["name", "age"]) 33 | result = quinn.validate_presence_of_columns(source_df, ["name"], True) 34 | assert result is True 35 | 36 | 37 | def describe_validate_schema(): 38 | def it_raises_when_struct_field_is_missing_and_return_bool_is_false(): 39 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 40 | source_df = spark.createDataFrame(data, ["name", "age"]) 41 | required_schema = StructType( 42 | [ 43 | StructField("name", StringType(), True), 44 | StructField("city", StringType(), True), 45 | ] 46 | ) 47 | with pytest.raises(quinn.DataFrameMissingStructFieldError) as excinfo: 48 | quinn.validate_schema(source_df, required_schema, return_bool = False) 49 | 50 | current_spark_version = semver.Version.parse(spark.version) 51 | spark_330 = semver.Version.parse("3.3.0") 52 | if semver.Version.compare(current_spark_version, spark_330) >= 0: # Spark 3.3+ 53 | expected_error_message = "The [StructField('city', StringType(), True)] StructFields are not included in the DataFrame with the following StructFields StructType([StructField('name', StringType(), True), StructField('age', LongType(), True)])" # noqa 54 | else: 55 | expected_error_message = "The [StructField(city,StringType,true)] StructFields are not included in the DataFrame with the following StructFields StructType(List(StructField(name,StringType,true),StructField(age,LongType,true)))" # noqa 56 | assert excinfo.value.args[0] == expected_error_message 57 | 58 | def it_does_nothing_when_the_schema_matches_and_return_bool_is_false(): 59 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 60 | source_df = spark.createDataFrame(data, ["name", "age"]) 61 | required_schema = StructType( 62 | [ 63 | StructField("name", StringType(), True), 64 | StructField("age", LongType(), True), 65 | ] 66 | ) 67 | quinn.validate_schema(source_df, required_schema, return_bool = False) 68 | 69 | 
def it_returns_false_when_struct_field_is_missing_and_return_bool_is_true(): 70 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 71 | source_df = spark.createDataFrame(data, ["name", "age"]) 72 | required_schema = StructType( 73 | [ 74 | StructField("name", StringType(), True), 75 | StructField("city", StringType(), True), 76 | ] 77 | ) 78 | result = quinn.validate_schema(source_df, required_schema, return_bool = True) 79 | assert result is False 80 | 81 | def it_returns_true_when_the_schema_matches_and_return_bool_is_true(): 82 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 83 | source_df = spark.createDataFrame(data, ["name", "age"]) 84 | required_schema = StructType( 85 | [ 86 | StructField("name", StringType(), True), 87 | StructField("age", LongType(), True), 88 | ] 89 | ) 90 | result = quinn.validate_schema(source_df, required_schema, return_bool = True) 91 | assert result is True 92 | 93 | def nullable_column_mismatches_are_ignored(): 94 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 95 | source_df = spark.createDataFrame(data, ["name", "age"]) 96 | required_schema = StructType( 97 | [ 98 | StructField("name", StringType(), True), 99 | StructField("age", LongType(), False), 100 | ] 101 | ) 102 | quinn.validate_schema(source_df, required_schema, ignore_nullable=True, return_bool = False) 103 | 104 | 105 | def describe_validate_absence_of_columns(): 106 | def it_raises_when_a_unallowed_column_is_present_and_return_bool_is_false(): 107 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 108 | source_df = spark.createDataFrame(data, ["name", "age"]) 109 | with pytest.raises(quinn.DataFrameProhibitedColumnError) as excinfo: 110 | quinn.validate_absence_of_columns(source_df, ["age", "cool"], False) 111 | assert ( 112 | excinfo.value.args[0] 113 | == "The ['age'] columns are not allowed to be included in the DataFrame with the following columns ['name', 'age']" # noqa 114 | ) 115 | 116 | def it_does_nothing_when_no_unallowed_columns_are_present_and_return_bool_is_false(): 117 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 118 | source_df = spark.createDataFrame(data, ["name", "age"]) 119 | quinn.validate_absence_of_columns(source_df, ["favorite_color"], False) 120 | 121 | def it_returns_false_when_a_unallowed_column_is_present_and_return_bool_is_true(): 122 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 123 | source_df = spark.createDataFrame(data, ["name", "age"]) 124 | result = quinn.validate_absence_of_columns(source_df, ["age", "cool"], True) 125 | assert result is False 126 | 127 | def it_returns_true_when_no_unallowed_columns_are_present_and_return_bool_is_true(): 128 | data = [("jose", 1), ("li", 2), ("luisa", 3)] 129 | source_df = spark.createDataFrame(data, ["name", "age"]) 130 | result = quinn.validate_absence_of_columns(source_df, ["favorite_color"], True) 131 | assert result is True 132 | -------------------------------------------------------------------------------- /tests/test_files/bad_schema.csv: -------------------------------------------------------------------------------- 1 | whatever,type,nullable,metadata 2 | blah,string,false,{"description":"The person's name"} 3 | no,string -------------------------------------------------------------------------------- /tests/test_files/good_schema1.csv: -------------------------------------------------------------------------------- 1 | name,type,nullable,metadata 2 | person,string,false,{"description":"The person's name"} 3 | address,string 4 | phoneNumber,string,TRUE,{"description":"The person's phone number"} 5 | age,int,False 
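good_schema1.csv above is the fixture that schema_from_csv parses. A sketch of the round trip, assuming a running SparkSession named spark; the commented-out StructType mirrors the expected schema in tests/test_schema_helpers.py:

    # Sketch: how schema_from_csv interprets good_schema1.csv above.
    # Assumes a running SparkSession named `spark`.
    from quinn.schema_helpers import schema_from_csv

    schema = schema_from_csv(spark, "tests/test_files/good_schema1.csv")
    # Missing nullable cells default to True and missing metadata cells to {},
    # so the result is equivalent to:
    # StructType([
    #     StructField("person", StringType(), False, {"description": "The person's name"}),
    #     StructField("address", StringType(), True),
    #     StructField("phoneNumber", StringType(), True, {"description": "The person's phone number"}),
    #     StructField("age", IntegerType(), False),
    # ])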
-------------------------------------------------------------------------------- /tests/test_files/good_schema2.csv: -------------------------------------------------------------------------------- 1 | name,type 2 | person,string 3 | address,string 4 | phoneNumber,string 5 | age,int -------------------------------------------------------------------------------- /tests/test_files/some_pyspark.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | 4 | print("hi") 5 | 6 | spark = SparkSession.builder.appName('my_app').getOrCreate() 7 | sparkContext = spark.sparkContext 8 | rdd=sparkContext.parallelize([1,2,3,4,5]) 9 | rddCollect = rdd.collect() 10 | print("Number of Partitions: "+str(rdd.getNumPartitions())) 11 | print("Action: First element: "+str(rdd.first())) 12 | print(rddCollect) 13 | 14 | print("bye") -------------------------------------------------------------------------------- /tests/test_keyword_finder.py: -------------------------------------------------------------------------------- 1 | from quinn.keyword_finder import search_file, search_files, keyword_format, surround_substring 2 | 3 | 4 | def test_search_file(): 5 | file_path = "tests/test_files/some_pyspark.py" 6 | results = search_file(file_path) 7 | 8 | assert results.word_count["rdd"] == 5 9 | assert results.word_count["sparkContext"] == 2 10 | 11 | 12 | def test_search_files(): 13 | results = search_files("tests/test_files") 14 | 15 | pyspark_file = [result for result in results if result.file_path == "tests/test_files/some_pyspark.py"][0] 16 | csv_file = [result for result in results if result.file_path == "tests/test_files/good_schema1.csv"][0] 17 | 18 | assert pyspark_file.word_count["rdd"] == 5 19 | assert pyspark_file.word_count["sparkContext"] == 2 20 | assert csv_file.word_count["rdd"] == 0 21 | 22 | 23 | def test_keyword_format(): 24 | print(keyword_format("spark rdd stuff")) 25 | print(keyword_format("spark rdd stuff with bad _jvm")) 26 | print(keyword_format("nice string")) 27 | print(keyword_format("")) 28 | 29 | 30 | def test_surround_substring(): 31 | 32 | assert "spark **rdd|| stuff" == surround_substring("spark rdd stuff", "rdd", "**", "||") 33 | assert "spark **rdd|| stuff with **rdd||" == surround_substring("spark rdd stuff with rdd", "rdd", "**", "||") 34 | assert "spark **rdd||dd stuff" == surround_substring("spark rdddd stuff", "rdd", "**", "||") 35 | -------------------------------------------------------------------------------- /tests/test_math.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | import pytest 4 | import quinn 5 | import math 6 | from .spark import spark 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "mean, scale", 11 | [ 12 | (1.0, 2.0), 13 | (2.0, 3.0), 14 | (3.0, 4.0), 15 | ], 16 | ) 17 | @pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"]) 18 | def test_rand_laplace(mean: float, scale: float): 19 | stats = ( 20 | spark.range(100000) 21 | .select(quinn.rand_laplace(mean, scale, 42)) 22 | .agg( 23 | F.mean("laplace_random").alias("mean"), 24 | F.stddev("laplace_random").alias("std_dev"), 25 | ) 26 | .first() 27 | ) 28 | 29 | laplace_mean = stats["mean"] 30 | laplace_stddev = stats["std_dev"] 31 | 32 | # A Laplace distribution with the given mean and scale has mean=mean and stddev=scale*sqrt(2) 33 | assert abs(laplace_mean - mean) <= 0.1 34 | assert abs(laplace_stddev - scale * math.sqrt(2.0)) <= 0.1 35 | 36 | 37 | 
@pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"]) 38 | def test_rand_range(): 39 | lower_bound = 5 40 | upper_bound = 10 41 | stats = ( 42 | spark.range(1000) 43 | .select(quinn.rand_range(lower_bound, upper_bound).alias("rand_uniform")) 44 | .agg(F.min("rand_uniform").alias("min"), F.max("rand_uniform").alias("max")) 45 | .first() 46 | ) 47 | 48 | uniform_min = stats["min"] 49 | uniform_max = stats["max"] 50 | 51 | assert lower_bound <= uniform_min <= uniform_max <= upper_bound 52 | 53 | 54 | @pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"]) 55 | def test_randn(): 56 | mean = 1.0 57 | variance = 2.0 58 | stats = ( 59 | spark.range(1000) 60 | .select(quinn.randn(mean, variance).alias("rand_normal")) 61 | .agg( 62 | F.mean("rand_normal").alias("agg_mean"), 63 | F.variance("rand_normal").alias("agg_variance"), 64 | ) 65 | .first() 66 | ) 67 | 68 | agg_mean = stats["agg_mean"] 69 | agg_variance = stats["agg_variance"] 70 | 71 | assert abs(agg_mean - mean) <= 0.1 72 | assert abs(agg_variance - variance) <= 0.2 73 | -------------------------------------------------------------------------------- /tests/test_schema_helpers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pyspark.sql.types import ( 4 | StructType, 5 | IntegerType, 6 | DecimalType, 7 | ArrayType, 8 | FloatType, 9 | MapType, 10 | StringType, 11 | DoubleType, 12 | TimestampType, 13 | StructField, 14 | ) 15 | import pyspark.sql.dataframe 16 | 17 | from quinn.schema_helpers import print_schema_as_code, schema_from_csv, complex_fields 18 | 19 | from chispa.schema_comparer import assert_basic_schema_equality 20 | import pytest 21 | 22 | from .spark import spark 23 | 24 | 25 | def test_print_schema_as_code(): 26 | fields = [] 27 | fields.append(StructField("simple_int", IntegerType())) 28 | fields.append(StructField("decimal_with_nums", DecimalType(19, 8))) 29 | fields.append(StructField("array", ArrayType(FloatType()))) 30 | fields.append(StructField("map", MapType(StringType(), ArrayType(DoubleType())))) 31 | fields.append( 32 | StructField( 33 | "struct", 34 | StructType( 35 | [ 36 | StructField("first", StringType()), 37 | StructField("second", TimestampType()), 38 | ] 39 | ), 40 | ) 41 | ) 42 | 43 | schema = StructType(fields=fields) 44 | 45 | assert_basic_schema_equality(schema, eval(print_schema_as_code(schema))) 46 | 47 | 48 | def test_schema_from_csv_good_schema1(): 49 | expected_schema = StructType( 50 | [ 51 | StructField( 52 | "person", StringType(), False, {"description": "The person's name"} 53 | ), 54 | StructField("address", StringType(), True), 55 | StructField( 56 | "phoneNumber", 57 | StringType(), 58 | True, 59 | {"description": "The person's phone number"}, 60 | ), 61 | StructField("age", IntegerType(), False), 62 | ] 63 | ) 64 | path = "tests/test_files/good_schema1.csv" 65 | assert_basic_schema_equality(expected_schema, schema_from_csv(spark, path)) 66 | 67 | 68 | def test_schema_from_csv_good_schema2(): 69 | expected_schema = StructType( 70 | [ 71 | StructField("person", StringType(), True), 72 | StructField("address", StringType(), True), 73 | StructField("phoneNumber", StringType(), True), 74 | StructField("age", IntegerType(), True), 75 | ] 76 | ) 77 | path = "tests/test_files/good_schema2.csv" 78 | assert_basic_schema_equality(expected_schema, schema_from_csv(spark, path)) 79 | 80 | 81 | def test_schema_from_csv_equality_for_bad_csv(): 82 | path = "tests/test_files/bad_schema.csv" 83 | with 
pytest.raises(ValueError) as excinfo: 84 | schema_from_csv(spark, path) 85 | assert ( 86 | excinfo.value.args[0] 87 | == "CSV must contain columns in this order: ['name', 'type', 'nullable', 'metadata']" 88 | ) 89 | 90 | 91 | def test_complex_fields(): 92 | schema = StructType( 93 | [ 94 | StructField("id", IntegerType(), True), 95 | StructField( 96 | "details", 97 | StructType( 98 | [ 99 | StructField("name", StringType(), True), 100 | StructField("address", StringType(), True), 101 | StructField("age", IntegerType(), True), 102 | ] 103 | ), 104 | True, 105 | ), 106 | ] 107 | ) 108 | expected = { 109 | "details": StructType( 110 | [ 111 | StructField("name", StringType(), True), 112 | StructField("address", StringType(), True), 113 | StructField("age", IntegerType(), True), 114 | ] 115 | ) 116 | } 117 | assert complex_fields(schema) == expected 118 | -------------------------------------------------------------------------------- /tests/test_split_columns.py: -------------------------------------------------------------------------------- 1 | import quinn 2 | import chispa 3 | import pytest 4 | from .spark import spark 5 | 6 | 7 | def test_split_columns(): 8 | data = [ 9 | ("chrisXXmoe", 2025, "bio"), 10 | ("davidXXbb", 2026, "physics"), 11 | (None, 2025, "physics"), 12 | ] 13 | df = spark.createDataFrame(data, ["student_name", "graduation_year", "major"]) 14 | new_df = quinn.split_col( 15 | df, 16 | col_name="student_name", 17 | delimiter="XX", 18 | new_col_names=["student_first_name", "student_last_name"], 19 | mode="permissive", 20 | ) 21 | data = [ 22 | (2025, "bio", "chris", "moe"), 23 | (2026, "physics", "david", "bb"), 24 | (2025, "physics", None, None), 25 | ] 26 | expected = spark.createDataFrame( 27 | data, ["graduation_year", "major", "student_first_name", "student_last_name"] 28 | ) 29 | chispa.assert_df_equality(new_df, expected) 30 | 31 | 32 | def test_split_columns_advanced(): 33 | data = [ 34 | ("chrisXXsomethingXXmoe", 2025, "bio"), 35 | ("davidXXbb", 2026, "physics"), 36 | (None, 2025, "physics"), 37 | ] 38 | df = spark.createDataFrame(data, ["student_name", "graduation_year", "major"]) 39 | new_df = quinn.split_col( 40 | df, 41 | col_name="student_name", 42 | delimiter="XX", 43 | new_col_names=[ 44 | "student_first_name", 45 | "student_middle_name", 46 | "student_last_name", 47 | ], 48 | mode="permissive", 49 | ) 50 | data = [ 51 | (2025, "bio", "chris", "something", "moe"), 52 | (2026, "physics", "david", "bb", None), 53 | (2025, "physics", None, None, None), 54 | ] 55 | expected = spark.createDataFrame( 56 | data, 57 | [ 58 | "graduation_year", 59 | "major", 60 | "student_first_name", 61 | "student_middle_name", 62 | "student_last_name", 63 | ], 64 | ) 65 | chispa.assert_df_equality(new_df, expected) 66 | 67 | 68 | def test_split_columns_strict(): 69 | data = [ 70 | ("chrisXXsomethingXXmoe", 2025, "bio"), 71 | ("davidXXbb", 2026, "physics"), 72 | (None, 2025, "physics"), 73 | ] 74 | df = spark.createDataFrame(data, ["student_name", "graduation_year", "major"]) 75 | df2 = quinn.split_col( 76 | df, 77 | col_name="student_name", 78 | delimiter="XX", 79 | new_col_names=[ 80 | "student_first_name", 81 | "student_middle_name", 82 | "student_last_name", 83 | ], 84 | mode="strict", 85 | default="hi", 86 | ) 87 | with pytest.raises( 88 | Exception 89 | ): # the exact exception type raised differs across PySpark versions 90 | df2.show() 91 | --------------------------------------------------------------------------------
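To round out the split_col tests above, a short usage sketch of the two modes, assuming a running SparkSession named spark; the column names below are illustrative:

    # Sketch of split_col's two modes; assumes a SparkSession named `spark`.
    import quinn

    df = spark.createDataFrame(
        [("chrisXXmoe", 2025), ("davidXXbb", 2026), (None, 2025)],
        ["student_name", "graduation_year"],
    )

    # Permissive mode: missing or empty split values fall back to `default`
    # (None here), so the all-None row simply yields None in both new columns.
    permissive = quinn.split_col(
        df, col_name="student_name", delimiter="XX",
        new_col_names=["first_name", "last_name"], mode="permissive",
    )

    # Strict mode: a UDF counts delimiters per row, so a row with the wrong
    # number of parts raises once the result is evaluated (e.g. by .show()).
    strict = quinn.split_col(
        df, col_name="student_name", delimiter="XX",
        new_col_names=["first_name", "last_name"], mode="strict",
    )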