├── .git-blame-ignore-revs ├── .gitattributes ├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── scripts │ └── docker_compose_ready.sh └── workflows │ ├── nightly_tests.yml │ ├── release.yml │ ├── test.yml │ ├── tests.yml │ └── update-lockfiles.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── docker-compose.yaml ├── docker_db2.env_list ├── docs ├── Makefile ├── make.bat ├── package │ └── README.md └── source │ ├── best_practices.md │ ├── changelog.md │ ├── conf.py │ ├── database_testing.md │ ├── examples.md │ ├── examples │ ├── best_practices_inline.md │ ├── best_practices_instances.md │ ├── best_practices_sql.md │ ├── color_legend.svg │ ├── environment.yml │ ├── group_and_visualize.ipynb │ ├── group_and_visualize.md │ ├── group_and_visualize01.svg │ ├── group_and_visualize02.svg │ ├── group_and_visualize03.svg │ ├── group_and_visualize04.svg │ ├── group_and_visualize05.svg │ ├── group_and_visualize06.svg │ ├── imperative_materialization.md │ ├── interactive_development.md │ ├── multi_instance_pipeline.md │ ├── multi_instance_pipeline.zip │ ├── raw_sql.md │ ├── raw_sql.zip │ ├── realistic_pipeline.md │ ├── realistic_pipeline.zip │ ├── simple_pipeline.md │ ├── simple_pipeline01.svg │ ├── stage_validation.md │ └── stage_validation.svg │ ├── index.md │ ├── license.md │ ├── quickstart.md │ ├── reference │ ├── api.rst │ ├── cli.md │ └── config.md │ └── table_backends.md ├── example ├── run_pipeline.py ├── simple_pipeline.py ├── stage_validation.py ├── visualization.py └── visualization_legend.py ├── example_imperative ├── failing_example.py └── run_pipeline.py ├── example_interactive ├── failing_flow_after_successful_debugging.py └── run_tasks_interactively.py ├── example_postgres ├── docker-compose.yaml ├── pipedag.yaml └── run_pipeline.py ├── pipedag.yaml ├── pixi.lock ├── pixi.toml ├── pyproject.toml ├── pytest.ini ├── src └── pydiverse │ ├── .gitignore │ └── pipedag │ ├── __init__.py │ ├── _typing.py │ ├── backend │ ├── __init__.py │ ├── blob.py │ ├── lock │ │ ├── __init__.py │ │ ├── base.py │ │ ├── database.py │ │ ├── filelock.py │ │ ├── nolock.py │ │ └── zookeeper.py │ └── table │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cache │ │ ├── __init__.py │ │ ├── base.py │ │ └── parquet.py │ │ ├── dict.py │ │ ├── sql │ │ ├── __init__.py │ │ ├── ddl.py │ │ ├── dialects │ │ │ ├── __init__.py │ │ │ ├── duckdb.py │ │ │ ├── ibm_db2.py │ │ │ ├── mssql.py │ │ │ ├── postgres.py │ │ │ └── snowflake.py │ │ ├── hooks.py │ │ ├── reflection.py │ │ └── sql.py │ │ └── util │ │ ├── __init__.py │ │ └── dtype.py │ ├── container │ └── __init__.py │ ├── context │ ├── __init__.py │ ├── context.py │ ├── run_context.py │ └── trace_hook.py │ ├── core │ ├── __init__.py │ ├── config.py │ ├── flow.py │ ├── group_node.py │ ├── result.py │ ├── stage.py │ └── task.py │ ├── debug │ └── __init__.py │ ├── engine │ ├── __init__.py │ ├── base.py │ ├── dask.py │ ├── prefect.py │ └── sequential.py │ ├── errors │ └── __init__.py │ ├── management │ ├── __init__.py │ ├── cli.py │ └── commands │ │ ├── __init__.py │ │ ├── clear_metadata.py │ │ └── delete_schemas.py │ ├── materialize │ ├── __init__.py │ ├── cache.py │ ├── core.py │ ├── debug.py │ ├── details.py │ ├── metadata.py │ └── store.py │ └── util │ ├── __init__.py │ ├── computation_tracing.py │ ├── deep_map.py │ ├── deep_merge.py │ ├── disposable.py │ ├── hashing.py │ ├── import_.py │ ├── ipc.py │ ├── json.py │ ├── naming.py │ └── structlog.py └── tests ├── __init__.py ├── conftest.py ├── fixtures ├── 
__init__.py └── instances.py ├── parallelize ├── README.md ├── __init__.py ├── hooks.py ├── plugin.py ├── sesson.py ├── util.py └── worker.py ├── test_cache ├── test_auto_version.py ├── test_basic_cache_invalidation.py ├── test_ignore_cache_function.py └── test_local_table_cache.py ├── test_compression.py ├── test_core.py ├── test_dask.py ├── test_flows ├── complex_config_flows │ ├── pipedag_anchor.yaml │ ├── pipedag_complex.yaml │ ├── postgres_password.yaml │ ├── test_instance_selection.py │ └── test_locking_instances.py ├── raw_sql_scripts │ ├── mssql │ │ ├── create_db_helpers.sql │ │ ├── prep │ │ │ ├── entity_checks.sql │ │ │ └── more_tables.sql │ │ └── raw │ │ │ └── raw_views.sql │ ├── mssql_pytsql │ │ ├── create_db_helpers.sql │ │ ├── prep │ │ │ ├── entity_checks.sql │ │ │ └── more_tables.sql │ │ └── raw │ │ │ └── raw_views.sql │ └── mssql_pytsql_isolate │ │ ├── create_db_helpers.sql │ │ ├── prep │ │ ├── entity_checks.sql │ │ └── more_tables.sql │ │ └── raw │ │ └── raw_views.sql ├── sql_scripts │ ├── script1-db2.sql │ ├── script1.sql │ └── script2.sql ├── test_example.py ├── test_flow.py ├── test_raw_sql_pipeline.py ├── test_simple_flow.py ├── test_source_invalidation.py └── test_sql_text_node.py ├── test_indexes.py ├── test_input_stage_versions.py ├── test_inputs.py ├── test_lock_manager.py ├── test_materialize.py ├── test_materializing_task.py ├── test_raw_sql ├── scripts │ ├── mssql │ │ ├── create_tables │ │ │ └── simple_tables.sql │ │ └── schema_swap │ │ │ ├── check_objects.sql │ │ │ └── create_objects.sql │ └── postgres │ │ └── create_tables │ │ └── simple_tables.sql ├── test_raw_sql_input.py ├── test_raw_sql_schema_swap.py └── util.py ├── test_run_group_node.json ├── test_run_group_node.py ├── test_run_subflow.py ├── test_sql_ddl.py ├── test_sql_dialect ├── scripts │ ├── lock │ ├── simple_nicknames.sql │ └── simple_table_spaces.sql ├── test_ibm_db2.py └── test_postgres.py ├── test_table_hooks ├── lock ├── test_dtype_pandas.py ├── test_dtype_polars.py ├── test_dtype_pyarrow.py ├── test_dtype_sqlalchemy.py ├── test_ibis.py ├── test_pandas_hook.py ├── test_pdtransform.py ├── test_polars.py ├── test_sql_table_reference.py └── test_tidypolars.py ├── test_unicode.py ├── test_util.py └── util ├── __init__.py ├── baseline.py ├── dask_patch.py ├── pytest_raises.py ├── spy.py ├── sql.py ├── tasks_library.py └── tasks_library_imperative.py /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Turn into installable package 2 | af97118ac7596c7b83abf5c4739451bf70fefa18 3 | # Reformat using black 4 | a8fc1a37386d867759f6526f159e8f586bdaedc3 5 | # Ruff 6 | e14623fce7efea34513b2efa2701ccf59b4df559 7 | 7eed813324e91c98052faa7177fdbf4320fb1ee1 8 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.{diff,patch} binary 4 | 5 | *.{py,yaml,yml,sh} text eol=lf 6 | *.bat text eol=crlf 7 | 8 | pixi.lock merge=binary linguist-language=YAML linguist-generated=true 9 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @pydiverse/code-owners -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | # Checklist 7 | 8 
| - [ ] Added a `docs/source/changelog.md` entry 9 | - [ ] Added/updated documentation in `docs/source/` 10 | - [ ] Added/updated examples in `docs/source/examples.md` 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: / 5 | schedule: 6 | interval: monthly 7 | groups: 8 | gh-actions: 9 | patterns: 10 | - "*" 11 | -------------------------------------------------------------------------------- /.github/scripts/docker_compose_ready.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script checks if all the services defined in our docker compose file 3 | # are up and running. 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | running_services=$(docker compose ps --services --status running) 9 | 10 | if [[ "$running_services" =~ "postgres" ]]; then 11 | docker compose logs postgres 2>&1 | grep "database system is ready to accept connections" > /dev/null 12 | fi 13 | 14 | if [[ "$running_services" =~ "mssql" ]]; then 15 | docker compose logs mssql 2>&1 | grep "SQL Server is now ready for client connections" > /dev/null 16 | fi 17 | 18 | if [[ "$running_services" =~ "ibm_db2" ]]; then 19 | docker compose logs ibm_db2 2>&1 | grep "Setup has completed" > /dev/null 20 | fi 21 | 22 | if [[ "$running_services" =~ "zoo" ]]; then 23 | echo ruok | nc localhost 2181 > /dev/null 24 | fi 25 | -------------------------------------------------------------------------------- /.github/workflows/nightly_tests.yml: -------------------------------------------------------------------------------- 1 | name: Nightly Tests 2 | 3 | on: 4 | schedule: 5 | - cron: "0 2 * * *" 6 | workflow_dispatch: 7 | 8 | jobs: 9 | check: 10 | runs-on: ubuntu-latest 11 | name: Check latest commit 12 | outputs: 13 | should-run: ${{ steps.should-run.outputs.should-run }} 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: check if latest commit is within 24 hrs 18 | id: should-run 19 | continue-on-error: true 20 | if: ${{ github.event_name == 'schedule' }} 21 | run: test -z $(git rev-list --after=\"24 hours\" ${{ github.sha }}) && echo \"::set-output name=should-run::false\" 22 | 23 | os_test: 24 | name: OS Test 25 | needs: [check] 26 | if: ${{ needs.check.outputs.should-run != 'false' }} 27 | strategy: 28 | matrix: 29 | os: 30 | - ubuntu-latest 31 | - macos-latest 32 | - windows-latest 33 | environment: 34 | - py312 35 | - py311 36 | - py39 37 | - py39pdsa1 38 | uses: ./.github/workflows/test.yml 39 | with: 40 | os: ${{ matrix.os }} 41 | environment: ${{ matrix.environment }} 42 | docker-services: | 43 | postgres 44 | zoo 45 | pytest-arguments: --mssql -m mssql --polars --ibis --pdtransform 46 | 47 | library_version_test: 48 | name: Library Version Test 49 | needs: [check] 50 | if: ${{ needs.check.outputs.should-run != 'false' }} 51 | strategy: 52 | matrix: 53 | os: 54 | - ubuntu-latest 55 | - macos-latest 56 | - windows-latest 57 | environment: 58 | - py312all 59 | - py311all 60 | - py310all 61 | - py39pdsa1all 62 | - py311pdsa1all 63 | uses: ./.github/workflows/test.yml 64 | with: 65 | os: ${{ matrix.os }} 66 | environment: ${{ matrix.environment }} 67 | docker-services: | 68 | postgres 69 | zoo 70 | pytest-arguments: --mssql --ibm_db2 --snowflake --polars --pdtransform 71 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - '*.*.*' 9 | pull_request: 10 | 11 | jobs: 12 | build: 13 | name: Build Package 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout branch 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up pixi 20 | uses: prefix-dev/setup-pixi@v0.8.1 21 | with: 22 | environments: release 23 | 24 | - name: Ensure tag matches version 25 | if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') }} 26 | run: | 27 | version="$(pixi exec -s go-yq -- yq .project.version pyproject.toml)" 28 | tag="${{ github.ref_name }}" 29 | if [ "$version" != "$tag" ]; then 30 | echo "Tag $tag does not match version $version" 31 | exit 1 32 | fi 33 | 34 | - name: Build 35 | run: pixi run -e release hatch build 36 | 37 | - name: Check build 38 | run: pixi run -e release twine check dist/* 39 | 40 | - name: List files 41 | run: ls -l dist/ 42 | 43 | - name: Upload package 44 | uses: actions/upload-artifact@v4 45 | with: 46 | name: artifact 47 | path: dist/* 48 | 49 | release: 50 | name: Publish Package 51 | if: startsWith(github.ref, 'refs/tags/') 52 | needs: [build] 53 | runs-on: ubuntu-latest 54 | permissions: 55 | id-token: write 56 | contents: write 57 | environment: pypi 58 | steps: 59 | - uses: actions/download-artifact@v4 60 | with: 61 | name: artifact 62 | path: dist 63 | - name: Publish package on PyPi 64 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 65 | with: 66 | # the twine version in the container is outdated 67 | # and results in a false positive 68 | verify-metadata: false 69 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_call: 3 | inputs: 4 | os: 5 | required: true 6 | type: string 7 | environment: 8 | required: true 9 | type: string 10 | docker-services: 11 | required: false 12 | type: string 13 | pytest-arguments: 14 | required: false 15 | type: string 16 | workers: 17 | default: 4 18 | type: number 19 | timeout-minutes: 20 | default: 30 21 | type: number 22 | secrets: 23 | SNOWFLAKE_PASSWORD: 24 | required: false 25 | SNOWFLAKE_ACCOUNT: 26 | required: false 27 | SNOWFLAKE_USER: 28 | required: false 29 | 30 | jobs: 31 | test: 32 | name: pytest 33 | runs-on: ${{ inputs.os }} 34 | timeout-minutes: ${{ inputs.timeout-minutes }} 35 | steps: 36 | - uses: actions/checkout@v4 37 | 38 | - name: Setup Pixi 39 | uses: prefix-dev/setup-pixi@v0.8.1 40 | with: 41 | environments: ${{ inputs.environment }} 42 | 43 | - name: Start Docker Compose 44 | if: ${{ inputs.docker-services != '' }} 45 | uses: isbang/compose-action@e5813a5909aca4ae36058edae58f6e52b9c971f8 46 | with: 47 | compose-file: docker-compose.yaml 48 | services: ${{ inputs.docker-services }} 49 | 50 | - name: Install Microsoft ODBC 51 | if: ${{ contains(inputs.docker-services, 'mssql') }} 52 | run: sudo ACCEPT_EULA=Y apt-get install -y msodbcsql18 53 | 54 | - name: Wait for Docker Servers 55 | if: ${{ inputs.docker-services != '' }} 56 | run: | 57 | until bash ./.github/scripts/docker_compose_ready.sh; do 58 | sleep 1 59 | done 60 | 61 | - name: Run tests 62 | env: 63 | SNOWFLAKE_PASSWORD: ${{ secrets.SNOWFLAKE_PASSWORD }} 64 | SNOWFLAKE_ACCOUNT: ${{ 
secrets.SNOWFLAKE_ACCOUNT }} 65 | SNOWFLAKE_USER: ${{ secrets.SNOWFLAKE_USER }} 66 | run: | 67 | pixi run -e ${{ inputs.environment }} pytest tests ${RUNNER_DEBUG:+-v} --color=yes --workers=${{ inputs.workers }} ${{ inputs.pytest-arguments }} 68 | -------------------------------------------------------------------------------- /.github/workflows/update-lockfiles.yml: -------------------------------------------------------------------------------- 1 | name: Update lockfiles 2 | 3 | permissions: 4 | contents: write 5 | pull-requests: write 6 | 7 | on: 8 | workflow_dispatch: 9 | schedule: 10 | - cron: 0 5 1 * * 11 | 12 | jobs: 13 | pixi-update: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up pixi 18 | uses: prefix-dev/setup-pixi@v0.8.1 19 | with: 20 | run-install: false 21 | - name: Update lockfiles 22 | run: | 23 | set -euo pipefail 24 | pixi update --json | pixi exec pixi-diff-to-markdown >> diff.md 25 | - name: Create pull request 26 | uses: peter-evans/create-pull-request@v6 27 | with: 28 | token: ${{ secrets.GITHUB_TOKEN }} 29 | commit-message: Update pixi lockfile 30 | title: Update pixi lockfile 31 | body-path: diff.md 32 | branch: update-pixi 33 | base: main 34 | labels: pixi 35 | delete-branch: true 36 | add-paths: pixi.lock 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | .envrc 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | .asv 29 | pip-wheel-metadata 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | /.pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | docs/api/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # pycharm 109 | /.idea/ 110 | 111 | 112 | # experiments 113 | private_* 114 | 115 | # mlflow 116 | mlruns 117 | 118 | # vscode 119 | .vscode 120 | 121 | # direnv 122 | .envrc 123 | 124 | # baseline update files 125 | *.updated.json 126 | 127 | # pixi 128 | .pixi -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 23.3.0 4 | hooks: 5 | - id: black 6 | language_version: python3.9 7 | - repo: https://github.com/charliermarsh/ruff-pre-commit 8 | rev: v0.0.270 9 | hooks: 10 | - id: ruff 11 | - repo: https://github.com/asottile/pyupgrade 12 | rev: v3.3.1 13 | hooks: 14 | - id: pyupgrade 15 | args: 16 | - --py39-plus 17 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | python: mambaforge-latest 9 | commands: 10 | - mamba install -c conda-forge -c nodefaults pixi 11 | - pixi run -e docs postinstall 12 | - pixi run -e docs docs 13 | - pixi run -e docs readthedocs 14 | sphinx: 15 | configuration: docs/source/conf.py 16 | formats: 17 | - pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, pydiverse 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. 
Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | postgres: 3 | image: postgres 4 | environment: 5 | POSTGRES_USER: sa 6 | POSTGRES_PASSWORD: Pydiverse23 7 | ports: 8 | - "6543:5432" 9 | mssql: 10 | image: mcr.microsoft.com/azure-sql-edge 11 | environment: 12 | ACCEPT_EULA: Y 13 | SA_PASSWORD: PydiQuant27 14 | ports: 15 | - "1433:1433" 16 | zoo: 17 | image: zookeeper 18 | environment: 19 | ZOO_4LW_COMMANDS_WHITELIST: ruok 20 | ZOO_MAX_CLIENT_CNXNS: 100 21 | ports: 22 | - "2181:2181" 23 | ibm_db2: 24 | platform: linux/x86_64 25 | image: icr.io/db2_community/db2 26 | privileged: true 27 | environment: 28 | LICENSE: accept 29 | DB2INSTANCE: db2inst1 30 | DB2INST1_PASSWORD: password 31 | DBNAME: testdb 32 | UPDATEAVAIL: NO 33 | ports: 34 | - 50000:50000 35 | -------------------------------------------------------------------------------- /docker_db2.env_list: -------------------------------------------------------------------------------- 1 | LICENSE=accept 2 | DB2INSTANCE=db2inst1 3 | DB2INST1_PASSWORD=password 4 | DBNAME=testdb 5 | BLU=false 6 | ENABLE_ORACLE_COMPATIBILITY=false 7 | UPDATEAVAIL=NO 8 | TO_CREATE_SAMPLEDB=false 9 | REPODB=false 10 | IS_OSXFS=false 11 | PERSISTENT_HOME=true 12 | HADR_ENABLED=false 13 | ETCD_ENDPOINT= 14 | ETCD_USERNAME= 15 | ETCD_PASSWORD= 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | livehtml: 23 | sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) --watch ../src $(O) 24 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/best_practices.md: -------------------------------------------------------------------------------- 1 | # Best Practices for data pipelines 2 | 3 | The Python community is very concerned with enabling users to stitch together a few code snippets that run as a py file 4 | or jupyter notebook. However, in practice, projects trying to extract significant business impact from data analytics 5 | very quickly reach a size where more sophisticated code organization is needed. On the one hand, this relates to software 6 | engineering principles like modularization, unit/integration testing, IDE support, CI/CD. On the other hand, data processing 7 | steps are best organized as a pipeline or graph of steps/tasks. Those data pipelines are the focus of the following 8 | best practice suggestions: 9 | 10 | * [moving from Raw SQL over handwritten SELECT statements to programmatic SQL](/examples/best_practices_sql) 11 | * [multiple instances: full_fresh, full_stable, mini_stable, midi_stable](/examples/best_practices_instances) 12 | * [inline views, CTEs, and subqueries](/examples/best_practices_inline) 13 | 14 | ```{toctree} 15 | /examples/best_practices_sql 16 | /examples/best_practices_instances 17 | /examples/best_practices_inline 18 | ``` -------------------------------------------------------------------------------- /docs/source/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | pydiverse.pipedag already has quite a rich set of features. So we like to provide some examples showing typical use cases. 
4 | 5 | * [Quickstart example](/quickstart) 6 | * [Simple pipeline](/examples/simple_pipeline) 7 | * [Working with real database](/database_testing) 8 | * [Imperative materialization / Materializing Subqueries](/examples/imperative_materialization) 9 | * [Interactive development](/examples/interactive_development) 10 | * [Grouping Tasks/Stages and visualization](/examples/group_and_visualize) 11 | * [Stage validation before schema swap](/examples/stage_validation) 12 | * [Slightly more realistic pipeline](/examples/realistic_pipeline) 13 | * [Introduction to vectorization principle with some example pipelines](https://github.com/Quantco/vectorization-tutorial/blob/main/README.md) 14 | * [Multiple instances: full, mini, midi](/examples/multi_instance_pipeline) 15 | * [Raw SQL example](/examples/raw_sql) 16 | * [Best practices / moving from Raw SQL over handwritten SELECT statements to programmatic SQL](/examples/best_practices_sql) 17 | * [Best practices / multiple instances: full_fresh, full_stable, mini_stable, midi_stable](/examples/best_practices_instances) 18 | * [Best practices / inline views, CTEs, and subqueries](/examples/best_practices_inline) 19 | 20 | ```{toctree} 21 | /quickstart 22 | /examples/simple_pipeline 23 | /database_testing 24 | /examples/imperative_materialization 25 | /examples/interactive_development 26 | /examples/group_and_visualize 27 | /examples/stage_validation 28 | /examples/realistic_pipeline 29 | /examples/multi_instance_pipeline 30 | /examples/raw_sql 31 | ``` -------------------------------------------------------------------------------- /docs/source/examples/best_practices_instances.md: -------------------------------------------------------------------------------- 1 | # Best practices: multiple instances: full_fresh, full_stable, mini_stable, midi_stable 2 | 3 | This story expands on the [multi_instance_pipeline example](multi_instance_pipeline) storyline. 4 | 5 | In general, data pipelines process a considerable amount of information, be it tables with 100k to 100 million rows 6 | or even billions. Processing times will thus be many minutes or hours. However, iteration speed of software development 7 | on the pipeline is key: the pipeline transforms the data in a way that increases understanding, and from 8 | better understanding come changes to the code in the data pipeline. 9 | 10 | As a consequence, you should not just have one data pipeline. You should always have at least two little siblings for any 11 | pipeline: 12 | * mini: The minimal amount of data that allows the pipeline code to run through technically. 13 | * midi: A reasonable selection of data which reaches a high level of code coverage, triggers most edge cases the 14 | pipeline code is concerned with, and may be sampled in a way that still allows for statistically sound conclusions, albeit with 15 | reduced statistical prediction power or higher error margins. If all goals cannot be met with one subset of the input 16 | data, more pipeline instances may be needed. 17 | 18 | Another concern is freshness: some purposes require fresh data, but for understanding data and 19 | developing statistically significant models, it is actually rather harmful to have changing data and changing code at the 20 | same time. If you train your model on 1-3 years' worth of data, then adding the latest days or weeks does not provide 21 | much value. Thus it may even be beneficial to have separate pipelines working on fresh data and on stable data. 
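As a rough orientation, such a family of instances can be declared in a single `pipedag.yaml`. The following is only a minimal sketch: the per-instance nesting under `instances:` mirrors the `__any__` pattern from `example_postgres/pipedag.yaml`, the exact placement of the `cache_validation` option (discussed below) is an assumption, and the filtering between instances is project-specific and therefore omitted. See the [multi_instance_pipeline example](multi_instance_pipeline) for a complete, working configuration of the prototypical setup described below.

```yaml
instances:
  __any__:
    # options shared by all instances (table_store, lock_manager, ...)
    fail_fast: true
  full_fresh:
    instance_id: full_fresh
  full_stable:
    instance_id: full_stable
    # assumed nesting of the option discussed below; it keeps the loaded
    # raw input layer stable
    cache_validation:
      mode: assert_no_fresh_input
  midi_stable:
    instance_id: midi_stable
  mini_stable:
    instance_id: mini_stable
```

Each instance can then be selected by name when loading the configuration, analogous to the `.get("default")` calls in the example scripts.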
22 | 23 | The prototypical setup of pipeline instances with different sizes and different freshness is: 24 | - full fresh pipeline (sources raw input layer) 25 | - full stable pipeline (feeds from full fresh raw input layer) 26 | - midi stable pipeline (feeds from full stable raw input layer and filters) 27 | - mini stable pipeline (feeds from full stable raw input layer and filters) 28 | 29 | Filtering between pipeline instances is nice because it guarantees that all stable pipelines stay in sync, 30 | capturing the same data version. It is also nice because generic filtering technology can be developed independently of 31 | where the data is sourced from. In the future, this code could also be provided by a separate pydiverse library. 32 | 33 | For developing and testing the source loading technology, it might also be nice to keep an additional instance which, 34 | however, should not be used for developing the actual pipeline: 35 | - mini fresh: uses the same loading technology as the full fresh pipeline, but only loads a minimal amount of data. 36 | 37 | For the full stable pipeline, it is important that the data does not change. This can be achieved by switching the 38 | option `cache_validation: mode:` to "assert_no_fresh_input" for this pipeline once the data is loaded and should be 39 | kept stable (as sketched above). This mode ignores cache functions mentioned in the @materialize() decorator, and it additionally 40 | fails if a task that has a cache function (and thus might bring in data from external sources) changes. 41 | 42 | An example showing how to implement this can be found here: [](/examples/multi_instance_pipeline). -------------------------------------------------------------------------------- /docs/source/examples/environment.yml: -------------------------------------------------------------------------------- 1 | name: pipedag-howto-jupyter 2 | channels: 3 | - conda-forge 4 | - nodefaults 5 | dependencies: 6 | - pydiverse-pipedag 7 | - duckdb 8 | - duckdb-engine 9 | - ipython 10 | - jupyter 11 | -------------------------------------------------------------------------------- /docs/source/examples/group_and_visualize01.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | cluster_s_stage1 12 | 13 | stage1 14 | 15 | 16 | cluster_n_pJH1RGfh/oo= 17 | 18 | group1 19 | 20 | 21 | 22 | 1 23 | 24 | task_within_group 25 | 26 | 27 | 28 | 2 29 | 30 | task_within_group2 31 | 32 | 33 | 34 | 1->2 35 | 36 | 37 | 38 | 39 | 40 | 0 41 | 42 | any_task 43 | 44 | 45 | 46 | 3 47 | 48 | any_task 49 | 50 | 51 | -------------------------------------------------------------------------------- /docs/source/examples/group_and_visualize02.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | cluster_s_stage1 12 | 13 | stage1 14 | 15 | 16 | cluster_n_Wb6yquV3NOU= 17 | 18 | group1 19 | 20 | 21 | 22 | 2 23 | 24 | task_within_group 25 | 26 | 27 | 28 | 3 29 | 30 | task_within_group2 31 | 32 | 33 | 34 | 2->3 35 | 36 | 37 | 38 | 39 | 40 | 5 41 | 42 | any_task 43 | 44 | 45 | 46 | 2->5 47 | 48 | 49 | 50 | 51 | 52 | 0 53 | 54 | any_task 55 | 56 | 57 | 58 | 0->2 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/source/examples/group_and_visualize03.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | 
cluster_s_stage1 12 | 13 | stage1 14 | 15 | 16 | 17 | 0 18 | 19 | any_task 20 | 21 | 22 | 23 | EoQtt0BXamM= 24 | 25 | group1 26 | 27 | 28 | 29 | 0->EoQtt0BXamM= 30 | 31 | 32 | 33 | 34 | 35 | 5 36 | 37 | any_task 38 | 39 | 40 | 41 | EoQtt0BXamM=->5 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /docs/source/examples/group_and_visualize04.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | cluster_s_stage1 12 | 13 | stage1 14 | 15 | 16 | 17 | 0 18 | 19 | any_task 20 | 21 | 22 | 23 | 1 24 | 25 | Entry Barrier 'stage1.group1' 26 | 27 | 28 | 29 | 0->1 30 | 31 | 32 | 33 | 34 | 35 | 2 36 | 37 | task_within_group 38 | 39 | 40 | 41 | 1->2 42 | 43 | 44 | 45 | 46 | 47 | 3 48 | 49 | task_within_group2 50 | 51 | 52 | 53 | 2->3 54 | 55 | 56 | 57 | 58 | 59 | 4 60 | 61 | Exit Barrier 'stage1.group1' 62 | 63 | 64 | 65 | 3->4 66 | 67 | 68 | 69 | 70 | 71 | 5 72 | 73 | any_task 74 | 75 | 76 | 77 | 4->5 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /docs/source/examples/group_and_visualize05.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | cluster_n_/yomQjTIaWc= 12 | 13 | group1 14 | 15 | 16 | cluster_s_stage1 17 | 18 | stage1 19 | 20 | 21 | cluster_s_stage2 22 | 23 | stage2 24 | 25 | 26 | 27 | 0 28 | 29 | task_within_group 30 | 31 | 32 | 33 | 1 34 | 35 | task_within_group2 36 | 37 | 38 | 39 | 0->1 40 | 41 | 42 | 43 | 44 | 45 | 3 46 | 47 | any_task 48 | 49 | 50 | -------------------------------------------------------------------------------- /docs/source/examples/group_and_visualize06.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | cluster_s_stage1 12 | 13 | stage1 14 | 15 | 16 | cluster_n_x519AWP0zso= 17 | 18 | Group 2 19 | 20 | 21 | cluster_s_stage2 22 | 23 | stage2 24 | 25 | 26 | 27 | 0 28 | 29 | any_task 30 | 31 | 32 | 33 | gPJludGyOp0= 34 | 35 | Group 1 36 | 37 | 38 | 39 | 4 40 | 41 | any_task 42 | 43 | 44 | -------------------------------------------------------------------------------- /docs/source/examples/multi_instance_pipeline.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/docs/source/examples/multi_instance_pipeline.zip -------------------------------------------------------------------------------- /docs/source/examples/raw_sql.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/docs/source/examples/raw_sql.zip -------------------------------------------------------------------------------- /docs/source/examples/realistic_pipeline.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/docs/source/examples/realistic_pipeline.zip -------------------------------------------------------------------------------- /docs/source/examples/simple_pipeline01.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | cluster_s_inputs 12 | 13 | inputs 14 | 15 | 16 | 
cluster_s_features 17 | 18 | features 19 | 20 | 21 | 22 | 0 23 | 24 | input_tables 25 | 26 | 27 | 28 | 2 29 | 30 | join_tables 31 | 32 | 33 | 34 | 0->2 35 | 36 | 37 | 38 | 39 | 40 | 3 41 | 42 | print_dataframe 43 | 44 | 45 | 46 | 2->3 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /docs/source/examples/stage_validation.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | G 9 | 10 | 11 | cluster_s_stage_1 12 | 13 | stage_1 14 | 15 | 16 | cluster_n_LTQuvx9lqTs= 17 | 18 | 19 | 20 | 21 | 4 22 | 23 | validate_stage1 24 | 25 | 26 | 27 | 0 28 | 29 | lazy_task_1 30 | 31 | 32 | 33 | 2 34 | 35 | lazy_task_2 36 | 37 | 38 | 39 | 0->2 40 | 41 | 42 | 43 | 44 | 45 | 1 46 | 47 | eager_inputs 48 | 49 | 50 | 51 | 1->2 52 | 53 | 54 | 55 | 56 | 57 | 2->4 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /docs/source/license.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | ```{literalinclude} ../../LICENSE 4 | :language: none 5 | ``` -------------------------------------------------------------------------------- /docs/source/reference/api.rst: -------------------------------------------------------------------------------- 1 | *** 2 | API 3 | *** 4 | 5 | Public 6 | ====== 7 | 8 | .. py:module:: pydiverse.pipedag 9 | 10 | .. autoclass:: Flow 11 | :members: 12 | :inherited-members: 13 | :special-members: __getitem__ 14 | 15 | .. autoclass:: Stage 16 | :members: 17 | :inherited-members: 18 | :special-members: __getitem__ 19 | 20 | .. autodecorator:: materialize 21 | 22 | .. autodecorator:: input_stage_versions 23 | 24 | .. autodata:: AUTO_VERSION 25 | 26 | .. autoclass:: Table 27 | 28 | .. autoclass:: RawSql 29 | :members: 30 | :special-members: __iter__, __getitem__, __contains__ 31 | 32 | .. autoclass:: Blob 33 | 34 | .. autoclass:: GroupNode 35 | 36 | .. autoclass:: VisualizationStyle 37 | 38 | .. autoclass:: Schema 39 | :members: 40 | 41 | .. autoclass:: Result 42 | :members: 43 | 44 | .. autoclass:: PipedagConfig 45 | :inherited-members: 46 | 47 | .. autoclass:: ConfigContext 48 | :inherited-members: 49 | 50 | .. autoclass:: StageLockContext 51 | :inherited-members: 52 | 53 | 54 | Related Classes 55 | =============== 56 | 57 | .. autoclass:: pydiverse.pipedag.materialize.core.UnboundMaterializingTask(__overload__) 58 | .. autoclass:: pydiverse.pipedag.materialize.core.MaterializingTask(__overload__) 59 | :members: get_output_from_store 60 | :special-members: __getitem__ 61 | .. autoclass:: pydiverse.pipedag.materialize.core.MaterializingTaskGetItem(__overload__) 62 | :members: get_output_from_store 63 | :special-members: __getitem__ 64 | 65 | Backend Classes 66 | =============== 67 | 68 | Table Store 69 | ----------- 70 | .. autoclass:: pydiverse.pipedag.backend.table.SQLTableStore 71 | 72 | SQLTableStore Dialects 73 | ^^^^^^^^^^^^^^^^^^^^^^ 74 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.PostgresTableStore 75 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.DuckDBTableStore 76 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.MSSqlTableStore 77 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.IBMDB2TableStore 78 | 79 | Local Table Cache 80 | ^^^^^^^^^^^^^^^^^ 81 | .. autoclass:: pydiverse.pipedag.backend.table.cache.ParquetTableCache 82 | 83 | Blob Store 84 | ---------- 85 | .. 
autoclass:: pydiverse.pipedag.backend.blob.FileBlobStore 86 | 87 | Lock Manager 88 | ------------ 89 | .. autoclass:: pydiverse.pipedag.backend.lock.DatabaseLockManager 90 | .. autoclass:: pydiverse.pipedag.backend.lock.ZooKeeperLockManager 91 | .. autoclass:: pydiverse.pipedag.backend.lock.FileLockManager 92 | .. autoclass:: pydiverse.pipedag.backend.lock.NoLockManager 93 | 94 | Orchestration Engine 95 | -------------------- 96 | .. autoclass:: pydiverse.pipedag.engine.SequentialEngine 97 | .. autoclass:: pydiverse.pipedag.engine.DaskEngine 98 | 99 | .. py:class:: PrefectEngine 100 | :canonical: pydiverse.pipedag.engine.prefect.PrefectEngine 101 | 102 | Alias for either 103 | :class:`PrefectOneEngine ` or 104 | :class:`PrefectTwoEngine ` 105 | depending on the version of Prefect that is installed. 106 | 107 | .. autoclass:: pydiverse.pipedag.engine.prefect.PrefectOneEngine 108 | .. autoclass:: pydiverse.pipedag.engine.prefect.PrefectTwoEngine 109 | 110 | Special Table Types 111 | ------------------- 112 | 113 | .. autoclass:: pydiverse.pipedag.materialize.container.ExternalTableReference 114 | -------------------------------------------------------------------------------- /docs/source/reference/cli.md: -------------------------------------------------------------------------------- 1 | # Command Line Utility 2 | 3 | Pipedag comes with a command line utility called `pipedag-manage` to help with some common pipedag related management operations. 4 | These are all the available commands: 5 | 6 | ```{eval-rst} 7 | .. click:: pydiverse.pipedag.management.cli:cli 8 | :prog: pipedag-manage 9 | :nested: full 10 | ``` -------------------------------------------------------------------------------- /docs/source/table_backends.md: -------------------------------------------------------------------------------- 1 | # Table Backends 2 | 3 | We currently only support one table backend battle tested: 4 | 5 | - [](#pydiverse.pipedag.backend.table.SQLTableStore) 6 | 7 | ## [](#pydiverse.pipedag.backend.table.SQLTableStore) 8 | 9 | This backend is highly flexible in terms of database dialects and task implementation styles for which it can 10 | materialize/dematerialize tables. 
Internally, this is abstracted as Hooks like: 11 | 12 | ```python 13 | @SQLTableStore.register_table() 14 | class SQLAlchemyTableHook(TableHook[SQLTableStore]): 15 | ``` 16 | 17 | Which need to implement the following functions: 18 | 19 | ```python 20 | def can_materialize(cls, type_) -> bool: 21 | def can_retrieve(cls, type_) -> bool: 22 | def materialize(cls, store: SQLTableStore, table: Table, stage_name): 23 | def retrieve(cls, store, table, stage_name, as_type: type): 24 | def lazy_query_str(cls, store, obj) -> str: 25 | ``` 26 | 27 | The SQLTableStore currently supports the following SQL databases/dialects: 28 | 29 | - Postgres 30 | - Snowflake 31 | - Microsoft SQL Server/TSQL 32 | - IBM DB2 (LUW) 33 | - DuckDB (rather used for testing so far) 34 | - Every dialect unknown to pipedag will be treated like a postgres database (issues are likely) 35 | 36 | Example connection strings: 37 | - Postgres: `postgresql://user:password@localhost:5432/{instance_id}` 38 | - Snowflake: `snowflake://{$SNOWFLAKE_USER}:{$SNOWFLAKE_PASSWORD}@{$SNOWFLAKE_ACCOUNT}/database_name/DBO?warehouse=warehouse_name&role=access_role` 39 | - Microsoft SQL Server: `mssql+pyodbc://user:password@127.0.0.1:1433/{instance_id}?driver=ODBC+Driver+18+for+SQL+Server&encrypt=no` 40 | - IBM DB2: `db2+ibm_db://db2inst1:password@localhost:50000/testdb`, `schema_prefix: "{instance_id}_"` 41 | - DuckDB: `duckdb:////tmp/pipedag/{instance_id}/db.duckdb` 42 | 43 | See [Database Testing](database_testing.md) for an example how to spin up a database for testing. 44 | 45 | SQLTableStore supports the following `input_type` arguments to the {py:func}`@materialize ` 46 | decorator out-of-the-box: 47 | 48 | - `sqlalchemy.Table` (see [https://www.sqlalchemy.org/](https://www.sqlalchemy.org/); recommended with `lazy=True`; 49 | can also be used for composing handwritten SQL strings) 50 | - `pydiverse.transform.eager.PandasTableImpl` (see 51 | [https://pydiversetransform.readthedocs.io/en/latest/](https://pydiversetransform.readthedocs.io/en/latest/); 52 | recommended with manual version bumping and `version="X.Y.Z"`) 53 | - `pydiverse.transform.lazy.SQLTableImpl` ( 54 | see [https://pydiversetransform.readthedocs.io/en/latest/](https://pydiversetransform.readthedocs.io/en/latest/); 55 | recommended with `lazy=True`) 56 | - `ibis.Table` (see [https://ibis-project.org/](https://ibis-project.org/); recommended with `lazy=True`) 57 | - `tidypolars.Tibble` (see [https://github.com/markfairbanks/tidypolars](https://github.com/markfairbanks/tidypolars); 58 | recommended with `lazy=True`) 59 | - `pandas.DataFrame` (see [https://pandas.pydata.org/](https://pandas.pydata.org/); recommended with manual version 60 | bumping and `version="X.Y.Z"`) 61 | - `polars.DataFrame` (see [https://pola.rs/](https://pola.rs/); recommended with manual version bumping 62 | and `version="X.Y.Z"`) 63 | - `polars.LazyFrame` (see [https://pola.rs/](https://pola.rs/); recommended with `version=AUTO_VERSION`) -------------------------------------------------------------------------------- /example/run_pipeline.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | import pandas as pd 6 | import sqlalchemy as sa 7 | 8 | from pydiverse.pipedag import Flow, Stage, Table, materialize 9 | from pydiverse.pipedag.context import StageLockContext 10 | from pydiverse.pipedag.core.config import create_basic_pipedag_config 11 | from pydiverse.pipedag.util.structlog import setup_logging 
12 | 13 | 14 | @materialize(lazy=True) 15 | def lazy_task_1(): 16 | return sa.select( 17 | sa.literal(1).label("x"), 18 | sa.literal(2).label("y"), 19 | ) 20 | 21 | 22 | @materialize(lazy=True, input_type=sa.Table) 23 | def lazy_task_2(input1: sa.sql.expression.Alias, input2: sa.sql.expression.Alias): 24 | query = sa.select( 25 | (input1.c.x * 5).label("x5"), 26 | input2.c.a, 27 | ).select_from(input1.outerjoin(input2, input2.c.x == input1.c.x)) 28 | 29 | return Table(query, name="task_2_out", primary_key=["a"]) 30 | 31 | 32 | @materialize(lazy=True, input_type=sa.Table) 33 | def lazy_task_3(input1: sa.sql.expression.Alias): 34 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}") 35 | 36 | 37 | @materialize(lazy=True, input_type=sa.Table) 38 | def lazy_task_4(input1: sa.sql.expression.Alias): 39 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}") 40 | 41 | 42 | @materialize(nout=2, version="1.0.0") 43 | def eager_inputs(): 44 | dfA = pd.DataFrame( 45 | { 46 | "a": [0, 1, 2, 4], 47 | "b": [9, 8, 7, 6], 48 | } 49 | ) 50 | dfB = pd.DataFrame( 51 | { 52 | "a": [2, 1, 0, 1], 53 | "x": [1, 1, 2, 2], 54 | } 55 | ) 56 | return Table(dfA, "dfA"), Table(dfB, "dfB_%%") 57 | 58 | 59 | @materialize(version="1.0.0", input_type=pd.DataFrame) 60 | def eager_task(tbl1: pd.DataFrame, tbl2: pd.DataFrame): 61 | return tbl1.merge(tbl2, on="x") 62 | 63 | 64 | def main(): 65 | with tempfile.TemporaryDirectory() as temp_dir: 66 | cfg = create_basic_pipedag_config( 67 | f"duckdb:///{temp_dir}/db.duckdb", 68 | disable_stage_locking=True, # This is special for duckdb 69 | # Attention: If uncommented, stage and task names might be sent to the 70 | # following URL. You can self-host kroki if you like: 71 | # https://docs.kroki.io/kroki/setup/install/ 72 | # kroki_url="https://kroki.io", 73 | ).get("default") 74 | with cfg: 75 | with Flow() as f: 76 | with Stage("stage_1"): 77 | lazy_1 = lazy_task_1() 78 | a, b = eager_inputs() 79 | 80 | with Stage("stage_2"): 81 | lazy_2 = lazy_task_2(lazy_1, b) 82 | lazy_3 = lazy_task_3(lazy_2) 83 | eager = eager_task(lazy_1, b) 84 | 85 | with Stage("stage_3"): 86 | lazy_4 = lazy_task_4(lazy_2) 87 | _ = lazy_3, lazy_4, eager # unused terminal output tables 88 | 89 | # Run flow 90 | result = f.run() 91 | assert result.successful 92 | 93 | # Run in a different way for testing 94 | with StageLockContext(): 95 | result = f.run() 96 | assert result.successful 97 | assert result.get(lazy_1, as_type=pd.DataFrame)["x"][0] == 1 98 | 99 | 100 | if __name__ == "__main__": 101 | setup_logging() # you can setup the logging and/or structlog libraries as you wish 102 | main() 103 | -------------------------------------------------------------------------------- /example/simple_pipeline.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import sqlalchemy as sa 5 | 6 | from pydiverse.pipedag import Flow, Stage, materialize 7 | 8 | 9 | # Define the different tasks our flow consists of 10 | @materialize(version="1.0", nout=2) 11 | def input_tables(): 12 | names = pd.DataFrame( 13 | { 14 | "id": [1, 2, 3], 15 | "name": ["Alice", "Bob", "Charlie"], 16 | } 17 | ) 18 | 19 | ages = pd.DataFrame( 20 | { 21 | "id": [1, 2, 3], 22 | "age": [20, 40, 60], 23 | } 24 | ) 25 | 26 | return names, ages 27 | 28 | 29 | @materialize(lazy=True, input_type=sa.Table) 30 | def join_tables(names, ages): 31 | return sa.select(names.c.id, names.c.name, 
ages.c.age).join_from( 32 | names, ages, names.c.id == ages.c.id 33 | ) 34 | 35 | 36 | @materialize(input_type=pd.DataFrame) 37 | def print_dataframe(df): 38 | print(df) 39 | 40 | 41 | def main(): 42 | # Define how the different tasks should be wired 43 | with Flow("flow") as flow: 44 | with Stage("inputs"): 45 | names, ages = input_tables() 46 | 47 | with Stage("features"): 48 | joined_table = join_tables(names, ages) 49 | print_dataframe(joined_table) 50 | 51 | # # In case you provide a pipedag.yaml, you can run the flow as simple as: 52 | # flow.run() 53 | 54 | # run flow with a duckdb configuration in a random temporary directory (this is 55 | # easier to get started) 56 | import tempfile 57 | 58 | from pydiverse.pipedag.core.config import create_basic_pipedag_config 59 | 60 | with tempfile.TemporaryDirectory() as temp_dir: 61 | cfg = create_basic_pipedag_config( 62 | f"duckdb:///{temp_dir}/db.duckdb", 63 | disable_stage_locking=True, # This is special for duckdb 64 | ).get("default") 65 | # Execute the flow 66 | flow.run(config=cfg) 67 | 68 | 69 | if __name__ == "__main__": 70 | from pydiverse.pipedag.util.structlog import setup_logging 71 | 72 | setup_logging() # you can setup the logging and/or structlog libraries as you wish 73 | main() 74 | -------------------------------------------------------------------------------- /example/visualization.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | from pydiverse.pipedag import Flow, GroupNode, Stage, VisualizationStyle, materialize 6 | from pydiverse.pipedag.core.config import create_basic_pipedag_config 7 | from pydiverse.pipedag.util.structlog import setup_logging 8 | 9 | 10 | @materialize 11 | def any_task(): 12 | return 1 13 | 14 | 15 | @materialize 16 | def task_within_group(): 17 | return 2 18 | 19 | 20 | @materialize 21 | def task_within_group2(input1: int): 22 | return input1 + 1 23 | 24 | 25 | def main(): 26 | with tempfile.TemporaryDirectory() as temp_dir: 27 | cfg = create_basic_pipedag_config( 28 | f"duckdb:///{temp_dir}/db.duckdb", 29 | disable_stage_locking=True, # This is special for duckdb 30 | # Attention: stage and task names might be sent to the 31 | # following URL. 
You can self-host kroki if you like: 32 | # https://docs.kroki.io/kroki/setup/install/ 33 | kroki_url="https://kroki.io", 34 | ).get("default") 35 | with cfg: 36 | with Flow() as flow: 37 | with Stage("stage1"): 38 | _ = any_task() 39 | with GroupNode( 40 | "group1", 41 | ordering_barrier=True, 42 | style=VisualizationStyle( 43 | hide_content=True, box_color_always="#ccccff" 44 | ), 45 | ): 46 | task1 = task_within_group() 47 | _ = task_within_group2(task1) 48 | _ = any_task() 49 | 50 | # Run flow 51 | result = flow.run() 52 | assert result.successful 53 | 54 | # you can also visualize the flow explicitly: 55 | # kroki_url = result.visualize_url() 56 | # result.visualize() 57 | 58 | 59 | if __name__ == "__main__": 60 | setup_logging() # you can setup the logging and/or structlog libraries as you wish 61 | main() 62 | -------------------------------------------------------------------------------- /example/visualization_legend.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | from pydiverse.pipedag import Flow, GroupNode, Stage, VisualizationStyle, materialize 6 | from pydiverse.pipedag.core.config import create_basic_pipedag_config 7 | from pydiverse.pipedag.util.structlog import setup_logging 8 | 9 | 10 | @materialize 11 | def failed(): 12 | raise AssertionError("This task is supposed to fail") 13 | 14 | 15 | @materialize(version=None) 16 | def completed_but_cache_invalid(): 17 | return 1 18 | 19 | 20 | @materialize(version="1.0") 21 | def cache_valid(): 22 | return 2 23 | 24 | 25 | @materialize(version="1.0") 26 | def cache_valid2(): 27 | return 3 28 | 29 | 30 | @materialize 31 | def skipped(out): 32 | return out + 1 33 | 34 | 35 | def main(): 36 | with tempfile.TemporaryDirectory() as temp_dir: 37 | cfg = ( 38 | create_basic_pipedag_config( 39 | f"duckdb:///{temp_dir}/db.duckdb", 40 | disable_stage_locking=True, # This is special for duckdb 41 | # Attention: stage and task names might be sent to the 42 | # following URL. 
You can self-host kroki if you like: 43 | # https://docs.kroki.io/kroki/setup/install/ 44 | kroki_url="https://kroki.io", 45 | fail_fast=False, 46 | ) 47 | .get("default") 48 | .evolve(swallow_exceptions=True) 49 | ) 50 | with cfg: 51 | with Flow() as flow: 52 | with Stage("stage1"): 53 | _ = completed_but_cache_invalid() 54 | _ = cache_valid() 55 | with Stage("stage2"): 56 | out = failed() 57 | with Stage("stage3"): 58 | _ = skipped(out) 59 | with GroupNode( 60 | "group_none_cache_valid", 61 | style=VisualizationStyle(hide_content=True), 62 | ): 63 | _ = completed_but_cache_invalid() 64 | with GroupNode( 65 | "group_any_cache_valid", 66 | style=VisualizationStyle(hide_content=True), 67 | ): 68 | _ = completed_but_cache_invalid() 69 | _ = cache_valid() 70 | with GroupNode( 71 | "group_all_cache_valid", 72 | style=VisualizationStyle(hide_content=True), 73 | ): 74 | # avoid memoization (not counted as cache valid) 75 | _ = cache_valid2() 76 | with GroupNode( 77 | "group_any_failed", style=VisualizationStyle(hide_content=True) 78 | ): 79 | _ = completed_but_cache_invalid() 80 | out = failed() 81 | with GroupNode( 82 | "group_all_skipped", style=VisualizationStyle(hide_content=True) 83 | ): 84 | _ = skipped(out) 85 | 86 | # Run flow 87 | result = flow.run() 88 | assert not result.successful 89 | 90 | # Run flow again for cache validity 91 | result = flow.run() 92 | assert not result.successful 93 | 94 | # you can also visualize the flow explicitly: 95 | # kroki_url = result.visualize_url() 96 | # result.visualize() 97 | 98 | 99 | if __name__ == "__main__": 100 | setup_logging() # you can setup the logging and/or structlog libraries as you wish 101 | main() 102 | -------------------------------------------------------------------------------- /example_imperative/failing_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tempfile 4 | 5 | import sqlalchemy as sa 6 | 7 | from pydiverse.pipedag import Flow, Stage, Table, materialize 8 | from pydiverse.pipedag.core.config import create_basic_pipedag_config 9 | from pydiverse.pipedag.util.structlog import setup_logging 10 | 11 | 12 | @materialize(lazy=True) 13 | def lazy_task_1(): 14 | return Table(sa.text("")).materialize() 15 | 16 | 17 | def main(): 18 | with tempfile.TemporaryDirectory() as temp_dir: 19 | cfg = create_basic_pipedag_config( 20 | f"duckdb:///{temp_dir}/db.duckdb", 21 | disable_stage_locking=True, # This is special for duckdb 22 | # Attention: If uncommented, stage and task names might be sent to the 23 | # following URL. 
You can self-host kroki if you like: 24 | # https://docs.kroki.io/kroki/setup/install/ 25 | # kroki_url="https://kroki.io", 26 | ).get("default") 27 | with cfg: 28 | with Flow() as f: 29 | with Stage("stage_1"): 30 | lazy_task_1() 31 | 32 | # Run flow 33 | f.run() 34 | 35 | 36 | if __name__ == "__main__": 37 | setup_logging() # you can setup the logging and/or structlog libraries as you wish 38 | main() 39 | -------------------------------------------------------------------------------- /example_interactive/failing_flow_after_successful_debugging.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import tempfile 5 | 6 | import sqlalchemy as sa 7 | from sqlalchemy.exc import ProgrammingError 8 | 9 | from pydiverse.pipedag import Flow, Stage, Table, materialize 10 | from pydiverse.pipedag.core.config import create_basic_pipedag_config 11 | from pydiverse.pipedag.util.structlog import setup_logging 12 | 13 | 14 | @materialize(lazy=True) 15 | def lazy_task_1(): 16 | try: 17 | tbl = Table(sa.text("SELECT-TYPO 1"), name="tbl").materialize() 18 | except ProgrammingError: 19 | # This error is expected 20 | logger = logging.getLogger(__name__ + "-lazy_task_1") 21 | logger.info("Caught expected error", exc_info=True) 22 | 23 | # now we succeed, but are still not done, yet 24 | tbl = Table(sa.text("SELECT 'not-done-yet' as a"), name="tbl").materialize() 25 | 26 | # this will create another two tables but they are not returned and won't switch to 27 | # debug mode 28 | Table(sa.text("SELECT 3 as a")).materialize() 29 | Table(sa.text("SELECT 4 as a"), name="tbl2").materialize() 30 | 31 | # now, we succeed with fixing `tbl` and automatically switch in debug mode 32 | tbl = Table(sa.text("SELECT 1 as a"), name="tbl").materialize() 33 | 34 | # we can also keep a table object: 35 | tbl_obj = Table(sa.text("SELECT 'not-done-yet' as a")) 36 | tbl_obj.materialize() 37 | 38 | # this will also automatically switch to debug mode 39 | tbl_obj.obj = sa.text("SELECT 1 as a") 40 | tbl_obj.materialize() 41 | 42 | # However, now the flow will stop because cache invalidation cannot deal with debug 43 | # mode 44 | return tbl 45 | 46 | 47 | def main(): 48 | with tempfile.TemporaryDirectory() as temp_dir: 49 | cfg = create_basic_pipedag_config( 50 | f"duckdb:///{temp_dir}/db.duckdb", 51 | disable_stage_locking=True, # This is special for duckdb 52 | # Attention: If uncommented, stage and task names might be sent to the 53 | # following URL. 
You can self-host kroki if you like: 54 | # https://docs.kroki.io/kroki/setup/install/ 55 | # kroki_url="https://kroki.io", 56 | ).get("default") 57 | with cfg: 58 | with Flow() as f: 59 | with Stage("stage_1"): 60 | lazy_task_1() 61 | 62 | # Run flow 63 | f.run() 64 | 65 | 66 | if __name__ == "__main__": 67 | setup_logging() # you can setup the logging and/or structlog libraries as you wish 68 | main() 69 | -------------------------------------------------------------------------------- /example_postgres/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | postgres: 4 | image: postgres 5 | environment: 6 | POSTGRES_USER: sa 7 | POSTGRES_PASSWORD: Pydiverse23 8 | ports: 9 | - "6543:5432" 10 | -------------------------------------------------------------------------------- /example_postgres/pipedag.yaml: -------------------------------------------------------------------------------- 1 | instances: 2 | __any__: 3 | network_interface: "127.0.0.1" 4 | auto_table: 5 | - "pandas.DataFrame" 6 | - "sqlalchemy.sql.expression.TextClause" 7 | - "sqlalchemy.sql.expression.Selectable" 8 | 9 | fail_fast: true 10 | instance_id: pipedag_default 11 | 12 | # Attention: For disable_kroki: false, stage and task names might be sent to the kroki_url. 13 | # You can self-host kroki if you like: 14 | # https://docs.kroki.io/kroki/setup/install/ 15 | disable_kroki: true 16 | kroki_url: "https://kroki.io" 17 | 18 | table_store: 19 | class: "pydiverse.pipedag.backend.table.SQLTableStore" 20 | args: 21 | url: "postgresql://sa:Pydiverse23@127.0.0.1:6543/{instance_id}" 22 | create_database_if_not_exists: True 23 | 24 | print_materialize: true 25 | print_sql: true 26 | 27 | local_table_cache: 28 | store_input: true 29 | store_output: true 30 | use_stored_input_as_cache: true 31 | class: "pydiverse.pipedag.backend.table.cache.ParquetTableCache" 32 | args: 33 | base_path: "/tmp/pipedag/table_cache" 34 | 35 | blob_store: 36 | class: "pydiverse.pipedag.backend.blob.FileBlobStore" 37 | args: 38 | base_path: "/tmp/pipedag/blobs" 39 | 40 | lock_manager: 41 | class: "pydiverse.pipedag.backend.lock.DatabaseLockManager" 42 | 43 | orchestration: 44 | class: "pydiverse.pipedag.engine.SequentialEngine" 45 | -------------------------------------------------------------------------------- /example_postgres/run_pipeline.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import sqlalchemy as sa 5 | 6 | from pydiverse.pipedag import Flow, Stage, Table, materialize 7 | from pydiverse.pipedag.context import StageLockContext 8 | from pydiverse.pipedag.util.structlog import setup_logging 9 | 10 | 11 | @materialize(lazy=True) 12 | def lazy_task_1(): 13 | return sa.select( 14 | sa.literal(1).label("x"), 15 | sa.literal(2).label("y"), 16 | ) 17 | 18 | 19 | @materialize(lazy=True, input_type=sa.Table) 20 | def lazy_task_2(input1: sa.sql.expression.Alias, input2: sa.sql.expression.Alias): 21 | query = sa.select( 22 | (input1.c.x * 5).label("x5"), 23 | input2.c.a, 24 | ).select_from(input1.outerjoin(input2, input2.c.x == input1.c.x)) 25 | 26 | return Table(query, name="task_2_out", primary_key=["a"]) 27 | 28 | 29 | @materialize(lazy=True, input_type=sa.Table) 30 | def lazy_task_3(input1: sa.sql.expression.Alias): 31 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}") 32 | 33 | 34 | @materialize(lazy=True, input_type=sa.Table) 35 
| def lazy_task_4(input1: sa.sql.expression.Alias): 36 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}") 37 | 38 | 39 | @materialize(nout=2, version="1.0.0") 40 | def eager_inputs(): 41 | dfA = pd.DataFrame( 42 | { 43 | "a": [0, 1, 2, 4], 44 | "b": [9, 8, 7, 6], 45 | } 46 | ) 47 | dfB = pd.DataFrame( 48 | { 49 | "a": [2, 1, 0, 1], 50 | "x": [1, 1, 2, 2], 51 | } 52 | ) 53 | return Table(dfA, "dfA"), Table(dfB, "dfB_%%") 54 | 55 | 56 | @materialize(version="1.0.0", input_type=pd.DataFrame) 57 | def eager_task(tbl1: pd.DataFrame, tbl2: pd.DataFrame): 58 | return tbl1.merge(tbl2, on="x") 59 | 60 | 61 | def main(): 62 | with Flow() as f: 63 | with Stage("stage_1"): 64 | lazy_1 = lazy_task_1() 65 | a, b = eager_inputs() 66 | 67 | with Stage("stage_2"): 68 | lazy_2 = lazy_task_2(lazy_1, b) 69 | lazy_3 = lazy_task_3(lazy_2) 70 | eager = eager_task(lazy_1, b) 71 | 72 | with Stage("stage_3"): 73 | lazy_4 = lazy_task_4(lazy_2) 74 | _ = lazy_3, lazy_4, eager # unused terminal output tables 75 | 76 | # Run flow 77 | result = f.run() 78 | assert result.successful 79 | 80 | # Run in a different way for testing 81 | with StageLockContext(): 82 | result = f.run() 83 | assert result.successful 84 | assert result.get(lazy_1, as_type=pd.DataFrame)["x"][0] == 1 85 | 86 | 87 | if __name__ == "__main__": 88 | setup_logging() # you can setup the logging and/or structlog libraries as you wish 89 | main() 90 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pydiverse-pipedag" 3 | version = "0.9.10" 4 | description = "A pipeline orchestration library executing tasks within one python session. It takes care of SQL table (de)materialization, caching and cache invalidation. Blob storage is supported as well for example for storing model files." 5 | authors = [ 6 | { name = "QuantCo, Inc." 
}, 7 | { name = "Nicolas Camenisch", email = "garnele007@gmail.com" }, 8 | { name = "Martin Trautmann", email = "windiana@users.sf.net" }, 9 | ] 10 | license = { file = "LICENSE" } 11 | readme = "docs/package/README.md" 12 | requires-python = ">=3.9" 13 | 14 | classifiers = [ 15 | "Development Status :: 3 - Alpha", 16 | "Intended Audience :: Developers", 17 | "Intended Audience :: Science/Research", 18 | "Programming Language :: SQL", 19 | "Topic :: Database", 20 | ] 21 | 22 | dependencies = [ 23 | "pandas>=1.4.3", 24 | "SQLAlchemy>=1.4.39", 25 | "typing-extensions>=4.1.0", 26 | "networkx>=2.8", 27 | "attrs>=22.1.0", 28 | "structlog>=22.1.0", 29 | "pynng>=0.7.1", 30 | "msgpack>=1.0.4", 31 | "packaging>=21.3", 32 | "python-box>=6.1.0", 33 | "PyYAML>=6.0", 34 | "pyarrow>=11.0.0", 35 | "cryptography>=41.0.1", 36 | "pydot>=1.4.2", 37 | "click>=8.1.3", 38 | "pyparsing>=3.0", 39 | ] 40 | 41 | [tool.hatch.build.targets.wheel] 42 | packages = ["src/pydiverse"] 43 | 44 | [project.scripts] 45 | pipedag-manage = "pydiverse.pipedag.management.cli:cli" 46 | 47 | [tool.ruff] 48 | select = ["F", "E", "UP", "W", "I001", "I002", "B", "A"] 49 | ignore = ["B028"] 50 | extend-exclude = ["docs/*"] 51 | ignore-init-module-imports = true 52 | fix = true 53 | target-version = "py38" 54 | 55 | [tool.ruff.per-file-ignores] 56 | "__init__.py" = ["F401", "F403"] 57 | "src/pydiverse/pipedag/backend/table/sql/ddl.py" = ["F811"] 58 | "tests/*" = ["F403", "F405"] 59 | 60 | [tool.ruff.isort] 61 | known-first-party = ["pydiverse"] 62 | required-imports = ["from __future__ import annotations"] 63 | 64 | [build-system] 65 | requires = ["hatchling"] 66 | build-backend = "hatchling.build" 67 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = src 3 | testpaths = tests 4 | 5 | markers = 6 | postgres: a test that requires postgres [SQLTableStore] 7 | mssql: a test that requires mssql [SQLTableStore] 8 | ibm_db2: a test that requires ibm_db2 [SQLTableStore] 9 | duckdb: a test that requires duckdb [SQLTableStore] 10 | snowflake: a test that requires snowflake [SQLTableStore] 11 | 12 | pdtransform: a test that requires pydiverse-transform [TableHook] 13 | ibis: a test that requires ibis [TableHook] 14 | polars: a test that requires polars/tidypolars [TableHook] 15 | 16 | dask: a test that requires dask [DaskEngine] 17 | prefect: a test that requires prefect [PrefectEngine] 18 | 19 | instances: marker used to run an test with different instances 20 | skip_instances: fixture used to skip running test for a list of instances 21 | 22 | parallelize: parallelize this test 23 | 24 | slow1: fastest of slow tests (this is more simulated treatment different slowness) 25 | slow2: slower tests 26 | slow3: even slower tests 27 | slow4: even much slower tests 28 | slow5: slowest tests -------------------------------------------------------------------------------- /src/pydiverse/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.py 2 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .container import ( 4 | Blob, 5 | ExternalTableReference, 6 | RawSql, 7 | Schema, 8 | Table, 9 | ) 10 | from .context import ConfigContext, StageLockContext 11 
| from .core import ( 12 | Flow, 13 | GroupNode, 14 | PipedagConfig, 15 | Result, 16 | Stage, 17 | Task, 18 | VisualizationStyle, 19 | ) 20 | from .materialize import ( 21 | input_stage_versions, 22 | materialize, 23 | ) 24 | from .materialize.core import AUTO_VERSION 25 | 26 | __all__ = [ 27 | "Flow", 28 | "Stage", 29 | "materialize", 30 | "input_stage_versions", 31 | "AUTO_VERSION", 32 | "Table", 33 | "RawSql", 34 | "Blob", 35 | "GroupNode", 36 | "VisualizationStyle", 37 | "Schema", 38 | "Result", 39 | "PipedagConfig", 40 | "ConfigContext", 41 | "StageLockContext", 42 | ] 43 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/_typing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Callable, TypeVar, Union 4 | 5 | if TYPE_CHECKING: 6 | from pydiverse.pipedag import Blob, Table 7 | from pydiverse.pipedag.backend.table.base import BaseTableStore, TableHookResolver 8 | 9 | 10 | def decorator_hint(decorator: Callable) -> Callable: 11 | # Used to fix incorrect type hints in pycharm 12 | return decorator 13 | 14 | 15 | T = TypeVar("T") 16 | CallableT = TypeVar("CallableT", bound=Callable) 17 | StoreT = TypeVar("StoreT", bound="BaseTableStore") 18 | TableHookResolverT = TypeVar("TableHookResolverT", bound="TableHookResolver") 19 | 20 | # Materializable 21 | MPrimitives = Union[int, float, bool, str] 22 | MTypes = Union["Table", "Blob"] 23 | 24 | BaseMaterializable = Union[MPrimitives, MTypes] 25 | Materializable = Union[ 26 | BaseMaterializable, 27 | dict[str, "Materializable"], 28 | list["Materializable"], 29 | tuple["Materializable", ...], 30 | ] 31 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .blob import * 4 | from .lock import * 5 | from .table import * 6 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/lock/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .base import BaseLockManager, LockState 4 | from .database import DatabaseLockManager 5 | from .filelock import FileLockManager 6 | from .nolock import NoLockManager 7 | from .zookeeper import ZooKeeperLockManager 8 | 9 | __all__ = [ 10 | "BaseLockManager", 11 | "LockState", 12 | "NoLockManager", 13 | "FileLockManager", 14 | "ZooKeeperLockManager", 15 | "DatabaseLockManager", 16 | ] 17 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/lock/filelock.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import warnings 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | from pydiverse.pipedag import ConfigContext, Stage 9 | from pydiverse.pipedag.backend.lock.base import BaseLockManager, Lockable, LockState 10 | from pydiverse.pipedag.errors import LockError 11 | from pydiverse.pipedag.util import normalize_name, requires 12 | 13 | try: 14 | import filelock as fl 15 | except ImportError as e: 16 | warnings.warn(str(e), ImportWarning) 17 | fl = None 18 | 19 | 20 | @requires(fl, ImportError("FileLockManager requires 'filelock' to be 
installed.")) 21 | class FileLockManager(BaseLockManager): 22 | """Lock manager that uses lock files 23 | 24 | For details on how exactly the file locking is implemented, check out the 25 | `filelock documentation`_. 26 | 27 | :param base_path: 28 | A path to a folder where the lock files should get stored. 29 | To differentiate between different instances, the ``instance_id`` will 30 | automatically be appended to the provided path. 31 | 32 | .. _filelock documentation: https://py-filelock.readthedocs.io/en/latest/index.html 33 | """ 34 | 35 | @classmethod 36 | def _init_conf_(cls, config: dict[str, Any]): 37 | instance_id = normalize_name(ConfigContext.get().instance_id) 38 | base_path = Path(config["base_path"]) / instance_id 39 | return cls(base_path) 40 | 41 | def __init__(self, base_path: str | Path): 42 | super().__init__() 43 | self.base_path = Path(base_path).absolute() 44 | self.locks: dict[Lockable, fl.BaseFileLock] = {} 45 | 46 | os.makedirs(self.base_path, exist_ok=True) 47 | 48 | @property 49 | def supports_stage_level_locking(self): 50 | return True 51 | 52 | def acquire(self, lockable: Lockable): 53 | if lockable not in self.locks: 54 | lock_path = self.lock_path(lockable) 55 | self.locks[lockable] = fl.FileLock(lock_path) 56 | 57 | lock = self.locks[lockable] 58 | if not lock.is_locked: 59 | self.logger.info(f"Locking '{lockable}'") 60 | lock.acquire() 61 | self.set_lock_state(lockable, LockState.LOCKED) 62 | 63 | def release(self, lockable: Lockable): 64 | if lockable not in self.locks: 65 | raise LockError(f"No lock '{lockable}' found.") 66 | 67 | lock = self.locks[lockable] 68 | lock.release() 69 | if not lock.is_locked: 70 | self.logger.info(f"Unlocking '{lockable}'") 71 | del self.locks[lockable] 72 | self.set_lock_state(lockable, LockState.UNLOCKED) 73 | 74 | def lock_path(self, lock: Lockable) -> Path: 75 | if isinstance(lock, Stage): 76 | return self.base_path / (lock.name + ".lock") 77 | elif isinstance(lock, str): 78 | return self.base_path / (lock + ".lock") 79 | else: 80 | raise NotImplementedError( 81 | f"Can't lock object of type '{type(lock).__name__}'" 82 | ) 83 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/lock/nolock.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydiverse.pipedag.backend.lock.base import BaseLockManager, Lockable, LockState 4 | 5 | 6 | class NoLockManager(BaseLockManager): 7 | """ 8 | This lock manager doesn't do any locking and only serves as a placeholder 9 | for an actual lock manager for testing something locally. 10 | 11 | .. Warning:: 12 | This lock manager is not intended for use in a production environment. 13 | Using a lock manager is essential for preventing data corruption. 14 | """ 15 | 16 | @property 17 | def supports_stage_level_locking(self): 18 | return True 19 | 20 | def acquire(self, lockable: Lockable): 21 | self.set_lock_state(lockable, LockState.LOCKED) 22 | 23 | def release(self, lockable: Lockable): 24 | self.set_lock_state(lockable, LockState.UNLOCKED) 25 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/table/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from . 
import cache 4 | from .base import BaseTableStore 5 | from .dict import DictTableStore 6 | from .sql import SQLTableStore 7 | 8 | __all__ = [ 9 | "BaseTableStore", 10 | "DictTableStore", 11 | "SQLTableStore", 12 | ] 13 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/table/cache/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .base import BaseTableCache 4 | from .parquet import ParquetTableCache 5 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/table/cache/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | import structlog 6 | 7 | from pydiverse.pipedag import Stage 8 | from pydiverse.pipedag._typing import T 9 | from pydiverse.pipedag.backend.table.base import TableHookResolver 10 | from pydiverse.pipedag.container import Table 11 | from pydiverse.pipedag.context import RunContext 12 | from pydiverse.pipedag.materialize.core import MaterializingTask 13 | from pydiverse.pipedag.util import Disposable 14 | 15 | 16 | class BaseTableCache(ABC, TableHookResolver, Disposable): 17 | def __init__( 18 | self, 19 | store_input: bool = True, 20 | store_output: bool = False, 21 | use_stored_input_as_cache: bool = True, 22 | ): 23 | super().__init__() 24 | 25 | self.logger = structlog.get_logger(logger_name=type(self).__name__) 26 | 27 | self.should_store_input = store_input 28 | self.should_store_output = store_output 29 | self.should_use_stored_input_as_cache = use_stored_input_as_cache 30 | 31 | def setup(self): 32 | """Setup function 33 | 34 | This function gets called at the beginning of a flow run. 35 | Unlike the __init__ method, a lock is acquired before 36 | the setup method gets called to prevent race conditions. 37 | """ 38 | 39 | def init_stage(self, stage: Stage): 40 | """Initialize a stage 41 | 42 | Gets called before any table is attempted to be stored in the stage. 
43 | """ 44 | 45 | @abstractmethod 46 | def clear_cache(self, stage: Stage): 47 | """Delete the cache for a specific stage""" 48 | 49 | def store_table(self, table: Table, task: MaterializingTask): 50 | if self.should_store_output: 51 | return self._store_table(table, task) 52 | 53 | def store_input(self, table: Table, task: MaterializingTask): 54 | if self.should_store_input: 55 | return self._store_table(table, task) 56 | 57 | def _store_table(self, table: Table, task: MaterializingTask | None) -> bool: 58 | """ 59 | :return: bool flag indicating if storing was successful 60 | """ 61 | try: 62 | hook = self.get_m_table_hook(type(table.obj)) 63 | except TypeError: 64 | return False 65 | 66 | if not RunContext.get().should_store_table_in_cache(table): 67 | # Prevent multiple tasks writing at the same time 68 | return False 69 | 70 | try: 71 | hook.materialize(self, table, table.stage.transaction_name) 72 | except TypeError: 73 | return False 74 | return True 75 | 76 | def retrieve_table_obj( 77 | self, 78 | table: Table, 79 | as_type: type[T], 80 | for_auto_versioning: bool = False, 81 | ) -> T: 82 | assert not for_auto_versioning 83 | 84 | if not self.should_use_stored_input_as_cache: 85 | return None 86 | if not self._has_table(table, as_type): 87 | return None 88 | return self._retrieve_table_obj(table, as_type) 89 | 90 | def _retrieve_table_obj(self, table: Table, as_type: type[T]) -> T: 91 | try: 92 | hook = self.get_r_table_hook(as_type) 93 | obj = hook.retrieve(self, table, table.stage.name, as_type) 94 | self.logger.info("Retrieved table from local table cache", table=table) 95 | return obj 96 | except Exception as e: 97 | self.logger.warning( 98 | "Failed to retrieve table from local table cache", 99 | table=table, 100 | cause=str(e), 101 | ) 102 | return None 103 | 104 | @abstractmethod 105 | def _has_table(self, table: Table, as_type: type) -> bool: 106 | """Check if the given table is in the cache""" 107 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/table/sql/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .sql import SQLTableStore 4 | 5 | __all__ = [ 6 | "SQLTableStore", 7 | ] 8 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/table/sql/dialects/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .duckdb import DuckDBTableStore 4 | from .ibm_db2 import IBMDB2TableStore 5 | from .mssql import MSSqlTableStore 6 | from .postgres import PostgresTableStore 7 | from .snowflake import SnowflakeTableStore 8 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/table/sql/dialects/snowflake.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | import warnings 5 | from typing import Literal 6 | 7 | from pydiverse.pipedag.backend.table.sql.hooks import ( 8 | IbisTableHook, 9 | ) 10 | from pydiverse.pipedag.backend.table.sql.sql import SQLTableStore 11 | 12 | try: 13 | import snowflake 14 | except ImportError as e: 15 | warnings.warn(str(e), ImportWarning) 16 | snowflake = None 17 | 18 | 19 | class SnowflakeTableStore(SQLTableStore): 20 | """ 21 | SQLTableStore that supports `Snowflake`_. 
22 | 23 | Takes the same arguments as 24 | :py:class:`SQLTableStore ` 25 | """ 26 | 27 | _dialect_name = "snowflake" 28 | 29 | def _default_isolation_level(self) -> str | None: 30 | return None # "READ UNCOMMITTED" does not exist in Snowflake 31 | 32 | def optional_pause_for_db_transactionality( 33 | self, 34 | prev_action: Literal[ 35 | "table_drop", 36 | "table_create", 37 | "schema_drop", 38 | "schema_create", 39 | "schema_rename", 40 | ], 41 | ): 42 | _ = prev_action 43 | # The snowflake backend has transactionality problems with very quick 44 | # DROP/CREATE or RENAME activities for both schemas and tables 45 | # which happen in testing. 46 | time.sleep(2) 47 | 48 | def _init_database(self): 49 | create_database = self.engine_url.database.split("/")[0] 50 | with self.engine.connect() as conn: 51 | if not [ 52 | x.name 53 | for x in conn.exec_driver_sql("SHOW DATABASES").mappings().all() 54 | if x.name.upper() == create_database.upper() 55 | ]: 56 | self._init_database_with_database( 57 | "snowflake", 58 | disable_exists_check=True, 59 | create_database=create_database, 60 | ) 61 | 62 | 63 | try: 64 | import ibis 65 | except ImportError: 66 | ibis = None 67 | 68 | 69 | @SnowflakeTableStore.register_table(ibis) 70 | class IbisTableHook(IbisTableHook): 71 | @classmethod 72 | def _conn(cls, store: SnowflakeTableStore): 73 | return ibis.snowflake._from_url(store.engine_url) 74 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/backend/table/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .dtype import DType, PandasDTypeBackend 4 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/context/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydiverse.pipedag.context.context import ( 4 | ConfigContext, 5 | DAGContext, 6 | StageLockContext, 7 | TaskContext, 8 | ) 9 | from pydiverse.pipedag.context.run_context import ( 10 | FinalTaskState, 11 | RunContext, 12 | RunContextServer, 13 | ) 14 | 15 | __all__ = [ 16 | "DAGContext", 17 | "TaskContext", 18 | "ConfigContext", 19 | "RunContext", 20 | "RunContextServer", 21 | "StageLockContext", 22 | "FinalTaskState", 23 | ] 24 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/core/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .config import PipedagConfig 4 | from .flow import Flow, Subflow 5 | from .group_node import GroupNode, VisualizationStyle 6 | from .result import Result 7 | from .stage import Stage 8 | from .task import Task, UnboundTask 9 | 10 | __all__ = [ 11 | "Flow", 12 | "Subflow", 13 | "PipedagConfig", 14 | "Result", 15 | "Stage", 16 | "GroupNode", 17 | "VisualizationStyle", 18 | "UnboundTask", 19 | "Task", 20 | ] 21 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/debug/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydiverse.pipedag.materialize.debug import materialize_table 4 | 5 | __all__ = ["materialize_table"] 6 | -------------------------------------------------------------------------------- 
/src/pydiverse/pipedag/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .base import OrchestrationEngine 4 | from .dask import DaskEngine 5 | 6 | # don't import prefect engines by default because importing prefect messes with 7 | # initialization of logging library 8 | # from .prefect import PrefectEngine, PrefectOneEngine, PrefectTwoEngine 9 | from .sequential import SequentialEngine 10 | 11 | __all__ = [ 12 | "OrchestrationEngine", 13 | # "PrefectEngine", 14 | # "PrefectOneEngine", 15 | # "PrefectTwoEngine", 16 | "SequentialEngine", 17 | "DaskEngine", 18 | ] 19 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/engine/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | from pydiverse.pipedag import ExternalTableReference, Task 7 | from pydiverse.pipedag.util import Disposable 8 | 9 | if TYPE_CHECKING: 10 | from pydiverse.pipedag.core import Result, Subflow 11 | 12 | 13 | class OrchestrationEngine(Disposable, ABC): 14 | """Flow orchestration engine base class""" 15 | 16 | @abstractmethod 17 | def run( 18 | self, 19 | flow: Subflow, 20 | ignore_position_hashes: bool = False, 21 | inputs: dict[Task, ExternalTableReference] | None = None, 22 | **kwargs, 23 | ) -> Result: 24 | """Execute a flow 25 | 26 | :param flow: the pipedag flow to execute 27 | :param ignore_position_hashes: 28 | If ``True``, the position hashes of tasks are not checked 29 | when retrieving the inputs of a task from the cache. 30 | This simplifies execution of subgraphs if you don't care whether inputs to 31 | that subgraph are cache invalid. This allows multiple modifications in the 32 | Graph before the next run updating the cache. 33 | Attention: This may break automatic cache invalidation. 34 | And for this to work, any task producing an input 35 | for the chosen subgraph may never be used more 36 | than once per stage. 37 | :param kwargs: Optional keyword arguments. How they get used is 38 | engine specific. 39 | :return: A result instance wrapping the flow execution result. 
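        Illustrative sketch (not part of pipedag itself; ``MyEngine`` is a hypothetical
        name and the body is left as a placeholder): a custom engine only needs to
        subclass ``OrchestrationEngine``, accept the signature above, and return a
        ``Result``::

            class MyEngine(OrchestrationEngine):
                def run(self, flow, ignore_position_hashes=False, inputs=None, **kwargs):
                    # execute flow.get_tasks() in a valid order, calling task.run(...)
                    # for each task, and wrap the collected outputs in a Result
                    ...

        ``SequentialEngine`` in ``engine/sequential.py`` (below) is the real reference
        implementation.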
40 | """ 41 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/engine/sequential.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from pydiverse.pipedag import ExternalTableReference, Table, Task 6 | from pydiverse.pipedag.context import ConfigContext, RunContext 7 | from pydiverse.pipedag.core.result import Result 8 | from pydiverse.pipedag.engine.base import ( 9 | OrchestrationEngine, 10 | ) 11 | 12 | if TYPE_CHECKING: 13 | from pydiverse.pipedag.core import Subflow 14 | 15 | 16 | class SequentialEngine(OrchestrationEngine): 17 | """Most basic orchestration engine that just executes all tasks sequentially.""" 18 | 19 | def run( 20 | self, 21 | flow: Subflow, 22 | ignore_position_hashes: bool = False, 23 | inputs: dict[Task, ExternalTableReference] | None = None, 24 | **run_kwargs, 25 | ): 26 | run_context = RunContext.get() 27 | config_context = ConfigContext.get() 28 | 29 | failed_tasks = set() # type: set[Task] 30 | results = {} 31 | exception = None 32 | inputs = inputs if inputs is not None else {} 33 | 34 | try: 35 | for task in flow.get_tasks(): 36 | try: 37 | if not (set(task.input_tasks) & failed_tasks): 38 | task_inputs = { 39 | **{ 40 | in_id: results[in_t] 41 | for in_id, in_t in task.input_tasks.items() 42 | if in_t in results and in_t not in inputs 43 | }, 44 | **{ 45 | in_id: Table(inputs[in_t]) 46 | for in_id, in_t in task.input_tasks.items() 47 | if in_t in inputs 48 | }, 49 | } 50 | 51 | results[task] = task.run( 52 | inputs=task_inputs, 53 | run_context=run_context, 54 | config_context=config_context, 55 | ignore_position_hashes=ignore_position_hashes, 56 | ) 57 | else: 58 | failed_tasks.add(task) 59 | except Exception as e: 60 | if config_context.fail_fast: 61 | raise e 62 | if config_context._swallow_exceptions: 63 | exception = e 64 | failed_tasks.add(task) 65 | else: 66 | raise e 67 | 68 | except Exception as e: 69 | if config_context.fail_fast: 70 | raise e 71 | exception = e 72 | 73 | return Result.init_from( 74 | subflow=flow, 75 | underlying=results, 76 | successful=(exception is None), 77 | task_values=results, 78 | exception=exception, 79 | ) 80 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/errors/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | class FlowError(Exception): 5 | """ 6 | Exception raised when there is an issue with the flow definition. 7 | """ 8 | 9 | 10 | class StageError(Exception): 11 | """ 12 | Exception raised when something is wrong with the stage. 13 | """ 14 | 15 | 16 | class GroupNodeError(Exception): 17 | """ 18 | Exception raised when something is wrong with a group node. 19 | """ 20 | 21 | 22 | class CacheError(Exception): 23 | """ 24 | Exception raised if something couldn't be retrieved from the cache. 25 | """ 26 | 27 | 28 | class LockError(Exception): 29 | """ 30 | Exception raised if something goes wrong while locking, for example if 31 | a lock expires before it has been released. 32 | """ 33 | 34 | 35 | class DuplicateNameError(ValueError): 36 | """ 37 | Exception raised if an object that is supposed to have a unique name doesn't. 38 | """ 39 | 40 | 41 | class IPCError(Exception): 42 | """ 43 | Exception raised when inter process communication fails. 
44 | """ 45 | 46 | 47 | class RemoteProcessError(IPCError): 48 | """ 49 | Exception raised if an exception occurred in the remote IPC process. 50 | """ 51 | 52 | 53 | class DisposedError(Exception): 54 | """ 55 | Exception raised when an object has been disposed, but some attributes are 56 | being accessed nevertheless. 57 | """ 58 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/src/pydiverse/pipedag/management/__init__.py -------------------------------------------------------------------------------- /src/pydiverse/pipedag/management/cli.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import pkgutil 5 | from pathlib import Path 6 | 7 | import click 8 | 9 | 10 | @click.group() 11 | def cli(): 12 | pass 13 | 14 | 15 | def find_commands(): 16 | commands_dir = Path(__file__).parent / "commands" 17 | return [ 18 | name 19 | for _, name, ispkg in pkgutil.iter_modules([str(commands_dir)]) 20 | if not ispkg and not name.startswith("_") 21 | ] 22 | 23 | 24 | def load_command(command: str): 25 | importlib.import_module(f"pydiverse.pipedag.management.commands.{command}") 26 | 27 | 28 | def dynamically_load_commands(): 29 | for command in find_commands(): 30 | load_command(command) 31 | 32 | 33 | dynamically_load_commands() 34 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/src/pydiverse/pipedag/management/commands/__init__.py -------------------------------------------------------------------------------- /src/pydiverse/pipedag/management/commands/clear_metadata.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import click 4 | 5 | from pydiverse.pipedag import PipedagConfig 6 | from pydiverse.pipedag.backend.table import SQLTableStore 7 | from pydiverse.pipedag.backend.table.sql.ddl import DropSchema 8 | from pydiverse.pipedag.management.cli import cli 9 | 10 | 11 | @cli.command() 12 | @click.option( 13 | "--config", 14 | "config_path", 15 | type=str, 16 | help="path of the pipedag config file to use", 17 | ) 18 | @click.option( 19 | "--instance", 20 | required=True, 21 | type=str, 22 | prompt=True, 23 | help="name of the instance to load from the config file", 24 | ) 25 | @click.option( 26 | "--flow", 27 | type=str, 28 | help="name of the flow to load from the config file", 29 | ) 30 | @click.option( 31 | "--per-user", 32 | is_flag=True, 33 | default=False, 34 | ) 35 | @click.confirmation_option( 36 | prompt=( 37 | "Are you sure that you want to clear all metadata? " 38 | "This action can't be undone."
39 | ) 40 | ) 41 | def clear_metadata( 42 | config_path: str | None, 43 | instance: str, 44 | flow: str | None, 45 | per_user: bool, 46 | ): 47 | """Clears all pipedag metadata.""" 48 | 49 | if config_path: 50 | pipedag_config = PipedagConfig(path=config_path) 51 | else: 52 | pipedag_config = PipedagConfig.default 53 | 54 | config = pipedag_config.get( 55 | instance=instance, 56 | flow=flow, 57 | per_user=per_user, 58 | ) 59 | 60 | with config: 61 | table_store: SQLTableStore = config.store.table_store 62 | 63 | assert isinstance( 64 | table_store, SQLTableStore 65 | ), "clear-metadata only supported for SQLTableStore" 66 | 67 | drop_schema = DropSchema( 68 | table_store.metadata_schema, 69 | if_exists=True, 70 | cascade=True, 71 | engine=table_store.engine, 72 | ) 73 | 74 | table_store.execute(drop_schema) 75 | 76 | click.echo("Did clear all metadata.") 77 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/management/commands/delete_schemas.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import click 4 | import sqlalchemy as sa 5 | 6 | from pydiverse.pipedag import PipedagConfig 7 | from pydiverse.pipedag.backend.table import SQLTableStore 8 | from pydiverse.pipedag.backend.table.sql.ddl import DropSchema 9 | from pydiverse.pipedag.container import Schema 10 | from pydiverse.pipedag.management.cli import cli 11 | 12 | 13 | @cli.command() 14 | @click.option( 15 | "--config", 16 | "config_path", 17 | type=str, 18 | help="path of the pipedag config file to use", 19 | ) 20 | @click.option( 21 | "--instance", 22 | required=True, 23 | type=str, 24 | prompt=True, 25 | help="name of the instance to load from the config file", 26 | ) 27 | @click.option( 28 | "--flow", 29 | type=str, 30 | help="name of the flow to load from the config file", 31 | ) 32 | @click.option( 33 | "--per-user", 34 | is_flag=True, 35 | default=False, 36 | ) 37 | @click.option( 38 | "--yes", 39 | is_flag=True, 40 | help="Confirm the action without prompting.", 41 | ) 42 | def delete_schemas( 43 | config_path: str | None, 44 | instance: str, 45 | flow: str | None, 46 | per_user: bool, 47 | yes: bool, 48 | ): 49 | """ 50 | Delete all schemas associated with an instance. 51 | 52 | Only works with SQLTableStore. 53 | """ 54 | 55 | if config_path: 56 | pipedag_config = PipedagConfig(path=config_path) 57 | else: 58 | pipedag_config = PipedagConfig.default 59 | 60 | config = pipedag_config.get( 61 | instance=instance, 62 | flow=flow, 63 | per_user=per_user, 64 | ) 65 | 66 | with config: 67 | table_store: SQLTableStore = config.store.table_store 68 | 69 | assert isinstance( 70 | table_store, SQLTableStore 71 | ), "delete-schemas only supported for SQLTableStore" 72 | 73 | prefix = table_store.schema_prefix 74 | suffix = table_store.schema_suffix 75 | 76 | inspector = sa.inspect(table_store.engine) 77 | schema_names = inspector.get_schema_names() 78 | schema_names = [ 79 | schema 80 | for schema in schema_names 81 | if schema.startswith(prefix) and schema.endswith(suffix) 82 | ] 83 | 84 | if len(schema_names) == 0: 85 | click.echo("No matching schemas found. Aborting.") 86 | exit() 87 | 88 | database = table_store.engine_url.database 89 | click.echo(f"Found the following schemas (in database '{database}'):") 90 | for schema in schema_names: 91 | click.echo(f"- {schema}") 92 | 93 | if not yes: 94 | click.confirm( 95 | "Are you sure you want to continue? 
" 96 | "This will delete all the schemas listed above. " 97 | "This action can't be undone.", 98 | abort=True, 99 | ) 100 | 101 | schemas = [Schema(name, "", "") for name in schema_names] 102 | for schema in schemas: 103 | drop_schema = DropSchema(schema, cascade=True, engine=table_store.engine) 104 | table_store.execute(drop_schema) 105 | 106 | click.echo("Did delete all schemas.") 107 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/materialize/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .core import input_stage_versions, materialize 4 | 5 | __all__ = [ 6 | "materialize", 7 | "input_stage_versions", 8 | ] 9 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/materialize/cache.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import itertools 4 | from dataclasses import dataclass 5 | from functools import cached_property 6 | from typing import TYPE_CHECKING 7 | 8 | from pydiverse.pipedag.util.hashing import stable_hash 9 | 10 | if TYPE_CHECKING: 11 | from pydiverse.pipedag import Table 12 | from pydiverse.pipedag.materialize.core import MaterializingTask 13 | 14 | 15 | class ImperativeMaterializationState: 16 | def __init__(self): 17 | # every imperatively materialized table is an assumed dependency of 18 | # subsequently materialized tables of the same task 19 | self.assumed_dependencies: set[Table] = set() 20 | # Table(...).materialize() returns dematerialized objects. We need to find the 21 | # corresponding Table objects for handing returned objects over to consumer 22 | # tasks. 23 | self.object_lookup: dict[int, Table] = {} 24 | self.table_ids: set[int] = set() 25 | self.auto_suffix_counter = itertools.count() 26 | 27 | def add_table_lookup(self, obj, table: Table): 28 | self.assumed_dependencies.add(table) 29 | self.object_lookup[id(obj)] = table 30 | self.table_ids.add(id(table)) 31 | 32 | 33 | @dataclass(frozen=True) 34 | class TaskCacheInfo: 35 | task: MaterializingTask 36 | input_hash: str 37 | cache_fn_hash: str 38 | cache_key: str 39 | assert_no_materialization: bool 40 | force_task_execution: bool 41 | 42 | @cached_property 43 | def imperative_materialization_state(self): 44 | """State used by Table.materialize()""" 45 | return ImperativeMaterializationState() 46 | 47 | 48 | def task_cache_key(task: MaterializingTask, input_hash: str, cache_fn_hash: str): 49 | """Cache key used to judge cache validity of the current task output. 50 | 51 | Also referred to as `task_hash`. 52 | 53 | For lazy objects, this hash isn't used to judge cache validity, instead it 54 | serves as an identifier to reference a specific task run. This can be the case 55 | if a task is determined to be cache-valid and the lazy query string is also 56 | the same, but the task_hash is different from a previous run. Then we can 57 | compute this combined_cache_key from the task's cache metadata to determine 58 | which lazy object to use as cache. 59 | 60 | :param task: task for which the cache key is computed 61 | :param input_hash: hash used for checking whether task is cache invalid due 62 | to changing input. 63 | :param cache_fn_hash: same as input_hash but for external inputs which need 64 | manual cache invalidation function. 65 | :return: The hash / cache key (str). 
66 | """ 67 | 68 | return stable_hash( 69 | "TASK", 70 | task.name, 71 | task.version, 72 | input_hash, 73 | cache_fn_hash, 74 | ) 75 | 76 | 77 | def lazy_table_cache_key(task_hash: str, query_hash: str): 78 | return stable_hash( 79 | "LAZY_TABLE", 80 | task_hash, 81 | query_hash, 82 | ) 83 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/materialize/metadata.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime 4 | from dataclasses import dataclass 5 | 6 | 7 | @dataclass 8 | class TaskMetadata: 9 | """Metadata associated with a task 10 | 11 | This metadata object contains all the necessary information that is 12 | needed for determining if a task has already been executed with the 13 | same inputs, and all the information that is needed to reconstruct 14 | the output. 15 | """ 16 | 17 | name: str 18 | stage: str 19 | version: str | None 20 | timestamp: datetime.datetime 21 | run_id: str 22 | position_hash: str 23 | input_hash: str 24 | cache_fn_hash: str 25 | output_json: str 26 | 27 | 28 | @dataclass 29 | class LazyTableMetadata: 30 | """Metadata associated with a 'lazy table' 31 | 32 | This class is only provided for convenience for those table store 33 | backends that implement the `lazy` option for the `store_table` method. 34 | 35 | The `query_hash` is a hash of the query string that produced this table. 36 | The `task_hash` is the combined hash of the task that produced this table. 37 | 38 | The `name` and `stage` values are used to retrieve the appropriate 39 | table from the cache. 40 | 41 | Attention: `task_hash` is sometimes taken from cache and thus is not guaranteed 42 | to refer to the `task_hash` that corresponds to the currently executed task. 43 | Instead, it refers to the task that originally produced this object. 44 | """ 45 | 46 | name: str 47 | stage: str 48 | query_hash: str 49 | task_hash: str 50 | 51 | 52 | @dataclass 53 | class RawSqlMetadata: 54 | """Metadata associated with raw sql statements 55 | 56 | The `query_hash` is a hash of the raw sql string. 57 | The `task_hash` is the combined hash of the task that produced statement. 58 | 59 | The `prev_objects` and `stage` values are used to retrieve the appropriate 60 | tables from the cache. 61 | 62 | Attention: `task_hash` is sometimes taken from cache and thus is not guaranteed 63 | to refer to the `task_hash` that corresponds to the currently executed task. 64 | Instead, it refers to the task that originally produced this object. 65 | """ 66 | 67 | prev_objects: list[str] 68 | new_objects: list[str] 69 | stage: str 70 | query_hash: str 71 | task_hash: str 72 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .deep_map import deep_map 4 | from .deep_merge import deep_merge 5 | from .disposable import Disposable 6 | from .import_ import requires 7 | from .naming import normalize_name, safe_name 8 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/util/deep_map.py: -------------------------------------------------------------------------------- 1 | """Generic deep map or mutation operations. 
2 | 3 | Heavily inspired by the builtin copy module of python: 4 | https://github.com/python/cpython/blob/main/Lib/copy.py 5 | """ 6 | from __future__ import annotations 7 | 8 | from typing import Callable 9 | 10 | _nil = [] 11 | 12 | 13 | def deep_map(x, fn: Callable, memo=None): 14 | if memo is None: 15 | memo = {} 16 | 17 | d = id(x) 18 | y = memo.get(d, _nil) 19 | if y is not _nil: 20 | return y 21 | 22 | cls = type(x) 23 | 24 | if cls == list: 25 | y = _deep_map_list(x, fn, memo) 26 | elif cls == tuple: 27 | y = _deep_map_tuple(x, fn, memo) 28 | elif cls == dict: 29 | y = _deep_map_dict(x, fn, memo) 30 | else: 31 | y = fn(x) 32 | 33 | # If is its own copy, don't memoize. 34 | if y is not x: 35 | memo[d] = y 36 | _keep_alive(x, memo) # Make sure x lives at least as long as d 37 | 38 | return y 39 | 40 | 41 | def _deep_map_list(x, fn, memo): 42 | y = [] 43 | append = y.append 44 | for a in x: 45 | append(deep_map(a, fn, memo)) 46 | return fn(y) 47 | 48 | 49 | def _deep_map_tuple(x, fn, memo): 50 | y = [deep_map(a, fn, memo) for a in x] 51 | # We're not going to put the tuple in the memo, but it's still important we 52 | # check for it, in case the tuple contains recursive mutable structures. 53 | try: 54 | return memo[id(x)] 55 | except KeyError: 56 | pass 57 | for k, j in zip(x, y): 58 | if k is not j: 59 | y = tuple(y) 60 | break 61 | else: 62 | y = x 63 | return fn(y) 64 | 65 | 66 | def _deep_map_dict(x, fn, memo): 67 | y = {} 68 | memo[id(x)] = y 69 | for key, value in x.items(): 70 | y[deep_map(key, fn, memo)] = deep_map(value, fn, memo) 71 | return fn(y) 72 | 73 | 74 | def _keep_alive(x, memo): 75 | """Keeps a reference to the object x in the memo. 76 | Because we remember objects by their id, we have 77 | to assure that possibly temporary objects are kept 78 | alive by referencing them. 79 | We store a reference at the id of the memo, which should 80 | normally not be used unless someone tries to deepcopy 81 | the memo itself... 82 | """ 83 | try: 84 | memo[id(memo)].append(x) 85 | except KeyError: 86 | # aha, this is the first one :-) 87 | memo[id(memo)] = [x] 88 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/util/deep_merge.py: -------------------------------------------------------------------------------- 1 | """Generic deep update function for nested dictionaries. 2 | 3 | Seems to be solved already in various ways (do we like an extra dependency for pydantic.deep_update?) 4 | https://stackoverflow.com/questions/3232943/update-value-of-a-nested-dictionary-of-varying-depth 5 | But for snippets, license restrictions exist: 6 | https://www.ictrecht.nl/en/blog/what-is-the-license-status-of-stackoverflow-code-snippets 7 | """ # noqa: E501 8 | from __future__ import annotations 9 | 10 | from collections.abc import Iterable, Mapping 11 | 12 | from box import Box 13 | 14 | 15 | def deep_merge(x, y, check_enum=False): 16 | if type(x) != type(y) and not (isinstance(x, Mapping) and isinstance(y, Mapping)): 17 | raise TypeError( 18 | f"deep_merge failed due to type mismatch '{x}' (type: {type(x)}) vs. 
'{y}'" 19 | f" (type: {type(y)})" 20 | ) 21 | 22 | if isinstance(x, Box): 23 | z = Box(_deep_merge_dict(x, y), frozen_box=True) 24 | elif isinstance(x, Mapping): 25 | z = _deep_merge_dict(x, y) 26 | elif isinstance(x, Iterable) and not isinstance(x, str): 27 | z = _deep_merge_iterable(x, y) 28 | else: 29 | z = y # update 30 | 31 | return z 32 | 33 | 34 | def _deep_merge_iterable(x: Iterable, y: Iterable): 35 | # Merging lists is not trivial. 36 | # There are a few different strategies: replace, unique, append, intersection, ... 37 | return y 38 | # return [*x, *y] 39 | # return [deep_merge(a, b) for a, b in zip(x, y)] 40 | 41 | 42 | def _deep_merge_dict(x: Mapping, y: Mapping): 43 | z = dict(x) 44 | for key in x: 45 | if key in y: 46 | if y[key] is None: 47 | # this is a special case but we have no other way in yaml to express 48 | # the deletion of fields from a dictionary in an override config 49 | del z[key] 50 | else: 51 | z[key] = deep_merge(x[key], y[key]) 52 | z.update({key: value for key, value in y.items() if key not in z}) 53 | return z 54 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/util/disposable.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydiverse.pipedag.errors import DisposedError 4 | 5 | 6 | class Disposable: 7 | def __getattribute__(self, name): 8 | try: 9 | object.__getattribute__(self, "_Disposable__disposed") 10 | obj_type = object.__getattribute__(self, "__class__") 11 | raise DisposedError(f"Object of type {obj_type} has already been disposed.") 12 | except AttributeError: 13 | pass 14 | 15 | return object.__getattribute__(self, name) 16 | 17 | def __setattr__(self, key, value): 18 | try: 19 | object.__getattribute__(self, "_Disposable__disposed") 20 | obj_type = object.__getattribute__(self, "__class__") 21 | raise DisposedError(f"Object of type {obj_type} has already been disposed.") 22 | except AttributeError: 23 | pass 24 | 25 | return object.__setattr__(self, key, value) 26 | 27 | def dispose(self): 28 | object.__setattr__(self, "_Disposable__disposed", True) 29 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/util/hashing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import base64 4 | import hashlib 5 | 6 | 7 | def stable_hash(*args: str) -> str: 8 | """Compute a hash over a set of strings 9 | 10 | :param args: Some strings from which to compute the cache key 11 | :return: A sha256 base32 digest, trimmed to 20 char length 12 | """ 13 | 14 | combined_hash = hashlib.sha256(b"PIPEDAG") 15 | for arg in args: 16 | arg_bytes = str(arg).encode("utf8") 17 | arg_bytes_len = len(arg_bytes).to_bytes(length=8, byteorder="big") 18 | 19 | combined_hash.update(arg_bytes_len) 20 | combined_hash.update(arg_bytes) 21 | 22 | # Only take first 20 characters of base32 digest (100 bits). This 23 | # provides 50 bits of collision resistance, which is more than enough. 24 | # To illustrate: If you were to generate 1k hashes per second, 25 | # you still would have to wait over 800k years until you encounter 26 | # a collision. 
27 | 28 | # NOTE: Can't use base64 because it contains lower and upper case 29 | # letters; identifiers in pipedag are all lowercase 30 | hash_digest = combined_hash.digest() 31 | hash_str = base64.b32encode(hash_digest).decode("ascii").lower() 32 | return hash_str[:20] 33 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/util/naming.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import itertools 4 | 5 | 6 | def normalize_name(name: str) -> str: 7 | """Normalizes an identifier 8 | 9 | All names in PipeDAG are case-insensitive and can't contain any 10 | slashes. This helper function does exactly this conversion. 11 | """ 12 | if name is not None: 13 | return name.casefold().strip().replace("/", "_") 14 | 15 | 16 | def safe_name(name: str) -> str: 17 | """Converts an identifier to one that is lowercase, ascii only 18 | 19 | Some backends might only support a limited set of characters for 20 | identifiers. This generic function provides a mechanism for making 21 | a name safe (at least in most bases) by encoding non-ASCII characters 22 | using punycode. 23 | 24 | :param name: The identifier / name to make safe 25 | :return: The safe name 26 | """ 27 | name = normalize_name(name) 28 | name = name.encode("punycode").decode("ascii") 29 | return name 30 | 31 | 32 | class NameDisambiguator: 33 | """State object for creating non-colliding names 34 | 35 | This object is used inside `TableHook.retrieve` to prevent SQLAlchemy issues... 36 | """ 37 | 38 | def __init__(self): 39 | self.used_names = set() 40 | self.counter = itertools.count() 41 | 42 | def get_name(self, name: str | None) -> str: 43 | new_name = name 44 | while new_name in self.used_names: 45 | new_name = f"alias_{next(self.counter)}" 46 | 47 | self.used_names.add(new_name) 48 | return new_name 49 | -------------------------------------------------------------------------------- /src/pydiverse/pipedag/util/structlog.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import sys 5 | import textwrap 6 | from io import StringIO 7 | 8 | import structlog 9 | from structlog.typing import EventDict, WrappedLogger 10 | 11 | 12 | class StructlogHandler(logging.Handler): 13 | """ 14 | Stdlib logging handler that feeds all events back into structlog 15 | 16 | Can't be used with a structlog logger_factory that uses the logging library, 17 | otherwise logging would result in an infinite loop. 18 | """ 19 | 20 | def __init__(self, *args, **kw): 21 | super().__init__(*args, **kw) 22 | self._log = structlog.get_logger() 23 | 24 | def emit(self, record): 25 | msg = self.format(record) 26 | self._log.log(record.levelno, msg, logger=record.name) 27 | 28 | 29 | class PipedagConsoleRenderer(structlog.dev.ConsoleRenderer): 30 | """ 31 | Custom subclass of the structlog ConsoleRenderer that allows rendering 32 | specific values in the event dict on separate lines.
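    Illustrative example (``setup_logging`` further below wires this renderer up with a
    longer list of keys; the two keys shown here are just a subset)::

        renderer = PipedagConsoleRenderer(render_keys=["query", "table"])
        # values stored under these keys are popped from the event dict and printed
        # indented on their own lines below the regular log line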
33 | """ 34 | 35 | def __init__(self, *args, **kwargs): 36 | self._render_keys = kwargs.pop("render_keys", []) 37 | super().__init__(*args, **kwargs) 38 | 39 | def __call__(self, logger: WrappedLogger, name: str, event_dict: EventDict): 40 | render_objects = {} 41 | for key in self._render_keys: 42 | obj = event_dict.pop(key, None) 43 | if obj is not None: 44 | render_objects[key] = obj 45 | 46 | result = super().__call__(logger, name, event_dict) 47 | sio = StringIO() 48 | sio.write(result) 49 | 50 | for key, obj in render_objects.items(): 51 | string_rep = str(obj) 52 | sio.write( 53 | "\n" 54 | + " [" 55 | + self._styles.kv_key 56 | + key 57 | + self._styles.reset 58 | + "]" 59 | + "\n" 60 | + textwrap.indent(string_rep, prefix=" " + self._styles.kv_value) 61 | + self._styles.reset 62 | ) 63 | 64 | return sio.getvalue() 65 | 66 | 67 | def setup_logging( 68 | log_level=logging.INFO, 69 | log_stream=sys.stderr, 70 | timestamp_format="%Y-%m-%d %H:%M:%S.%f", 71 | ): 72 | """Configures structlog and logging with sane defaults.""" 73 | 74 | # Redirect all logs submitted to logging to structlog 75 | logging.basicConfig( 76 | format="%(message)s", 77 | level=log_level, 78 | handlers=[StructlogHandler()], 79 | ) 80 | 81 | # Configure structlog 82 | structlog.configure( 83 | processors=[ 84 | structlog.contextvars.merge_contextvars, 85 | structlog.processors.StackInfoRenderer(), 86 | structlog.dev.set_exc_info, 87 | structlog.processors.add_log_level, 88 | structlog.processors.TimeStamper(timestamp_format), 89 | PipedagConsoleRenderer( 90 | render_keys=["query", "table_obj", "task", "table", "detail"] 91 | ), 92 | ], 93 | wrapper_class=structlog.make_filtering_bound_logger(log_level), 94 | logger_factory=structlog.PrintLoggerFactory(log_stream), 95 | cache_logger_on_first_use=True, 96 | ) 97 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/fixtures/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/instances.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from itertools import chain 4 | 5 | import pytest 6 | 7 | from pydiverse.pipedag import PipedagConfig 8 | 9 | __all__ = [ 10 | "DATABASE_INSTANCES", 11 | "ORCHESTRATION_INSTANCES", 12 | "ALL_INSTANCES", 13 | "with_instances", 14 | "skip_instances", 15 | ] 16 | 17 | 18 | # Pytest markers associated with specific instance name 19 | INSTANCE_MARKS = { 20 | # Database Instances 21 | "postgres": pytest.mark.postgres, 22 | "postgres_unlogged": pytest.mark.postgres, 23 | "mssql": pytest.mark.mssql, 24 | "mssql_pytsql": pytest.mark.mssql, 25 | "ibm_db2": pytest.mark.ibm_db2, 26 | "ibm_db2_avoid_schema": pytest.mark.ibm_db2, 27 | "ibm_db2_materialization_details": pytest.mark.ibm_db2, 28 | "duckdb": pytest.mark.duckdb, 29 | "snowflake": pytest.mark.snowflake, 30 | # Local Table Cache Instances 31 | "local_table_cache": pytest.mark.postgres, 32 | 
"local_table_cache_inout": pytest.mark.postgres, 33 | "local_table_cache_inout_numpy": pytest.mark.postgres, 34 | "local_table_store": pytest.mark.postgres, 35 | # Orchestration Instances 36 | "dask_engine": [pytest.mark.dask, pytest.mark.postgres], 37 | "prefect_engine": [pytest.mark.prefect, pytest.mark.postgres], 38 | } 39 | 40 | # Collection of instances that represent different database technologies 41 | DATABASE_INSTANCES = ( 42 | "postgres", 43 | "mssql", 44 | "ibm_db2", 45 | "duckdb", 46 | ) 47 | 48 | ORCHESTRATION_INSTANCES = ( 49 | "dask_engine", 50 | "prefect_engine", 51 | ) 52 | 53 | # Extended collection of instances 54 | ALL_INSTANCES = ( 55 | "postgres", 56 | "postgres_unlogged", 57 | "mssql", 58 | "mssql_pytsql", 59 | "ibm_db2", 60 | "ibm_db2_avoid_schema", 61 | "ibm_db2_materialization_details", 62 | "duckdb", 63 | "snowflake", 64 | "local_table_cache", 65 | ) 66 | 67 | 68 | def with_instances(*instances, **kwargs): 69 | """Decorator to run a test with a specific set of instances 70 | 71 | :param instances: Names of the instances to use. 72 | :param kwargs: keyword arguments passed to PipedagConfig.default.get() 73 | """ 74 | return pytest.mark.instances(*flatten(instances), **kwargs) 75 | 76 | 77 | def skip_instances(*instances): 78 | """Decorator to skip running a test with a specific set of instances""" 79 | return pytest.mark.skip_instances(*flatten(instances)) 80 | 81 | 82 | def flatten(it): 83 | """Flatten an iterable""" 84 | if isinstance(it, (list, tuple)): 85 | yield from chain(*map(flatten, it)) 86 | else: 87 | yield it 88 | 89 | 90 | # FIXTURE IMPLEMENTATION 91 | 92 | 93 | @pytest.fixture(autouse=True, scope="function", name="run_with_instance") 94 | def fixture_run_with_instance(request): 95 | """Fixture that runs test with different config instances""" 96 | if hasattr(request, "param"): 97 | instance, kwargs = request.param 98 | config = PipedagConfig.default.get(instance=instance, **kwargs) 99 | with config: 100 | yield instance 101 | else: 102 | yield None 103 | -------------------------------------------------------------------------------- /tests/parallelize/README.md: -------------------------------------------------------------------------------- 1 | # Pipedag Parallelize 2 | 3 | This is a pytest plugin similar to `pytest-xdist` that allows executing tests in parallel. 4 | To prevent two tasks that run on the same instance from corrupting each other's data, 5 | it allows grouping tests together using the `pytest_parallelize_group_items` hook. 6 | Tests that have been grouped, run sequentially on the same worker. 7 | Different groups run in parallel on different workers. 8 | 9 | To specify the number of workers, use the `--workers` argument. 10 | -------------------------------------------------------------------------------- /tests/parallelize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/parallelize/__init__.py -------------------------------------------------------------------------------- /tests/parallelize/hooks.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | 6 | @pytest.hookspec() 7 | def pytest_parallelize_group_items(config, items): 8 | ... 
9 | -------------------------------------------------------------------------------- /tests/parallelize/plugin.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .sesson import Session 6 | from .util import parse_config 7 | 8 | 9 | def pytest_addoption(parser): 10 | workers_help = ( 11 | "Set the max num of workers (aka processes) to start " 12 | "(int or 'auto' - one per core)" 13 | ) 14 | 15 | group = parser.getgroup("parallelize") 16 | group.addoption("--workers", dest="workers", help=workers_help) 17 | parser.addini("workers", workers_help) 18 | 19 | 20 | @pytest.hookimpl(trylast=True) 21 | def pytest_configure(config): 22 | workers = parse_config(config, "workers") 23 | if config.option.collectonly or not workers: 24 | return 25 | 26 | config.pluginmanager.register(Session(config), "parallelize-session") 27 | 28 | try: 29 | # Patch _jb_pytest_runner to support parallel execution of tests 30 | # when using the PyCharm IDE 31 | from _jb_runner_tools import set_parallel_mode 32 | 33 | set_parallel_mode() 34 | except ImportError: 35 | pass 36 | 37 | 38 | @pytest.hookimpl 39 | def pytest_addhooks(pluginmanager): 40 | from . import hooks 41 | 42 | pluginmanager.add_hookspecs(hooks) 43 | -------------------------------------------------------------------------------- /tests/parallelize/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def parse_config(config, name): 5 | return getattr(config.option, name, config.getini(name)) 6 | -------------------------------------------------------------------------------- /tests/parallelize/worker.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC 4 | from multiprocessing import Queue 5 | 6 | import pytest 7 | from _pytest.config import Config 8 | 9 | 10 | def start_worker( 11 | worker_id: int, work_queue: Queue, msg_queue: Queue, args: list, option_dict: dict 12 | ): 13 | option_dict["plugins"].append("no:terminal") 14 | config = Config.fromdictargs(option_dict, args) 15 | config.args = args 16 | 17 | from typing import TextIO 18 | 19 | class DontPrint(TextIO, ABC): 20 | def write(*_): 21 | pass 22 | 23 | # TODO: find a way to fix pytest's assert inspection code for assertions raised in threads 24 | # The following code was meant to do this, but prevents tests from running at all.
25 | # # register dummy terminal reporter since it is needed by pytest even with 26 | # # plugins:"no:terminal" option 27 | # terminal_reporter = TerminalReporter(config, DontPrint()) 28 | # config.pluginmanager.register(terminal_reporter, "terminalreporter") 29 | 30 | # Remove workers option to prevent triggering main plugin 31 | config.option.workers = None 32 | 33 | worker = Worker(config, worker_id, work_queue, msg_queue) 34 | config.pluginmanager.register(worker) 35 | config.hook.pytest_cmdline_main(config=config) 36 | 37 | 38 | class Worker: 39 | def __init__(self, config, worker_id: int, work_queue: Queue, msg_queue: Queue): 40 | super().__init__() 41 | 42 | self.config = config 43 | self.worker_id = worker_id 44 | self.work_queue = work_queue 45 | self.msg_queue = msg_queue 46 | 47 | self.session_items = {} 48 | 49 | def send(self, msg, **kwargs): 50 | kwargs["worker_id"] = self.worker_id 51 | self.msg_queue.put((msg, kwargs)) 52 | 53 | @pytest.hookimpl 54 | def pytest_sessionstart(self, session): 55 | self.send("sessionstart") 56 | 57 | @pytest.hookimpl 58 | def pytest_sessionfinish(self, session): 59 | self.send("sessionfinish") 60 | 61 | @pytest.hookimpl 62 | def pytest_runtest_logstart(self, nodeid, location): 63 | self.send("logstart", nodeid=nodeid, location=location) 64 | 65 | @pytest.hookimpl 66 | def pytest_runtest_logfinish(self, nodeid, location): 67 | self.send("logfinish", nodeid=nodeid, location=location) 68 | 69 | @pytest.hookimpl 70 | def pytest_runtest_logreport(self, report): 71 | data = self.config.hook.pytest_report_to_serializable( 72 | config=self.config, 73 | report=report, 74 | ) 75 | self.send("logreport", report=data) 76 | 77 | @pytest.hookimpl 78 | def pytest_runtestloop(self, session): 79 | self.session_items = {item.nodeid: item for item in session.items} 80 | 81 | should_terminate = False 82 | while not should_terminate: 83 | command, args = self.work_queue.get() 84 | should_terminate = self.process_one_item(session, command, args) 85 | return True 86 | 87 | def process_one_item(self, session, command, args): 88 | if command == "STOP": 89 | return True 90 | 91 | if command == "GROUP": 92 | group_name, node_ids = args 93 | items = [self.session_items[node_id] for node_id in node_ids] 94 | 95 | self.send("DEBUG_start_group", group_name=group_name) 96 | for i, item in enumerate(items): 97 | next_item = items[i + 1] if i + 1 < len(items) else None 98 | self.run_one_test(session, item, next_item) 99 | 100 | return False 101 | 102 | def run_one_test(self, session, item, next_item): 103 | self.send("DEBUG_start_test", nodeid=item.nodeid) 104 | item.ihook.pytest_runtest_protocol(item=item, nextitem=next_item) 105 | if session.shouldfail: 106 | raise session.Failed(session.shouldfail) 107 | if session.shouldstop: 108 | raise session.Interrupted(session.shouldstop) 109 | -------------------------------------------------------------------------------- /tests/test_cache/test_auto_version.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pydiverse.pipedag import AUTO_VERSION, Blob, Flow, Stage, Table 7 | from pydiverse.pipedag.container import RawSql 8 | from pydiverse.pipedag.materialize.core import materialize 9 | from tests.fixtures.instances import with_instances 10 | from tests.util import swallowing_raises 11 | 12 | pytestmark = [with_instances("postgres"), with_instances("local_table_store")] 13 | 14 | 15 | # Specific backends 
have tests in the test_table_hooks folder 16 | 17 | 18 | def test_lazy_incompatible_with_auto_version(): 19 | with pytest.raises(ValueError): 20 | 21 | @materialize(input_type=pd.DataFrame, version=AUTO_VERSION, lazy=True) 22 | def task(): 23 | ... 24 | 25 | 26 | def test_missing_input_type_auto_version(): 27 | with pytest.raises(ValueError): 28 | 29 | @materialize(version=AUTO_VERSION) 30 | def task(): 31 | ... 32 | 33 | 34 | @with_instances("postgres") 35 | def test_auto_version_illegal_return_types(): 36 | @materialize(input_type=pd.DataFrame, version=AUTO_VERSION) 37 | def blob(): 38 | return Blob(1), Table(pd.DataFrame()) 39 | 40 | @materialize(input_type=pd.DataFrame, version=AUTO_VERSION) 41 | def raw_sql(): 42 | return RawSql("..."), Table(pd.DataFrame()) 43 | 44 | with Flow() as f: 45 | with Stage("auto_version"): 46 | _blob = blob() 47 | _raw_sql = raw_sql() 48 | 49 | with swallowing_raises(ValueError, match="Blob"): 50 | f.run(_blob) 51 | 52 | with swallowing_raises(ValueError, match="RawSql"): 53 | f.run(_raw_sql) 54 | 55 | 56 | def test_auto_version_not_supported(): 57 | import sqlalchemy as sa 58 | 59 | @materialize(input_type=sa.Table, version=AUTO_VERSION) 60 | def not_supported(): 61 | return Table(pd.DataFrame({"x": [1, 2, 3, 4]})) 62 | 63 | with Flow() as f: 64 | with Stage("auto_version"): 65 | _ = not_supported() 66 | 67 | with swallowing_raises(TypeError, match="Auto versioning not supported"): 68 | f.run() 69 | 70 | 71 | # TODO: Currently we only test that auto versioning actually works, 72 | # and that the task gets called the expected amount of times 73 | # in the polars hook tests. 74 | # Once we have support for auto versioning with pandas, we 75 | # might also want to put some tests into this file. 76 | -------------------------------------------------------------------------------- /tests/test_cache/test_local_table_cache.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import sqlalchemy as sa 5 | 6 | from pydiverse.pipedag import * 7 | from tests.fixtures.instances import with_instances 8 | 9 | 10 | @with_instances( 11 | "local_table_cache", 12 | "local_table_cache_inout", 13 | "local_table_cache_inout_numpy", 14 | "local_table_store", 15 | ) 16 | def test_local_table_cache(mocker): 17 | input_val_ = 0 18 | 19 | @materialize() 20 | def input_val(): 21 | return input_val_ 22 | 23 | @materialize(version="1.0") 24 | def select_pandas(x): 25 | # Supported by local caching 26 | return Table(pd.DataFrame({"x": [x]}), "pandas") 27 | 28 | @materialize(lazy=True) 29 | def select_sql(x): 30 | # Not supported by local caching 31 | return Table(sa.select(sa.literal(x).label("x")), "sql") 32 | 33 | @materialize(version="1.0", input_type=pd.DataFrame) 34 | def sink(*args): 35 | for arg in args: 36 | assert arg["x"][0] == input_val_ 37 | 38 | with Flow() as f: 39 | with Stage("stage"): 40 | x = input_val() 41 | 42 | s_pandas = select_pandas(x) 43 | s_sql = select_sql(x) 44 | 45 | _ = sink(s_pandas, s_sql) 46 | 47 | # Initial run to invalidate cache 48 | input_val_ = -1 49 | f.run() 50 | input_val_ = 0 51 | 52 | # Spy Setup 53 | config_context = ConfigContext.get() 54 | local_table_cache = config_context.store.local_table_cache 55 | 56 | si = int(local_table_cache.should_store_input) 57 | so = int(local_table_cache.should_store_output) 58 | siac = int(local_table_cache.should_use_stored_input_as_cache) 59 | 60 | store_table_spy = mocker.spy(local_table_cache, 
"store_table") 61 | store_input_spy = mocker.spy(local_table_cache, "store_input") 62 | _store_table_spy = mocker.spy(local_table_cache, "_store_table") 63 | 64 | retrieve_table_obj_spy = mocker.spy(local_table_cache, "retrieve_table_obj") 65 | _retrieve_table_obj_spy = mocker.spy(local_table_cache, "_retrieve_table_obj") 66 | 67 | # Initial Run 68 | f.run() 69 | 70 | expected_retrieve_table_obj = 2 71 | expected_successful_retrieve_table_obj = 1 * so * siac # pandas 72 | expected_store_table = 2 73 | expected_store_input = 2 - expected_successful_retrieve_table_obj 74 | 75 | assert store_table_spy.call_count == expected_store_table 76 | assert store_input_spy.call_count == expected_store_input 77 | assert retrieve_table_obj_spy.call_count == expected_retrieve_table_obj 78 | 79 | assert _store_table_spy.call_count == (expected_store_input * si) + ( 80 | expected_store_table * so 81 | ) 82 | assert _retrieve_table_obj_spy.call_count == expected_successful_retrieve_table_obj 83 | 84 | # Second Run 85 | store_table_spy.reset_mock() 86 | store_input_spy.reset_mock() 87 | _store_table_spy.reset_mock() 88 | retrieve_table_obj_spy.reset_mock() 89 | _retrieve_table_obj_spy.reset_mock() 90 | 91 | f.run() 92 | 93 | # Everything should be cache valid, thus no task should get executed. 94 | assert store_table_spy.call_count == 0 95 | assert store_input_spy.call_count == 0 96 | assert retrieve_table_obj_spy.call_count == 0 97 | -------------------------------------------------------------------------------- /tests/test_compression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | import sqlalchemy as sa 5 | 6 | from pydiverse.pipedag import ConfigContext, Flow, Stage, Table, materialize 7 | from pydiverse.pipedag.backend.table.sql.dialects import ( 8 | IBMDB2TableStore, 9 | MSSqlTableStore, 10 | ) 11 | 12 | # Parameterize all tests in this file with several instance_id configurations 13 | from tests.fixtures.instances import ( 14 | DATABASE_INSTANCES, 15 | skip_instances, 16 | with_instances, 17 | ) 18 | from tests.util import tasks_library as m 19 | 20 | pytestmark = [with_instances(DATABASE_INSTANCES)] 21 | 22 | 23 | @pytest.mark.parametrize( 24 | "task, stage_materialization_details", 25 | [ 26 | (m.simple_table_compressed_one_method, "adaptive_value_compression"), 27 | (m.simple_table_compressed_two_methods, "adaptive_value_compression"), 28 | (m.simple_dataframe_compressed_one_method, "adaptive_value_compression"), 29 | (m.simple_dataframe_compressed_two_methods, "adaptive_value_compression"), 30 | (m.simple_table_default_compressed, "adaptive_value_compression"), 31 | (m.simple_dataframe_uncompressed, None), 32 | ], 33 | ) 34 | @with_instances(DATABASE_INSTANCES, "ibm_db2_materialization_details") 35 | @skip_instances("ibm_db2") 36 | def test_compression(task, stage_materialization_details): 37 | @materialize(input_type=sa.Table, lazy=False) 38 | def get_compression_attributes(table: sa.sql.expression.Alias): 39 | query = f""" 40 | SELECT COMPRESSION, ROWCOMPMODE FROM SYSCAT.TABLES 41 | WHERE TABSCHEMA = '{table.original.schema.upper()}' 42 | AND TABNAME = '{table.original.name.upper()}' 43 | """ 44 | return Table(sa.text(query), f"compression_attributes_{table.name}") 45 | 46 | with Flow("flow") as f: 47 | with Stage("stage", materialization_details=stage_materialization_details): 48 | comp_exp_x, x = task() 49 | config = ConfigContext.get() 50 | store = config.store.table_store 51 | if 
isinstance(store, IBMDB2TableStore): 52 | comp_x = get_compression_attributes(x) 53 | m.assert_table_equal(comp_exp_x, comp_x) 54 | 55 | m.assert_table_equal(x, x) 56 | 57 | for _ in range(3): 58 | if ( 59 | not isinstance(store, (MSSqlTableStore, IBMDB2TableStore)) 60 | and task != m.simple_dataframe_uncompressed 61 | ): 62 | with pytest.raises( 63 | ValueError, 64 | match="To silence this exception set" 65 | " strict_materialization_details=False", 66 | ): 67 | assert f.run().successful 68 | else: 69 | assert f.run().successful 70 | -------------------------------------------------------------------------------- /tests/test_dask.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import io 4 | import pickle 5 | from io import BytesIO 6 | 7 | import dask 8 | import structlog 9 | from _pytest.capture import EncodedFile 10 | 11 | 12 | class A(io.TextIOWrapper): 13 | def __getstate__(self): 14 | return "a" 15 | 16 | def __reduce__(self): 17 | return A, (BytesIO(b"hello"),) 18 | 19 | def __reduce_ex__(self, protocol): 20 | _ = protocol 21 | return self.__reduce__() 22 | 23 | 24 | def test_that_io_wrapper_is_pickleable(): 25 | pickle.dumps(A(BytesIO(b"hello"))) 26 | 27 | 28 | def test_that_encoded_file_is_picklable(): 29 | pickle.dumps(EncodedFile(BytesIO(b"hello"), "utf-8")) 30 | 31 | 32 | def test_dask_structlog_configuration_does_not_prevent_pickling(): 33 | def bind_run(): 34 | structlog_config = structlog.get_config() 35 | 36 | def run(parent_futures, **kwargs): 37 | _ = parent_futures 38 | 39 | structlog.configure(**structlog_config) 40 | 41 | return 1 42 | 43 | run.__name__ = "hi" 44 | return dask.delayed(run, pure=False) 45 | 46 | results = [bind_run()(parent_futures=[])] 47 | kw = { 48 | "traverse": True, 49 | "optimize_graph": False, 50 | "scheduler": "processes", 51 | "num_workers": 8, 52 | "chunksize": 1, 53 | } 54 | 55 | dask.compute(results, **kw) 56 | -------------------------------------------------------------------------------- /tests/test_flows/complex_config_flows/postgres_password.yaml: -------------------------------------------------------------------------------- 1 | username: sa 2 | password: Pydiverse23 3 | host: 127.0.0.1 4 | port: 6543 5 | -------------------------------------------------------------------------------- /tests/test_flows/complex_config_flows/test_locking_instances.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from pydiverse.pipedag.context import StageLockContext 6 | from pydiverse.pipedag.core.config import PipedagConfig 7 | from tests.test_flows.complex_config_flows.test_instance_selection import ( 8 | cfg_file_path, 9 | check_result, 10 | get_flow, 11 | ) 12 | 13 | _ = cfg_file_path 14 | 15 | 16 | @pytest.mark.parametrize("instance", ["lock_zookeeper", "lock_file"]) 17 | def test_lock_manager_instances(cfg_file_path, instance): 18 | # At this point, an instance is chosen from multi-pipedag-instance 19 | # configuration file 20 | pipedag_config = PipedagConfig(cfg_file_path) 21 | cfg = pipedag_config.get(instance=instance) 22 | 23 | flow, out1, out2 = get_flow(cfg.attrs, pipedag_config) 24 | 25 | with StageLockContext(): 26 | result = flow.run(config=cfg) 27 | check_result(result, out1, out2) 28 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql/create_db_helpers.sql: 
-------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | /* 4 | Section: Procedures 5 | */ 6 | IF OBJECT_ID(N'{{out_schema}}.CREATEALLDATES', N'P') IS NOT NULL DROP PROCEDURE {{out_schema}}.CREATEALLDATES; 7 | GO 8 | CREATE PROCEDURE {{out_schema}}.CREATEALLDATES 9 | ( 10 | @StartDate AS DATE, @EndDate AS DATE 11 | ) AS 12 | DECLARE @Current AS DATE = DATEADD(DD, 0, @StartDate); DROP TABLE IF EXISTS ##alldates CREATE TABLE ##alldates ( 13 | dt DATE PRIMARY KEY 14 | ) WHILE @Current <= @EndDate BEGIN 15 | INSERT INTO ##alldates 16 | VALUES (@Current); 17 | SET @Current = DATEADD(DD, 1, @Current) -- add 1 to current day 18 | END 19 | GO 20 | 21 | 22 | /* 23 | Section: Functions 24 | */ 25 | IF OBJECT_ID(N'{{out_schema}}.get_db_sampling_factor', N'FN') IS NOT NULL DROP FUNCTION {{out_schema}}.get_db_sampling_factor; 26 | GO 27 | CREATE FUNCTION {{out_schema}}.get_db_sampling_factor () RETURNS INT AS 28 | BEGIN 29 | DECLARE @sampling_rate INT; 30 | SELECT @sampling_rate = ISNULL(TRY_CAST(RIGHT(DB_NAME(), LEN(DB_NAME()) - CHARINDEX('_m', DB_NAME()) - 1) AS INT), 31 | 1 -- fallback: take full sample 32 | ); 33 | RETURN @sampling_rate 34 | END; -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql/prep/entity_checks.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | DROP TABLE IF EXISTS {{out_schema}}.table01 3 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Create table') 4 | GO 5 | CREATE TABLE {{out_schema}}.table01 ( 6 | entity VARCHAR(17) NOT NULL 7 | , reason VARCHAR(50) NOT NULL 8 | PRIMARY KEY (entity, reason) 9 | ) 10 | 11 | 12 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Missing') 13 | GO 14 | INSERT INTO {{out_schema}}.table01 WITH (TABLOCKX) 15 | SELECT DISTINCT raw01.entity entity 16 | , 'Missing in raw01' reason 17 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 18 | LEFT JOIN ( 19 | SELECT DISTINCT entity 20 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 21 | ) raw01x 22 | ON raw01.entity = raw01x.entity 23 | WHERE raw01.end_date = '9999-01-01' 24 | AND raw01x.entity IS NULL 25 | 26 | 27 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - more missing in raw01') 28 | GO 29 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX) 30 | SELECT 31 | raw01.entity entity 32 | , 'missing' reason 33 | FROM {{in_schema}}.raw01 raw01 WITH(NOLOCK) 34 | GROUP BY raw01.entity 35 | HAVING MAX(raw01.end_date) < '9999-01-01' 36 | 37 | 38 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Inconsistency correction') 39 | GO 40 | WITH entity_ids AS ( 41 | SELECT DISTINCT raw01.entity entity 42 | FROM {{in_schema}}.raw01 raw01 WITH (NOLOCK) 43 | INNER JOIN ( -- filter 44 | SELECT entity 45 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 46 | WHERE end_date = '9999-01-01' 47 | ) raw01_final 48 | ON raw01.entity = raw01_final.entity 49 | WHERE 1=1 50 | ) 51 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX) 52 | SELECT x.entity 53 | , 'Inconsistency correction' reason 54 | FROM entity_ids x 55 | INNER JOIN entity_ids y 56 | ON x.entity = y.entity 57 | WHERE x.entity <> y.entity 58 | GROUP BY x.entity 59 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql/prep/more_tables.sql: 
-------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | /* 4 | SECTION: raw01A 5 | */ 6 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01A') 7 | GO 8 | CREATE TABLE {{out_schema}}.raw01A( 9 | entity VARCHAR(17) NOT NULL 10 | , start_date DATE NOT NULL 11 | , end_date DATE NOT NULL 12 | , PRIMARY KEY(entity, start_date) 13 | ) 14 | INSERT INTO {{out_schema}}.raw01A WITH(TABLOCKX) 15 | SELECT apgs.entity entity 16 | , apgs.start_date start_date 17 | , apgs.end_date end_date 18 | FROM ( 19 | SELECT entity 20 | , start_date 21 | , end_date 22 | FROM {{in_schema}}.raw01 apgs WITH(NOLOCK) 23 | ) apgs 24 | INNER JOIN ( 25 | SELECT DISTINCT entity 26 | FROM {{in_schema}}.raw01 WITH(NOLOCK) 27 | ) base 28 | ON apgs.entity = base.entity 29 | CREATE INDEX raw_start_date ON {{out_schema}}.raw01A (start_date DESC) 30 | CREATE INDEX raw_start_date_end_date ON {{out_schema}}.raw01A (end_date, start_date DESC) 31 | GO 32 | SELECT 'äöüßéç' as string_col INTO {{out_schema}}.special_chars 33 | GO 34 | CREATE TABLE {{out_schema}}.special_chars2 ( 35 | id TINYINT NOT NULL PRIMARY KEY, 36 | string_col VARCHAR(60) NOT NULL 37 | ) 38 | INSERT INTO {{out_schema}}.special_chars2 (id, string_col) VALUES 39 | (1, 'äöüßéç') 40 | GO 41 | -- check that both strings match and have length 7 with NOT NULL constraint 42 | CREATE TABLE {{out_schema}}.special_chars_join ( 43 | string_col VARCHAR(60) NOT NULL, 44 | string_col2 VARCHAR(60) NOT NULL, 45 | string_col3 VARCHAR(60) NOT NULL 46 | ) 47 | INSERT INTO {{out_schema}}.special_chars_join 48 | SELECT a.string_col, b.string_col, c.string_col 49 | FROM {{out_schema}}.special_chars a 50 | FULL OUTER JOIN {{out_schema}}.special_chars2 b ON a.string_col = b.string_col 51 | FULL OUTER JOIN {{out_schema}}.special_chars2 c ON a.string_col = c.string_col 52 | and len(a.string_col) = 6 and len(c.string_col) = 6 -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql/raw/raw_views.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | {{helper_schema}}.CREATEALLDATES '2022-01-01', '2023-01-01' 3 | SELECT * INTO {{out_schema}}.dummy_dates FROM ##alldates 4 | GO 5 | SELECT 1000000 as entity_nr, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO {{out_schema}}.schema00_raw01_table 6 | GO 7 | SELECT '1' as mod_type, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO {{out_schema}}.filter_table 8 | GO 9 | 10 | /* 11 | SECTION: SAMPLING 12 | */ 13 | GO 14 | DECLARE @START BIGINT = 0 + (SELECT CAST(MIN(entity_nr) AS BIGINT) FROM {{out_schema}}.schema00_raw01_table); 15 | DECLARE @END BIGINT = (SELECT CAST(MAX(entity_nr) AS BIGINT) FROM {{out_schema}}.schema00_raw01_table); 16 | DECLARE @STEP INT = {{helper_schema}}.get_db_sampling_factor(); 17 | DROP TABLE IF EXISTS {{out_schema}}.sample_entities; 18 | WITH L0 AS (SELECT c FROM (SELECT 1 UNION ALL SELECT 1) AS D(c)), -- 2^1 19 | L1 AS (SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B), -- 2^2 20 | L2 AS (SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B), -- 2^4 21 | L3 AS (SELECT 1 AS c FROM L2 AS A CROSS JOIN L2 AS B), -- 2^8 22 | L4 AS (SELECT 1 AS c FROM L3 AS A CROSS JOIN L3 AS B), -- 2^16 23 | L5 AS (SELECT 1 AS c FROM L4 AS A CROSS JOIN L4 AS B), -- 2^32 24 | Nums AS 
(SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS k FROM L5) 25 | SELECT k * @STEP + @START AS nr 26 | INTO {{out_schema}}.sample_entities 27 | FROM nums 28 | WHERE k <= (@END - @START) / @STEP 29 | CREATE UNIQUE CLUSTERED INDEX nr_index ON {{out_schema}}.sample_entities (nr) WITH ( FILLFACTOR = 100, DATA_COMPRESSION = ROW ); 30 | 31 | 32 | /* 33 | SECTION: Raw-Tables 34 | */ 35 | GO 36 | PRINT (CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01') 37 | DROP VIEW IF EXISTS {{out_schema}}.raw01 38 | GO 39 | CREATE VIEW {{out_schema}}.raw01 40 | AS 41 | SELECT entity_nr entity 42 | , start_date start_date 43 | , end_date end_date 44 | FROM {{out_schema}}.schema00_raw01_table WITH (NOLOCK) 45 | INNER JOIN sample_entities WITH (NOLOCK) 46 | ON entity_nr = sample_entities.nr 47 | 48 | 49 | /* 50 | SECTION: Reference tables 51 | */ 52 | 53 | GO 54 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.fm_mod_type') 55 | DROP VIEW IF EXISTS {{out_schema}}.fm_mod_type 56 | GO 57 | CREATE VIEW {{out_schema}}.fm_mod_type 58 | AS 59 | SELECT mod_type x_inv_type 60 | , start_date start_date 61 | , end_date end_date 62 | FROM {{out_schema}}.filter_table WITH(NOLOCK) 63 | GO 64 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql/create_db_helpers.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | USE {{out_database}} 3 | GO 4 | 5 | 6 | /* 7 | Section: Procedures 8 | */ 9 | IF OBJECT_ID(N'dbo.CREATEALLDATES', N'P') IS NOT NULL DROP PROCEDURE dbo.CREATEALLDATES; 10 | GO 11 | CREATE PROCEDURE CREATEALLDATES 12 | ( 13 | @StartDate AS DATE, @EndDate AS DATE 14 | ) AS 15 | DECLARE @Current AS DATE = DATEADD(DD, 0, @StartDate); DROP TABLE IF EXISTS ##alldates CREATE TABLE ##alldates ( 16 | dt DATE PRIMARY KEY 17 | ) WHILE @Current <= @EndDate BEGIN 18 | INSERT INTO ##alldates 19 | VALUES (@Current); 20 | SET @Current = DATEADD(DD, 1, @Current) -- add 1 to current day 21 | END 22 | GO 23 | 24 | 25 | /* 26 | Section: Functions 27 | */ 28 | IF OBJECT_ID(N'dbo.get_db_sampling_factor', N'FN') IS NOT NULL DROP FUNCTION get_db_sampling_factor; 29 | GO 30 | CREATE FUNCTION dbo.get_db_sampling_factor () RETURNS INT AS 31 | BEGIN 32 | DECLARE @sampling_rate INT; 33 | SELECT @sampling_rate = ISNULL(TRY_CAST(RIGHT(DB_NAME(), LEN(DB_NAME()) - CHARINDEX('_m', DB_NAME()) - 1) AS INT), 34 | 1 -- fallback: take full sample 35 | ); 36 | RETURN @sampling_rate 37 | END; -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql/prep/entity_checks.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | USE master -- no default schema 4 | GO 5 | 6 | CREATEALLDATES('2022-01-01', '2023-01-01') 7 | 8 | SELECT * INTO {{out_schema}}.dummy_dates FROM ##alldates 9 | 10 | DROP TABLE IF EXISTS {{out_schema}}.table01 11 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Create table') 12 | GO 13 | CREATE TABLE {{out_schema}}.table01 ( 14 | entity VARCHAR(17) NOT NULL 15 | , reason VARCHAR(50) NOT NULL 16 | PRIMARY KEY (entity, reason) 17 | ) 18 | 19 | 20 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Missing') 21 | GO 22 | INSERT INTO {{out_schema}}.table01 WITH (TABLOCKX) 23 | SELECT DISTINCT raw01.entity entity 24 | , 
'Missing in raw01' reason 25 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 26 | LEFT JOIN ( 27 | SELECT DISTINCT entity 28 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 29 | ) raw01x 30 | ON raw01.entity = raw01x.entity 31 | WHERE raw01.end_date = '9999-01-01' 32 | AND raw01x.entity IS NULL 33 | 34 | 35 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - more missing in raw01') 36 | GO 37 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX) 38 | SELECT 39 | raw01.entity entity 40 | , 'missing' reason 41 | FROM {{in_schema}}.raw01 raw01 WITH(NOLOCK) 42 | GROUP BY raw01.entity 43 | HAVING MAX(raw01.end_date) < '9999-01-01' 44 | 45 | 46 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Inconsistency correction') 47 | GO 48 | WITH entity_ids AS ( 49 | SELECT DISTINCT raw01.entity entity 50 | FROM {{in_schema}}.raw01 raw01 WITH (NOLOCK) 51 | INNER JOIN ( -- filter 52 | SELECT entity 53 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 54 | WHERE end_date = '9999-01-01' 55 | ) raw01_final 56 | ON raw01.entity = raw01_final.entity 57 | WHERE 1=1 58 | ) 59 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX) 60 | SELECT x.entity 61 | , 'Inconsistency correction' reason 62 | FROM entity_ids x 63 | INNER JOIN entity_ids y 64 | ON x.entity = y.entity 65 | WHERE x.entity <> y.entity 66 | GROUP BY x.entity 67 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql/prep/more_tables.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | USE master 4 | GO 5 | 6 | /* 7 | SECTION: raw01A 8 | */ 9 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01A') 10 | GO 11 | CREATE TABLE {{out_schema}}.raw01A( 12 | entity VARCHAR(17) NOT NULL 13 | , start_date DATE NOT NULL 14 | , end_date DATE NOT NULL 15 | , PRIMARY KEY(entity, start_date) 16 | ) 17 | INSERT INTO {{out_schema}}.raw01A WITH(TABLOCKX) 18 | SELECT apgs.entity entity 19 | , apgs.start_date start_date 20 | , apgs.end_date end_date 21 | FROM ( 22 | SELECT entity 23 | , start_date 24 | , end_date 25 | FROM {{in_schema}}.raw01 apgs WITH(NOLOCK) 26 | ) apgs 27 | INNER JOIN ( 28 | SELECT DISTINCT entity 29 | FROM {{in_schema}}.raw01 WITH(NOLOCK) 30 | ) base 31 | ON apgs.entity = base.entity 32 | CREATE INDEX raw_start_date ON {{out_schema}}.raw01A (start_date DESC) 33 | CREATE INDEX raw_start_date_end_date ON {{out_schema}}.raw01A (end_date, start_date DESC) 34 | 35 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql/raw/raw_views.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | USE {{out_database}} -- needed for views 4 | GO 5 | 6 | SELECT 1000000 as entity_nr, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.schema00_raw01_table 7 | GO 8 | SELECT '1' as mod_type, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.filter_table 9 | GO 10 | 11 | /* 12 | SECTION: SAMPLING 13 | */ 14 | GO 15 | DECLARE @START BIGINT = 0 + (SELECT CAST(MIN(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table); 16 | DECLARE @END BIGINT = (SELECT CAST(MAX(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table); 17 | DECLARE @STEP INT = {{helper_schema}}.get_db_sampling_factor(); 18 | DROP TABLE IF 
EXISTS {{out_schema}}.sample_entities; 19 | WITH L0 AS (SELECT c FROM (SELECT 1 UNION ALL SELECT 1) AS D(c)), -- 2^1 20 | L1 AS (SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B), -- 2^2 21 | L2 AS (SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B), -- 2^4 22 | L3 AS (SELECT 1 AS c FROM L2 AS A CROSS JOIN L2 AS B), -- 2^8 23 | L4 AS (SELECT 1 AS c FROM L3 AS A CROSS JOIN L3 AS B), -- 2^16 24 | L5 AS (SELECT 1 AS c FROM L4 AS A CROSS JOIN L4 AS B), -- 2^32 25 | Nums AS (SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS k FROM L5) 26 | SELECT k * @STEP + @START AS nr 27 | INTO {{out_schema}}.sample_entities 28 | FROM nums 29 | WHERE k <= (@END - @START) / @STEP 30 | CREATE UNIQUE CLUSTERED INDEX nr_index ON {{out_schema}}.sample_entities (nr) WITH ( FILLFACTOR = 100, DATA_COMPRESSION = ROW ); 31 | 32 | 33 | /* 34 | SECTION: Raw-Tables 35 | */ 36 | GO 37 | PRINT (CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01') 38 | DROP VIEW IF EXISTS {{out_schema_only}}.raw01 39 | GO 40 | CREATE VIEW {{out_schema_only}}.raw01 41 | AS 42 | SELECT entity_nr entity 43 | , start_date start_date 44 | , end_date end_date 45 | FROM dbo.schema00_raw01_table WITH (NOLOCK) 46 | INNER JOIN sample_entities WITH (NOLOCK) 47 | ON entity_nr = sample_entities.nr 48 | 49 | 50 | /* 51 | SECTION: Reference tables 52 | */ 53 | 54 | GO 55 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.fm_mod_type') 56 | DROP VIEW IF EXISTS {{out_schema_only}}.fm_mod_type 57 | GO 58 | CREATE VIEW {{out_schema_only}}.fm_mod_type 59 | AS 60 | SELECT mod_type x_inv_type 61 | , start_date start_date 62 | , end_date end_date 63 | FROM dbo.filter_table WITH(NOLOCK) 64 | GO 65 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/create_db_helpers.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | USE {{out_database}} 3 | GO 4 | 5 | 6 | /* 7 | Section: Procedures 8 | */ 9 | IF OBJECT_ID(N'dbo.CREATEALLDATES', N'P') IS NOT NULL DROP PROCEDURE dbo.CREATEALLDATES; 10 | GO 11 | CREATE PROCEDURE CREATEALLDATES 12 | ( 13 | @StartDate AS DATE, @EndDate AS DATE 14 | ) AS 15 | DECLARE @Current AS DATE = DATEADD(DD, 0, @StartDate); DROP TABLE IF EXISTS ##alldates CREATE TABLE ##alldates ( 16 | dt DATE PRIMARY KEY 17 | ) WHILE @Current <= @EndDate BEGIN 18 | INSERT INTO ##alldates 19 | VALUES (@Current); 20 | SET @Current = DATEADD(DD, 1, @Current) -- add 1 to current day 21 | END 22 | GO 23 | 24 | 25 | /* 26 | Section: Functions 27 | */ 28 | -- IF OBJECT_ID(N'dbo.get_db_sampling_factor', N'FN') IS NOT NULL DROP FUNCTION get_db_sampling_factor; 29 | -- GO 30 | CREATE FUNCTION dbo.get_db_sampling_factor () RETURNS INT AS 31 | BEGIN 32 | DECLARE @sampling_rate INT; 33 | SELECT @sampling_rate = ISNULL(TRY_CAST(RIGHT(DB_NAME(), LEN(DB_NAME()) - CHARINDEX('_m', DB_NAME()) - 1) AS INT), 34 | 1 -- fallback: take full sample 35 | ); 36 | RETURN @sampling_rate 37 | END; -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/prep/entity_checks.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | USE master -- no default schema 4 | GO 5 | 6 | CREATEALLDATES('2022-01-01', '2023-01-01') 7 | 8 | SELECT * INTO {{out_schema}}.dummy_dates FROM ##alldates 9 | 10 | DROP 
TABLE IF EXISTS {{out_schema}}.table01 11 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Create table') 12 | GO 13 | CREATE TABLE {{out_schema}}.table01 ( 14 | entity VARCHAR(17) NOT NULL 15 | , reason VARCHAR(50) NOT NULL 16 | PRIMARY KEY (entity, reason) 17 | ) 18 | 19 | 20 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Missing') 21 | GO 22 | INSERT INTO {{out_schema}}.table01 WITH (TABLOCKX) 23 | SELECT DISTINCT raw01.entity entity 24 | , 'Missing in raw01' reason 25 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 26 | LEFT JOIN ( 27 | SELECT DISTINCT entity 28 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 29 | ) raw01x 30 | ON raw01.entity = raw01x.entity 31 | WHERE raw01.end_date = '9999-01-01' 32 | AND raw01x.entity IS NULL 33 | 34 | 35 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - more missing in raw01') 36 | GO 37 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX) 38 | SELECT 39 | raw01.entity entity 40 | , 'missing' reason 41 | FROM {{in_schema}}.raw01 raw01 WITH(NOLOCK) 42 | GROUP BY raw01.entity 43 | HAVING MAX(raw01.end_date) < '9999-01-01' 44 | 45 | 46 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Inconsistency correction') 47 | GO 48 | WITH entity_ids AS ( 49 | SELECT DISTINCT raw01.entity entity 50 | FROM {{in_schema}}.raw01 raw01 WITH (NOLOCK) 51 | INNER JOIN ( -- filter 52 | SELECT entity 53 | FROM {{in_schema}}.raw01 WITH (NOLOCK) 54 | WHERE end_date = '9999-01-01' 55 | ) raw01_final 56 | ON raw01.entity = raw01_final.entity 57 | WHERE 1=1 58 | ) 59 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX) 60 | SELECT x.entity 61 | , 'Inconsistency correction' reason 62 | FROM entity_ids x 63 | INNER JOIN entity_ids y 64 | ON x.entity = y.entity 65 | WHERE x.entity <> y.entity 66 | GROUP BY x.entity 67 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/prep/more_tables.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | USE master 4 | GO 5 | 6 | /* 7 | SECTION: raw01A 8 | */ 9 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01A') 10 | GO 11 | CREATE TABLE {{out_schema}}.raw01A( 12 | entity VARCHAR(17) NOT NULL 13 | , start_date DATE NOT NULL 14 | , end_date DATE NOT NULL 15 | , PRIMARY KEY(entity, start_date) 16 | ) 17 | INSERT INTO {{out_schema}}.raw01A WITH(TABLOCKX) 18 | SELECT apgs.entity entity 19 | , apgs.start_date start_date 20 | , apgs.end_date end_date 21 | FROM ( 22 | SELECT entity 23 | , start_date 24 | , end_date 25 | FROM {{in_schema}}.raw01 apgs WITH(NOLOCK) 26 | ) apgs 27 | INNER JOIN ( 28 | SELECT DISTINCT entity 29 | FROM {{in_schema}}.raw01 WITH(NOLOCK) 30 | ) base 31 | ON apgs.entity = base.entity 32 | CREATE INDEX raw_start_date ON {{out_schema}}.raw01A (start_date DESC) 33 | CREATE INDEX raw_start_date_end_date ON {{out_schema}}.raw01A (end_date, start_date DESC) 34 | -------------------------------------------------------------------------------- /tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/raw/raw_views.sql: -------------------------------------------------------------------------------- 1 | -- This is intentionally crazy TSQL code similar to code "found in the wild" 2 | 3 | USE {{out_database}} -- needed for views 4 | GO 5 | 6 | SELECT 1000000 as entity_nr, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.schema00_raw01_table 
7 | GO 8 | SELECT '1' as mod_type, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.filter_table 9 | GO 10 | 11 | /* 12 | SECTION: SAMPLING 13 | */ 14 | GO 15 | DECLARE @START BIGINT = 0 + (SELECT CAST(MIN(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table); 16 | DECLARE @END BIGINT = (SELECT CAST(MAX(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table); 17 | DECLARE @STEP INT = {{helper_schema}}.get_db_sampling_factor(); 18 | DROP TABLE IF EXISTS {{out_schema}}.sample_entities; 19 | WITH L0 AS (SELECT c FROM (SELECT 1 UNION ALL SELECT 1) AS D(c)), -- 2^1 20 | L1 AS (SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B), -- 2^2 21 | L2 AS (SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B), -- 2^4 22 | L3 AS (SELECT 1 AS c FROM L2 AS A CROSS JOIN L2 AS B), -- 2^8 23 | L4 AS (SELECT 1 AS c FROM L3 AS A CROSS JOIN L3 AS B), -- 2^16 24 | L5 AS (SELECT 1 AS c FROM L4 AS A CROSS JOIN L4 AS B), -- 2^32 25 | Nums AS (SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS k FROM L5) 26 | SELECT k * @STEP + @START AS nr 27 | INTO {{out_schema}}.sample_entities 28 | FROM nums 29 | WHERE k <= (@END - @START) / @STEP 30 | CREATE UNIQUE CLUSTERED INDEX nr_index ON {{out_schema}}.sample_entities (nr) WITH ( FILLFACTOR = 100, DATA_COMPRESSION = ROW ); 31 | 32 | 33 | /* 34 | SECTION: Raw-Tables 35 | */ 36 | GO 37 | PRINT (CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01') 38 | DROP VIEW IF EXISTS {{out_schema_only}}.raw01 39 | GO 40 | CREATE VIEW {{out_schema_only}}.raw01 41 | AS 42 | SELECT entity_nr entity 43 | , start_date start_date 44 | , end_date end_date 45 | FROM dbo.schema00_raw01_table WITH (NOLOCK) 46 | INNER JOIN sample_entities WITH (NOLOCK) 47 | ON entity_nr = sample_entities.nr 48 | 49 | 50 | /* 51 | SECTION: Reference tables 52 | */ 53 | 54 | GO 55 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.fm_mod_type') 56 | DROP VIEW IF EXISTS {{out_schema_only}}.fm_mod_type 57 | GO 58 | CREATE VIEW {{out_schema_only}}.fm_mod_type 59 | AS 60 | SELECT mod_type x_inv_type 61 | , start_date start_date 62 | , end_date end_date 63 | FROM dbo.filter_table WITH(NOLOCK) 64 | GO 65 | -------------------------------------------------------------------------------- /tests/test_flows/sql_scripts/script1-db2.sql: -------------------------------------------------------------------------------- 1 | SELECT 12 AS coltab1 FROM SYSIBM.SYSDUMMY1 2 | -------------------------------------------------------------------------------- /tests/test_flows/sql_scripts/script1.sql: -------------------------------------------------------------------------------- 1 | SELECT 12 AS coltab1 2 | -------------------------------------------------------------------------------- /tests/test_flows/sql_scripts/script2.sql: -------------------------------------------------------------------------------- 1 | SELECT COLTAB1 + 12 AS coltab2 2 | FROM {{dependent}} 3 | -------------------------------------------------------------------------------- /tests/test_flows/test_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from example.run_pipeline import main as example_flow_main 6 | from example.simple_pipeline import main as simple_pipeline_main 7 | from example.visualization import main as visualization_main 8 | from example_imperative.run_pipeline import main as example_imperative_flow_main 9 | from example_interactive.run_tasks_interactively import main as example_interactive_main 10 | from 
example_postgres.run_pipeline import main as example_postgres_flow_main 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "fn", 15 | [ 16 | example_flow_main, 17 | simple_pipeline_main, 18 | visualization_main, 19 | example_imperative_flow_main, 20 | example_postgres_flow_main, 21 | example_interactive_main, 22 | ], 23 | ) 24 | def test_examples(fn): 25 | """ 26 | This test just runs the example pipeline that we provide in example/run_pipeline.py 27 | """ 28 | 29 | fn() 30 | -------------------------------------------------------------------------------- /tests/test_flows/test_flow.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import sqlalchemy as sa 5 | from pandas.testing import assert_frame_equal 6 | 7 | from pydiverse.pipedag import Blob, Flow, Stage, Table, materialize 8 | from pydiverse.pipedag.context import StageLockContext 9 | 10 | dfA = pd.DataFrame( 11 | { 12 | "a": [0, 1, 2, 4], 13 | "b": [9, 8, 7, 6], 14 | } 15 | ) 16 | 17 | dfB = pd.DataFrame( 18 | { 19 | "a": [2, 1, 0, 1], 20 | "x": [1, 1, 2, 2], 21 | } 22 | ) 23 | 24 | 25 | @materialize(nout=2, version="1") 26 | def inputs(): 27 | import time 28 | 29 | time.sleep(1) 30 | return Table(dfA, "dfA"), Table(dfB, "dfB_%%") 31 | 32 | 33 | @materialize(input_type=pd.DataFrame) 34 | def double_values(df: pd.DataFrame): 35 | return Table(df.transform(lambda x: x * 2)) 36 | 37 | 38 | @materialize(input_type=sa.Table, lazy=True) 39 | def join_on_a(left: sa.sql.expression.Alias, right: sa.sql.expression.Alias): 40 | return Table(left.select().join(right, left.c.a == right.c.a)) 41 | 42 | 43 | @materialize(input_type=pd.DataFrame) 44 | def list_arg(x: list[pd.DataFrame]): 45 | assert isinstance(x[0], pd.DataFrame) 46 | return Blob(x) 47 | 48 | 49 | @materialize 50 | def blob_task(x, y): 51 | return Blob(x), Blob(y) 52 | 53 | 54 | def test_simple_flow(with_blob=True): 55 | with Flow() as flow: 56 | with Stage("simple_flow_stage1"): 57 | inp = inputs() 58 | a, b = inp 59 | 60 | a2 = double_values(a) 61 | b2 = double_values(b) 62 | b4 = double_values(b2) 63 | b4 = double_values(b4) 64 | x = list_arg([a2, b, b4]) 65 | 66 | with Stage("simple_flow_stage2"): 67 | joined = join_on_a(a2, b4) 68 | joined_times_2 = double_values(joined) 69 | 70 | if with_blob: 71 | v = blob_task(x, x) 72 | v = blob_task(v, v) 73 | v = blob_task(v, v) 74 | 75 | blob_tuple = blob_task(1, 2) 76 | 77 | with StageLockContext(): 78 | result = flow.run() # this will use the default configuration instance=__any__ 79 | assert result.successful 80 | 81 | # Check result.get works 82 | res_a = result.get(a, as_type=pd.DataFrame) 83 | res_b = result.get(b, as_type=pd.DataFrame) 84 | res_inp = result.get(inp, as_type=pd.DataFrame) 85 | res_joined = result.get(joined, as_type=pd.DataFrame) 86 | res_joined_times_2 = result.get(joined_times_2, as_type=pd.DataFrame) 87 | 88 | assert_frame_equal(res_a, dfA, check_dtype=False) 89 | assert_frame_equal(res_b, dfB, check_dtype=False) 90 | assert_frame_equal(res_inp[0], dfA, check_dtype=False) 91 | assert_frame_equal(res_inp[1], dfB, check_dtype=False) 92 | assert_frame_equal(res_joined * 2, res_joined_times_2) 93 | 94 | result.get(x) 95 | if with_blob: 96 | result.get(v) 97 | assert tuple(result.get(blob_tuple)) == (1, 2) 98 | 99 | 100 | if __name__ == "__main__": 101 | test_simple_flow() 102 | -------------------------------------------------------------------------------- /tests/test_flows/test_simple_flow.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import sqlalchemy as sa 5 | 6 | from pydiverse.pipedag import Flow, Stage, Table, materialize 7 | from tests.fixtures.instances import ( 8 | DATABASE_INSTANCES, 9 | ORCHESTRATION_INSTANCES, 10 | with_instances, 11 | ) 12 | 13 | 14 | @materialize(nout=2, version="1.1") 15 | def inputs(): 16 | df_a = pd.DataFrame( 17 | { 18 | "a": [0, 1, 2, 4], 19 | "b": [9, 8, 7, 6], 20 | } 21 | ) 22 | 23 | df_b = pd.DataFrame( 24 | { 25 | "a": [2, 1, 0, 1], 26 | "x": [1, 1, 2, 2], 27 | } 28 | ) 29 | return Table(df_a, "dfA", primary_key=["a"]), Table(df_b, "dfB") 30 | 31 | 32 | @materialize(input_type=pd.DataFrame, version="1.0") 33 | def double_values(df: pd.DataFrame): 34 | df["a"] = df["a"] * 2 35 | return Table(df) 36 | 37 | 38 | @materialize(input_type=sa.Table, lazy=True) 39 | def join_on_a(left: sa.sql.expression.Alias, right: sa.sql.expression.Alias): 40 | return Table(left.select().join(right, left.c.a == right.c.a)) 41 | 42 | 43 | # noinspection PyTypeChecker 44 | def get_flow(): 45 | with Flow() as flow: 46 | with Stage("simple_flow_stage1"): 47 | a, b = inputs() 48 | a2 = double_values(a) 49 | 50 | with Stage("simple_flow_stage2"): 51 | b2 = double_values(b) 52 | joined = join_on_a(a2, b2) 53 | _ = joined 54 | return flow 55 | 56 | 57 | @with_instances(DATABASE_INSTANCES, ORCHESTRATION_INSTANCES) 58 | def test_simple_flow(): 59 | flow = get_flow() 60 | result = flow.run() 61 | assert result.successful 62 | 63 | 64 | if __name__ == "__main__": 65 | test_simple_flow() 66 | -------------------------------------------------------------------------------- /tests/test_flows/test_source_invalidation.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | 5 | from pydiverse.pipedag import Flow, Stage, Table, materialize 6 | from pydiverse.pipedag.context import StageLockContext 7 | from pydiverse.pipedag.context.context import CacheValidationMode 8 | 9 | dfA_source = pd.DataFrame( 10 | { 11 | "a": [0, 1, 2, 4], 12 | "b": [9, 8, 7, 6], 13 | } 14 | ) 15 | dfA = dfA_source.copy() 16 | input_hash = hash(str(dfA)) 17 | 18 | 19 | def has_new_input(dummy_arg): 20 | """Returns whether new input is available via input hash. 
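    The returned value acts as a cache fingerprint for this source task: as
    this test exercises further below, the cached task result is reused for as
    long as this function keeps returning the same value, and the task is
    re-executed once the returned value changes (unless the run is started
    with CacheValidationMode.IGNORE_FRESH_INPUT).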
21 | 22 | :param dummy_arg: Argument used to test that custom cache invalidation function 23 | gets same arguments as task function 24 | :return: hash value of input (stored hash must not exactly be input hash) 25 | """ 26 | assert dummy_arg == "irrelevant" 27 | global input_hash 28 | return input_hash 29 | 30 | 31 | # noinspection DuplicatedCode 32 | @materialize(nout=2, cache=has_new_input, version="1.0") 33 | def input_task(dummy_arg): 34 | global dfA 35 | return Table(dfA, "dfA"), Table(dfA, "dfB") 36 | 37 | 38 | @materialize(input_type=pd.DataFrame, version="1.0") 39 | def double_values(df: pd.DataFrame): 40 | return Table(df.transform(lambda x: x * 2)) 41 | 42 | 43 | # noinspection PyTypeChecker 44 | def get_flow(): 45 | with Flow("FLOW") as flow: 46 | with Stage("stage_1"): 47 | dummy_arg = "irrelevant" 48 | a, b = input_task(dummy_arg) 49 | a2 = double_values(a) 50 | 51 | with Stage("stage_2"): 52 | b2 = double_values(b) 53 | a3 = double_values(a2) 54 | 55 | return flow, b2, a3 56 | 57 | 58 | def test_source_invalidation(): 59 | # trigger reload of input data 60 | global dfA 61 | global input_hash 62 | 63 | flow, out1, out2 = get_flow() 64 | 65 | with StageLockContext(): 66 | result = flow.run() 67 | assert result.successful 68 | 69 | v_out1, v_out2 = result.get(out1), result.get(out2) 70 | pd.testing.assert_frame_equal(dfA_source * 2, v_out1, check_dtype=False) 71 | pd.testing.assert_frame_equal(dfA_source * 4, v_out2, check_dtype=False) 72 | 73 | # modify input without updating input hash => cached version is used 74 | dfA["a"] = 10 + dfA_source["a"] 75 | 76 | # this run should work from caches and not change outputs 77 | with StageLockContext(): 78 | result = flow.run() 79 | assert result.successful 80 | 81 | v_out1, v_out2 = result.get(out1), result.get(out2) 82 | pd.testing.assert_frame_equal(dfA_source * 2, v_out1, check_dtype=False) 83 | pd.testing.assert_frame_equal(dfA_source * 4, v_out2, check_dtype=False) 84 | 85 | # update input hash trigger reload of new input data 86 | input_hash = hash(str(dfA)) 87 | 88 | with StageLockContext(): 89 | # this run should ignore fresh input at source nodes and not change outputs 90 | result = flow.run(cache_validation_mode=CacheValidationMode.IGNORE_FRESH_INPUT) 91 | assert result.successful 92 | 93 | v_out1, v_out2 = result.get(out1), result.get(out2) 94 | pd.testing.assert_frame_equal(dfA_source * 2, v_out1, check_dtype=False) 95 | pd.testing.assert_frame_equal(dfA_source * 4, v_out2, check_dtype=False) 96 | 97 | with StageLockContext(): 98 | result = flow.run() 99 | assert result.successful 100 | 101 | v_out1, v_out2 = result.get(out1), result.get(out2) 102 | 103 | pd.testing.assert_frame_equal(dfA * 2, v_out1, check_dtype=False) 104 | pd.testing.assert_frame_equal(dfA * 4, v_out2, check_dtype=False) 105 | 106 | 107 | if __name__ == "__main__": 108 | test_source_invalidation() 109 | -------------------------------------------------------------------------------- /tests/test_flows/test_sql_text_node.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | import sqlalchemy as sa 7 | 8 | from pydiverse.pipedag import ConfigContext, Flow, Stage, Table, materialize 9 | from tests.fixtures.instances import with_instances 10 | 11 | 12 | @materialize(input_type=sa.Table, lazy=True) 13 | def table_1(script_path: str): 14 | sql = Path(script_path).read_text(encoding="utf-8") 15 | return Table(sa.text(sql), 
name="table_1") 16 | 17 | 18 | @materialize(input_type=sa.Table, lazy=True) 19 | def table_2(script_path: str, dependent_table: Table): 20 | sql = ( 21 | Path(script_path) 22 | .read_text(encoding="utf-8") 23 | .replace("{{dependent}}", str(dependent_table.original)) 24 | ) 25 | return Table(sa.text(sql), name="test_table2") 26 | 27 | 28 | @materialize(input_type=pd.DataFrame, lazy=True) 29 | def assert_result(df: pd.DataFrame): 30 | pd.testing.assert_frame_equal( 31 | df, pd.DataFrame({"coltab2": [24]}), check_dtype=False 32 | ) 33 | 34 | 35 | @with_instances("postgres", "mssql", "ibm_db2", per_user=True) 36 | def test_sql_node(): 37 | instance_name = ConfigContext.get().instance_name 38 | 39 | script_1_name = { 40 | "ibm_db2": "script1-db2.sql", 41 | }.get(instance_name, "script1.sql") 42 | script_2_name = "script2.sql" 43 | 44 | with Flow("FLOW") as flow: 45 | with Stage("schema1"): 46 | parent_dir = Path(__file__).parent 47 | tab1 = table_1(str(parent_dir / "sql_scripts" / script_1_name)) 48 | tab2 = table_2(str(parent_dir / "sql_scripts" / script_2_name), tab1) 49 | assert_result(tab2) 50 | 51 | flow_result = flow.run() 52 | assert flow_result.successful 53 | 54 | 55 | if __name__ == "__main__": 56 | test_sql_node() 57 | -------------------------------------------------------------------------------- /tests/test_indexes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from pydiverse.pipedag import Flow, Stage 6 | 7 | # Parameterize all tests in this file with several instance_id configurations 8 | from tests.fixtures.instances import DATABASE_INSTANCES, with_instances 9 | from tests.util import tasks_library as m 10 | from tests.util import tasks_library_imperative as m2 11 | 12 | pytestmark = [with_instances(DATABASE_INSTANCES)] 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "task", 17 | [ 18 | m.simple_dataframe, 19 | m.simple_dataframe_with_pk, 20 | m.simple_dataframe_with_pk2, 21 | m.simple_dataframe_with_index, 22 | m.simple_dataframe_with_indexes, 23 | m.simple_dataframes_with_indexes, 24 | m.simple_lazy_table, 25 | m.simple_lazy_table_with_pk, 26 | m.simple_lazy_table_with_pk2, 27 | m.simple_lazy_table_with_index, 28 | m.simple_lazy_table_with_indexes, 29 | m2.simple_lazy_table, 30 | m2.simple_lazy_table_with_pk, 31 | ], 32 | ) 33 | def test_materialize_table_with_indexes(task): 34 | with Flow("flow") as f: 35 | with Stage("stage"): 36 | x = task() 37 | 38 | m.assert_table_equal(x, x) 39 | m.check_pk_length(x) 40 | 41 | assert f.run().successful 42 | -------------------------------------------------------------------------------- /tests/test_raw_sql/scripts/mssql/create_tables/simple_tables.sql: -------------------------------------------------------------------------------- 1 | SELECT 1 as x, 1 as y INTO {{out_schema}}.table_1; 2 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 2); 3 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 3); 4 | GO 5 | 6 | SELECT 1 as x, 1 as y INTO {{out_schema}}.table_2; 7 | INSERT INTO {{out_schema}}.table_2 VALUES (2, 2); 8 | INSERT INTO {{out_schema}}.table_2 VALUES (3, 3); 9 | GO 10 | -------------------------------------------------------------------------------- /tests/test_raw_sql/scripts/mssql/schema_swap/check_objects.sql: -------------------------------------------------------------------------------- 1 | -- Test that table exists 2 | SELECT 1 FROM {{in_schema}}.t; 3 | GO 4 | 5 | -- Test that view exists 6 | SELECT 1 FROM 
{{in_schema}}.v; 7 | GO 8 | 9 | -- Test that procedure exists 10 | {{in_schema}}.p 1; 11 | GO 12 | 13 | -- Test that function exists 14 | SELECT ({{in_schema}}.f (1, 2)); 15 | GO 16 | 17 | -------------------------------------------------------------------------------- /tests/test_raw_sql/scripts/mssql/schema_swap/create_objects.sql: -------------------------------------------------------------------------------- 1 | -- Create a table 2 | SELECT 1 as x, 2 as y INTO {{out_schema}}.t; 3 | GO 4 | 5 | -- Create a view 6 | CREATE VIEW {{out_schema}}.v AS SELECT * FROM t; 7 | GO 8 | 9 | -- Create a procedure 10 | CREATE PROC {{out_schema}}.p(@id INT) AS 11 | BEGIN 12 | SELECT * 13 | FROM t 14 | WHERE x = @id 15 | END; 16 | GO 17 | 18 | -- Create a function 19 | CREATE FUNCTION {{out_schema}}.f(@x INT, @y INT) 20 | RETURNS INT 21 | AS 22 | BEGIN 23 | RETURN (@x + @y) 24 | END; 25 | GO -------------------------------------------------------------------------------- /tests/test_raw_sql/scripts/postgres/create_tables/simple_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {{out_schema}}.table_1 AS SELECT 1 as x, 1 as y; 2 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 2); 3 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 3); 4 | 5 | CREATE TABLE {{out_schema}}.table_2 AS SELECT 1 as x, 1 as y; 6 | INSERT INTO {{out_schema}}.table_2 VALUES (2, 2); 7 | INSERT INTO {{out_schema}}.table_2 VALUES (3, 3); 8 | -------------------------------------------------------------------------------- /tests/test_raw_sql/test_raw_sql_input.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from pydiverse.pipedag import Flow, Stage, materialize 8 | from pydiverse.pipedag.context import ConfigContext 9 | from tests.fixtures.instances import with_instances 10 | from tests.test_raw_sql.util import sql_script 11 | 12 | 13 | @materialize(input_type=pd.DataFrame) 14 | def raw_sql_object(raw_sql): 15 | df_1 = raw_sql["table_1"] 16 | df_2 = raw_sql["table_2"] 17 | 18 | assert not df_1.empty 19 | assert not df_2.empty 20 | 21 | 22 | @materialize(input_type=pd.DataFrame) 23 | def raw_sql_individual_table(df_1): 24 | assert not df_1.empty 25 | 26 | 27 | @with_instances("postgres", "mssql") 28 | def test_raw_sql_task_input(): 29 | instance_name = ConfigContext.get().instance_name 30 | dir_ = Path(__file__).parent / "scripts" / instance_name / "create_tables" 31 | 32 | with Flow() as f: 33 | with Stage("raw_0"): 34 | simple_tables = sql_script("simple_tables.sql", dir_) 35 | 36 | raw_sql_object(simple_tables) 37 | raw_sql_individual_table(simple_tables["table_1"]) 38 | raw_sql_individual_table(simple_tables["table_2"]) 39 | 40 | f.run() 41 | f.run() 42 | -------------------------------------------------------------------------------- /tests/test_raw_sql/test_raw_sql_schema_swap.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | from pydiverse.pipedag import Flow, Stage 6 | from pydiverse.pipedag.context import ConfigContext, FinalTaskState 7 | from tests.fixtures.instances import with_instances 8 | from tests.test_raw_sql.util import sql_script 9 | 10 | 11 | # TODO: Extend tests for other backends 12 | @with_instances("mssql") 13 | def test_raw_sql_schema_swap(): 14 | # This test creates various different objects in one 
schema and then 15 | # checks whether, after swapping the schema, they are still working correctly. 16 | 17 | instance_name = ConfigContext.get().instance_name 18 | dir_ = Path(__file__).parent / "scripts" / instance_name / "schema_swap" 19 | 20 | with Flow() as f: 21 | with Stage("raw_0") as raw_0: 22 | sql_1 = sql_script("create_objects.sql", dir_) 23 | with Stage("raw_1"): 24 | sql_2 = sql_script( 25 | "check_objects.sql", dir_, input_stage=raw_0, depend=[sql_1] 26 | ) 27 | 28 | f.run() 29 | 30 | # Check that running the flow again results in the cache being used 31 | for _ in range(2): 32 | result = f.run() 33 | assert result.task_states[sql_1] == FinalTaskState.CACHE_VALID 34 | assert result.task_states[sql_2] == FinalTaskState.CACHE_VALID 35 | -------------------------------------------------------------------------------- /tests/test_raw_sql/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import sqlalchemy as sa 6 | 7 | from pydiverse.pipedag import Stage, materialize 8 | from pydiverse.pipedag.container import RawSql 9 | from pydiverse.pipedag.context import ConfigContext, TaskContext 10 | 11 | 12 | @materialize(input_type=sa.Table, lazy=True) 13 | def sql_script( 14 | name: str, 15 | script_directory: Path, 16 | *, 17 | input_stage=None, 18 | depend=None, 19 | ): 20 | _ = depend  # only relevant for adding additional task dependency 21 | stage = TaskContext.get().task.stage 22 | 23 | script_path = script_directory / name 24 | sql = Path(script_path).read_text(encoding="utf-8") 25 | sql = raw_sql_bind_schema(sql, "out_", stage, transaction=True) 26 | sql = raw_sql_bind_schema(sql, "in_", input_stage) 27 | return RawSql(sql) 28 | 29 | 30 | def raw_sql_bind_schema( 31 | sql, prefix: str, stage: Stage | RawSql | None, *, transaction=False 32 | ): 33 | config = ConfigContext.get() 34 | store = config.store.table_store 35 | if stage is not None: 36 | stage_name = stage.transaction_name if transaction else stage.name 37 | schema_name = store.get_schema(stage_name).get() 38 | sql = sql.replace(f"{{{{{prefix}schema}}}}", schema_name) 39 | return sql 40 | -------------------------------------------------------------------------------- /tests/test_sql_ddl.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydiverse.pipedag.backend.table.sql.ddl import insert_into_in_query 4 | 5 | 6 | def test_insert_into(): 7 | test_pairs = { 8 | "Select 1": "Select 1 INTO a.b", 9 | "Select 1 as _from": "Select 1 as _from INTO a.b", 10 | "Select 1 as afrom": "Select 1 as afrom INTO a.b", 11 | "Select 1 WHERE TRUE": "Select 1 INTO a.b WHERE TRUE", 12 | "Select 1 GROUP\nBY x": "Select 1 INTO a.b GROUP\nBY x", 13 | "Select 1 FROM A GROUP BY x": "Select 1 INTO a.b FROM A GROUP BY x", 14 | "Select 1 UNION ALL SELECT 2": "Select 1 INTO a.b UNION ALL SELECT 2", 15 | "Select 1 From X": "Select 1 INTO a.b From X", 16 | "Select (SELECT 1 FROM Y) From X": "Select (SELECT 1 FROM Y) INTO a.b From X", 17 | "Select (SELECT (SELECT 1 FROM Z) FROM Y) From X": ( 18 | "Select (SELECT (SELECT 1 FROM Z) FROM Y) INTO a.b From X" 19 | ), 20 | "Select a.[from] from a": "Select a.[from] INTO a.b from a", 21 | "Select a.[ from ] from a": "Select a.[ from ] INTO a.b from a", 22 | 'Select "from" from a': 'Select "from" INTO a.b from a', 23 | } 24 | for raw_query, expected_query in test_pairs.items(): 25 | res = 
insert_into_in_query(raw_query, "a", "b") 26 | assert res == expected_query 27 | -------------------------------------------------------------------------------- /tests/test_sql_dialect/scripts/lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/test_sql_dialect/scripts/lock -------------------------------------------------------------------------------- /tests/test_sql_dialect/scripts/simple_nicknames.sql: -------------------------------------------------------------------------------- 1 | BEGIN 2 | IF EXISTS (SELECT * FROM SYSCAT.WRAPPERS WHERE WRAPNAME = 'DRDA') 3 | THEN EXECUTE IMMEDIATE 'DROP WRAPPER DRDA'; 4 | END IF; 5 | END| 6 | CREATE WRAPPER DRDA| 7 | CREATE SERVER remote_db TYPE DB2/LUW VERSION 11 WRAPPER DRDA 8 | AUTHORIZATION "db2inst1" PASSWORD "password" OPTIONS ( 9 | HOST '127.0.0.1', PORT '50000', DBNAME 'testdb' 10 | )| 11 | 12 | CREATE NICKNAME {{out_schema}}.nick1 FOR remote_db.{{out_schema}}.{{out_table}}| 13 | CREATE NICKNAME {{out_schema}}.nick2 FOR remote_db.{{out_schema}}.{{out_table}}| -------------------------------------------------------------------------------- /tests/test_sql_dialect/scripts/simple_table_spaces.sql: -------------------------------------------------------------------------------- 1 | BEGIN 2 | IF NOT EXISTS (SELECT * FROM SYSCAT.TABLESPACES WHERE TBSPACE = 'S1') 3 | THEN EXECUTE IMMEDIATE 'CREATE TABLESPACE S1'; 4 | END IF; 5 | END| 6 | BEGIN 7 | IF NOT EXISTS (SELECT * FROM SYSCAT.TABLESPACES WHERE TBSPACE = 'S2') 8 | THEN EXECUTE IMMEDIATE 'CREATE TABLESPACE S2'; 9 | END IF; 10 | END| 11 | BEGIN 12 | IF NOT EXISTS (SELECT * FROM SYSCAT.TABLESPACES WHERE TBSPACE = 'S3') 13 | THEN EXECUTE IMMEDIATE 'CREATE TABLESPACE S3'; 14 | END IF; 15 | END| 16 | -------------------------------------------------------------------------------- /tests/test_sql_dialect/test_postgres.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import uuid 4 | 5 | import pandas as pd 6 | import sqlalchemy as sa 7 | import structlog 8 | 9 | from pydiverse.pipedag import Flow, Stage, materialize 10 | from pydiverse.pipedag.context import ConfigContext 11 | from tests.fixtures.instances import with_instances 12 | 13 | 14 | @with_instances("postgres", "postgres_unlogged") 15 | def test_postgres_unlogged(): 16 | @materialize(version="1.0.0") 17 | def dataframe(manual_invalidate): 18 | _ = manual_invalidate 19 | return pd.DataFrame({"x": [1]}) 20 | 21 | @materialize(lazy=True) 22 | def sql_table(manual_invalidate): 23 | _ = manual_invalidate 24 | return sa.select(sa.literal(1).label("x")) 25 | 26 | @materialize(input_type=sa.Table) 27 | def get_relpersistence(table: sa.sql.expression.Alias): 28 | return sa.text( 29 | """ 30 | SELECT relpersistence 31 | FROM pg_class 32 | LEFT JOIN pg_namespace ON pg_class.relnamespace = pg_namespace.oid 33 | WHERE nspname = :schema 34 | AND relname = :name 35 | """ 36 | ).bindparams( 37 | schema=str(table.original.schema), 38 | name=str(table.original.name), 39 | ) 40 | 41 | @materialize(input_type=pd.DataFrame) 42 | def assert_relpersistence(df: pd.DataFrame): 43 | relpersistence = ( 44 | "u" 45 | if ConfigContext.get() 46 | .store.table_store.materialization_details["__any__"] 47 | .unlogged 48 | else "p" 49 | ) 50 | assert df["relpersistence"][0] == relpersistence 51 | 52 | def get_flow(manual_invalidate, 
partial_invalidate): 53 | with Flow() as f: 54 | with Stage("stage"): 55 | df = dataframe(manual_invalidate) 56 | tbl = sql_table(manual_invalidate) 57 | # just to prevent 100% cache validity 58 | _ = sql_table(partial_invalidate) 59 | with Stage("check"): 60 | rp_df = get_relpersistence(df) 61 | rp_tbl = get_relpersistence(tbl) 62 | assert_relpersistence(rp_df) 63 | assert_relpersistence(rp_tbl) 64 | return f 65 | 66 | manual_invalidate = str(uuid.uuid4()) 67 | partial_invalidate = str(uuid.uuid4()) 68 | 69 | logger = structlog.get_logger("test_postgres_unlogged") 70 | logger.info("1st run") 71 | f = get_flow(manual_invalidate, partial_invalidate) 72 | f.run() 73 | 74 | logger.info("2nd run with 100% cache valid stage") 75 | f.run() 76 | 77 | logger.info("3rd run with partial cache invalid stage") 78 | partial_invalidate = str(uuid.uuid4()) 79 | f = get_flow(manual_invalidate, partial_invalidate) 80 | f.run() 81 | -------------------------------------------------------------------------------- /tests/test_table_hooks/lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/test_table_hooks/lock -------------------------------------------------------------------------------- /tests/test_table_hooks/test_dtype_polars.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pytest 6 | 7 | from pydiverse.pipedag.backend.table.util import DType 8 | 9 | pl = pytest.importorskip("polars") 10 | 11 | if TYPE_CHECKING: 12 | import polars as pl 13 | 14 | 15 | def test_dtype_from_polars(): 16 | def assert_conversion(type_, expected): 17 | assert DType.from_polars(type_) == expected 18 | 19 | assert_conversion(pl.Int64, DType.INT64) 20 | assert_conversion(pl.Int32, DType.INT32) 21 | assert_conversion(pl.Int16, DType.INT16) 22 | assert_conversion(pl.Int8, DType.INT8) 23 | 24 | assert_conversion(pl.UInt64, DType.UINT64) 25 | assert_conversion(pl.UInt32, DType.UINT32) 26 | assert_conversion(pl.UInt16, DType.UINT16) 27 | assert_conversion(pl.UInt8, DType.UINT8) 28 | 29 | assert_conversion(pl.Float64, DType.FLOAT64) 30 | assert_conversion(pl.Float32, DType.FLOAT32) 31 | 32 | assert_conversion(pl.Utf8, DType.STRING) 33 | assert_conversion(pl.Boolean, DType.BOOLEAN) 34 | 35 | assert_conversion(pl.Date, DType.DATE) 36 | assert_conversion(pl.Time, DType.TIME) 37 | assert_conversion(pl.Datetime, DType.DATETIME) 38 | assert_conversion(pl.Datetime("ms"), DType.DATETIME) 39 | assert_conversion(pl.Datetime("us"), DType.DATETIME) 40 | assert_conversion(pl.Datetime("ns"), DType.DATETIME) 41 | 42 | 43 | def test_dtype_to_polars(): 44 | def assert_conversion(type_: DType, expected): 45 | assert type_.to_polars() == expected 46 | 47 | assert_conversion(DType.INT64, pl.Int64) 48 | assert_conversion(DType.INT32, pl.Int32) 49 | assert_conversion(DType.INT16, pl.Int16) 50 | assert_conversion(DType.INT8, pl.Int8) 51 | 52 | assert_conversion(DType.UINT64, pl.UInt64) 53 | assert_conversion(DType.UINT32, pl.UInt32) 54 | assert_conversion(DType.UINT16, pl.UInt16) 55 | assert_conversion(DType.UINT8, pl.UInt8) 56 | 57 | assert_conversion(DType.FLOAT64, pl.Float64) 58 | assert_conversion(DType.FLOAT32, pl.Float32) 59 | 60 | assert_conversion(DType.STRING, pl.Utf8) 61 | assert_conversion(DType.BOOLEAN, pl.Boolean) 62 | 63 | assert_conversion(DType.DATE, pl.Date) 64 | 
assert_conversion(DType.TIME, pl.Time) 65 | assert_conversion(DType.DATETIME, pl.Datetime("us")) 66 | -------------------------------------------------------------------------------- /tests/test_table_hooks/test_dtype_pyarrow.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pyarrow as pa 4 | 5 | from pydiverse.pipedag.backend.table.util import DType 6 | 7 | 8 | def test_dtype_from_pyarrow(): 9 | def assert_conversion(type_, expected): 10 | assert DType.from_arrow(type_) == expected 11 | 12 | assert_conversion(pa.int64(), DType.INT64) 13 | assert_conversion(pa.int32(), DType.INT32) 14 | assert_conversion(pa.int16(), DType.INT16) 15 | assert_conversion(pa.int8(), DType.INT8) 16 | 17 | assert_conversion(pa.uint64(), DType.UINT64) 18 | assert_conversion(pa.uint32(), DType.UINT32) 19 | assert_conversion(pa.uint16(), DType.UINT16) 20 | assert_conversion(pa.uint8(), DType.UINT8) 21 | 22 | assert_conversion(pa.float64(), DType.FLOAT64) 23 | assert_conversion(pa.float32(), DType.FLOAT32) 24 | assert_conversion(pa.float16(), DType.FLOAT32) 25 | 26 | assert_conversion(pa.string(), DType.STRING) 27 | assert_conversion(pa.bool_(), DType.BOOLEAN) 28 | 29 | assert_conversion(pa.date32(), DType.DATE) 30 | assert_conversion(pa.date64(), DType.DATE) 31 | 32 | assert_conversion(pa.time32("s"), DType.TIME) 33 | assert_conversion(pa.time32("ms"), DType.TIME) 34 | assert_conversion(pa.time64("us"), DType.TIME) 35 | assert_conversion(pa.time64("ns"), DType.TIME) 36 | 37 | assert_conversion(pa.timestamp("s"), DType.DATETIME) 38 | assert_conversion(pa.timestamp("ms"), DType.DATETIME) 39 | assert_conversion(pa.timestamp("us"), DType.DATETIME) 40 | assert_conversion(pa.timestamp("ns"), DType.DATETIME) 41 | 42 | 43 | def test_dtype_to_pyarrow(): 44 | def assert_conversion(type_: DType, expected): 45 | assert type_.to_arrow() == expected 46 | 47 | assert_conversion(DType.INT64, pa.int64()) 48 | assert_conversion(DType.INT32, pa.int32()) 49 | assert_conversion(DType.INT16, pa.int16()) 50 | assert_conversion(DType.INT8, pa.int8()) 51 | 52 | assert_conversion(DType.UINT64, pa.uint64()) 53 | assert_conversion(DType.UINT32, pa.uint32()) 54 | assert_conversion(DType.UINT16, pa.uint16()) 55 | assert_conversion(DType.UINT8, pa.uint8()) 56 | 57 | assert_conversion(DType.FLOAT64, pa.float64()) 58 | assert_conversion(DType.FLOAT32, pa.float32()) 59 | 60 | assert_conversion(DType.STRING, pa.string()) 61 | assert_conversion(DType.BOOLEAN, pa.bool_()) 62 | 63 | assert_conversion(DType.DATE, pa.date32()) 64 | assert_conversion(DType.TIME, pa.time64("us")) 65 | assert_conversion(DType.DATETIME, pa.timestamp("us")) 66 | -------------------------------------------------------------------------------- /tests/test_table_hooks/test_dtype_sqlalchemy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sqlalchemy as sa 4 | 5 | from pydiverse.pipedag.backend.table.util import DType 6 | 7 | 8 | def test_dtype_from_sqlalchemy(): 9 | def assert_conversion(type_, expected): 10 | assert DType.from_sql(type_) == expected 11 | 12 | assert_conversion(sa.BigInteger(), DType.INT64) 13 | assert_conversion(sa.Integer(), DType.INT32) 14 | assert_conversion(sa.SmallInteger(), DType.INT16) 15 | 16 | assert_conversion(sa.Numeric(), DType.FLOAT64) 17 | assert_conversion(sa.Numeric(13, 2), DType.FLOAT64) 18 | assert_conversion(sa.Numeric(1, 0), DType.FLOAT64) 19 | 
assert_conversion(sa.DECIMAL(13, 2), DType.FLOAT64) 20 | assert_conversion(sa.DECIMAL(1, 0), DType.FLOAT64) 21 | assert_conversion(sa.Float(), DType.FLOAT64) 22 | assert_conversion(sa.Float(24), DType.FLOAT32) 23 | assert_conversion(sa.Float(53), DType.FLOAT64) 24 | 25 | assert_conversion(sa.String(), DType.STRING) 26 | assert_conversion(sa.Boolean(), DType.BOOLEAN) 27 | 28 | assert_conversion(sa.Date(), DType.DATE) 29 | assert_conversion(sa.Time(), DType.TIME) 30 | assert_conversion(sa.DateTime(), DType.DATETIME) 31 | 32 | 33 | def test_dtype_to_sqlalchemy(): 34 | def assert_conversion(type_: DType, expected): 35 | assert isinstance(type_.to_sql(), expected) 36 | 37 | assert_conversion(DType.INT64, sa.BigInteger) 38 | assert_conversion(DType.INT32, sa.Integer) 39 | assert_conversion(DType.INT16, sa.SmallInteger) 40 | assert_conversion(DType.INT8, sa.SmallInteger) 41 | 42 | assert_conversion(DType.UINT64, sa.BigInteger) 43 | assert_conversion(DType.UINT32, sa.BigInteger) 44 | assert_conversion(DType.UINT16, sa.Integer) 45 | assert_conversion(DType.UINT8, sa.SmallInteger) 46 | 47 | assert_conversion(DType.FLOAT64, sa.Float) 48 | assert_conversion(DType.FLOAT32, sa.Float) 49 | 50 | assert_conversion(DType.STRING, sa.String) 51 | assert_conversion(DType.BOOLEAN, sa.Boolean) 52 | 53 | assert_conversion(DType.DATE, sa.Date) 54 | assert_conversion(DType.TIME, sa.Time) 55 | assert_conversion(DType.DATETIME, sa.DateTime) 56 | -------------------------------------------------------------------------------- /tests/test_table_hooks/test_ibis.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pydiverse.pipedag import * 7 | 8 | # Parameterize all tests in this file with several instance_id configurations 9 | from tests.fixtures.instances import DATABASE_INSTANCES, skip_instances, with_instances 10 | from tests.util.tasks_library import assert_table_equal 11 | 12 | pytestmark = [pytest.mark.ibis, with_instances(DATABASE_INSTANCES)] 13 | 14 | 15 | try: 16 | import ibis 17 | except ImportError: 18 | ibis = None 19 | 20 | 21 | # connectorx and thus ibis have trouble with db2+ibm_db:// URLs and mssql 22 | @skip_instances("ibm_db2", "mssql") 23 | def test_table_store(): 24 | IbisTable = ibis.api.Table 25 | 26 | @materialize() 27 | def in_table(): 28 | return Table( 29 | pd.DataFrame( 30 | { 31 | "col": [0, 1, 2, 3], 32 | } 33 | ) 34 | ) 35 | 36 | @materialize() 37 | def expected_out_table(): 38 | return Table( 39 | pd.DataFrame( 40 | { 41 | "col": [0, 1, 2, 3], 42 | "x": [1, 1, 1, 1], 43 | "y": [2, 2, 2, 2], 44 | } 45 | ) 46 | ) 47 | 48 | @materialize(input_type=IbisTable) 49 | def noop(x): 50 | return Table(x) 51 | 52 | @materialize(lazy=True, input_type=IbisTable) 53 | def noop_lazy(x): 54 | return Table(x) 55 | 56 | @materialize(input_type=IbisTable) 57 | def add_column(x: IbisTable): 58 | return Table(x.mutate(x=ibis.literal(1))) 59 | 60 | @materialize(lazy=True, input_type=IbisTable) 61 | def add_column_lazy(x: IbisTable): 62 | return Table(x.mutate(y=ibis.literal(2))) 63 | 64 | with Flow() as f: 65 | with Stage("ibis"): 66 | table = in_table() 67 | table = noop(table) 68 | table = noop_lazy(table) 69 | table = add_column(table) 70 | table = add_column_lazy(table) 71 | 72 | expected = expected_out_table() 73 | _ = assert_table_equal(table, expected, check_dtype=False) 74 | 75 | assert f.run().successful 76 | 
-------------------------------------------------------------------------------- /tests/test_table_hooks/test_pdtransform.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pydiverse.pipedag import * 7 | 8 | # Parameterize all tests in this file with several instance_id configurations 9 | from tests.fixtures.instances import DATABASE_INSTANCES, with_instances 10 | from tests.util.tasks_library import assert_table_equal 11 | 12 | pytestmark = [pytest.mark.pdtransform, with_instances(DATABASE_INSTANCES)] 13 | 14 | try: 15 | import pydiverse.transform as pdt 16 | 17 | _ = pdt 18 | 19 | try: 20 | from pydiverse.transform.core.verbs import mutate 21 | from pydiverse.transform.eager import PandasTableImpl 22 | from pydiverse.transform.lazy import SQLTableImpl 23 | 24 | # ensures a "used" state for the import, preventing black from deleting it 25 | _ = PandasTableImpl 26 | 27 | test_list = [SQLTableImpl, PandasTableImpl] 28 | except ImportError: 29 | try: 30 | from pydiverse.transform.extended import Pandas, Polars, SqlAlchemy, mutate 31 | 32 | test_list = [SqlAlchemy, Polars, Pandas] 33 | except ImportError: 34 | raise NotImplementedError( 35 | "pydiverse.transform 0.2.0 - 0.2.2 isn't supported" 36 | ) from None 37 | except ImportError: 38 | test_list = [] 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "impl_type", 43 | test_list, 44 | ) 45 | def test_table_store(impl_type: type): 46 | def cache_fn(*args, **kwargs): 47 | return impl_type.__name__ 48 | 49 | @materialize() 50 | def in_table(): 51 | return Table( 52 | pd.DataFrame( 53 | { 54 | "col": [0, 1, 2, 3], 55 | } 56 | ) 57 | ) 58 | 59 | @materialize() 60 | def expected_out_table(): 61 | return Table( 62 | pd.DataFrame( 63 | { 64 | "col": [0, 1, 2, 3], 65 | "x": [1, 1, 1, 1], 66 | "y": [2, 2, 2, 2], 67 | } 68 | ) 69 | ) 70 | 71 | @materialize(input_type=impl_type, cache=cache_fn) 72 | def noop(x): 73 | return Table(x) 74 | 75 | @materialize(lazy=True, input_type=impl_type, cache=cache_fn) 76 | def noop_lazy(x): 77 | return Table(x) 78 | 79 | @materialize(input_type=impl_type, cache=cache_fn) 80 | def add_column(x): 81 | return Table(x >> mutate(x=1)) 82 | 83 | @materialize(lazy=True, input_type=impl_type, cache=cache_fn) 84 | def add_column_lazy(x): 85 | return Table(x >> mutate(y=2)) 86 | 87 | with Flow() as f: 88 | with Stage("pdtransform"): 89 | table = in_table() 90 | table = noop(table) 91 | table = noop_lazy(table) 92 | table = add_column(table) 93 | table = add_column_lazy(table) 94 | 95 | expected = expected_out_table() 96 | _ = assert_table_equal(table, expected, check_dtype=False) 97 | 98 | assert f.run().successful 99 | -------------------------------------------------------------------------------- /tests/test_table_hooks/test_tidypolars.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from pydiverse.pipedag import * 6 | 7 | # Parameterize all tests in this file with several instance_id configurations 8 | from tests.fixtures.instances import DATABASE_INSTANCES, skip_instances, with_instances 9 | from tests.util.tasks_library import assert_table_equal 10 | 11 | pytestmark = [ 12 | pytest.mark.polars, 13 | with_instances(DATABASE_INSTANCES), 14 | skip_instances("duckdb"), 15 | ] 16 | 17 | 18 | try: 19 | import tidypolars as tp 20 | except ImportError: 21 | tp = None 22 | 23 | 24 | 
@pytest.mark.skipif(tp is None, reason="Test requires tidypolars to be installed") 25 | def test_table_store(): 26 | @materialize() 27 | def in_table(): 28 | return Table( 29 | tp.Tibble( 30 | { 31 | "col": [0, 1, 2, 3], 32 | } 33 | ) 34 | ) 35 | 36 | @materialize() 37 | def expected_out_table(): 38 | return Table( 39 | tp.Tibble( 40 | { 41 | "col": [0, 1, 2, 3], 42 | "x": [1, 1, 1, 1], 43 | "y": [2, 2, 2, 2], 44 | } 45 | ) 46 | ) 47 | 48 | @materialize(input_type=tp.Tibble) 49 | def noop(x): 50 | return Table(x) 51 | 52 | @materialize(lazy=True, input_type=tp.Tibble) 53 | def noop_lazy(x): 54 | return Table(x) 55 | 56 | @materialize(input_type=tp.Tibble) 57 | def add_column(x): 58 | return Table(x.mutate(x=1)) 59 | 60 | @materialize(lazy=True, input_type=tp.Tibble) 61 | def add_column_lazy(x): 62 | return Table(x.mutate(y=2)) 63 | 64 | with Flow() as f: 65 | with Stage("tidypolars"): 66 | table = in_table() 67 | table = noop(table) 68 | table = noop_lazy(table) 69 | table = add_column(table) 70 | table = add_column_lazy(table) 71 | 72 | expected = expected_out_table() 73 | _ = assert_table_equal(table, expected, check_dtype=False) 74 | 75 | assert f.run().successful 76 | -------------------------------------------------------------------------------- /tests/test_unicode.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | import sqlalchemy as sa 5 | 6 | from pydiverse.pipedag import Flow, Stage, materialize 7 | from pydiverse.pipedag.context import StageLockContext 8 | 9 | # Parameterize all tests in this file with several instance_id configurations 10 | from tests.fixtures.instances import ( 11 | ALL_INSTANCES, 12 | ORCHESTRATION_INSTANCES, 13 | skip_instances, 14 | with_instances, 15 | ) 16 | from tests.util import tasks_library as m 17 | from tests.util.tasks_library import simple_dataframe 18 | 19 | pytestmark = [with_instances(ALL_INSTANCES, ORCHESTRATION_INSTANCES)] 20 | 21 | 22 | def test_unicode(unicode_str="äöüßéç"): 23 | @materialize(lazy=True, input_type=sa.Table) 24 | def unicode(src): 25 | return sa.select(sa.literal(unicode_str).label("a")).select_from(src).limit(1) 26 | 27 | with Flow("flow") as f: 28 | with Stage("stage"): 29 | dummy_source = simple_dataframe() 30 | x = unicode(dummy_source) 31 | x2 = m.noop(x) 32 | x3 = m.noop_lazy(x2) 33 | m.assert_table_equal(x, x2) 34 | m.assert_table_equal(x, x3) 35 | 36 | with StageLockContext(): 37 | result = f.run() 38 | assert result.successful 39 | assert result.get(x3, as_type=pd.DataFrame)["a"][0] == unicode_str 40 | 41 | 42 | @skip_instances("mssql", "mssql_pytsql") 43 | def test_unicode_beyond_mssql(): 44 | test_unicode("λ") 45 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import traceback 4 | 5 | import pytest 6 | 7 | from pydiverse.pipedag.errors import DisposedError 8 | from pydiverse.pipedag.util import Disposable, requires 9 | 10 | 11 | def test_requires(): 12 | @requires(None, ImportError("Some Error")) 13 | class BadClass: 14 | a = 1 15 | b = 2 16 | 17 | # Shouldn't be able to create instance 18 | with pytest.raises(ImportError, match="Some Error"): 19 | BadClass() 20 | 21 | # Shouldn't be able to access class attribute 22 | with pytest.raises(ImportError, match="Some Error"): 23 | _ = BadClass.a 24 | 25 | # If all requirements are 
fulfilled, nothing should change 26 | @requires((pytest,), Exception("This shouldn't happen")) 27 | class GoodClass: 28 | a = 1 29 | 30 | _ = GoodClass() 31 | _ = GoodClass.a 32 | 33 | 34 | def test_disposable(): 35 | class Foo(Disposable): 36 | a = 1 37 | 38 | def bar(self): 39 | return 2 40 | 41 | x = Foo() 42 | 43 | assert x.a == 1 44 | assert x.bar() == 2 45 | 46 | x.dispose() 47 | 48 | with pytest.raises(DisposedError): 49 | _ = x.a 50 | with pytest.raises(DisposedError): 51 | x.foo() 52 | with pytest.raises(DisposedError): 53 | x.dispose() 54 | with pytest.raises(DisposedError): 55 | x.a = 1 56 | 57 | 58 | def test_format_exception(): 59 | # traceback.format_exception syntax changed from python 3.9 to 3.10 60 | # thus we use traceback.format_exc() 61 | try: 62 | raise RuntimeError("this error is intended by test") 63 | except RuntimeError: 64 | trace = traceback.format_exc() 65 | assert 'RuntimeError("this error is intended by test")' in trace 66 | assert "test_util.py" in trace 67 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from tests.util.pytest_raises import swallowing_raises 4 | from tests.util.sql import compile_sql, select_as 5 | -------------------------------------------------------------------------------- /tests/util/dask_patch.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from io import BytesIO 4 | 5 | # Patch pytest EncodedFile (from pytest-capture plugin) to be pickleable 6 | # https://github.com/mariusvniekerk/pytest-dask/blob/master/pytest_dask/serde_patch.py 7 | from _pytest.capture import EncodedFile 8 | 9 | 10 | def apply_getsetstate(cls): 11 | def inner(ref): 12 | cls.__getstate__ = ref.__getstate__ 13 | cls.__reduce__ = ref.__reduce__ 14 | cls.__reduce_ex__ = ref.__reduce_ex__ 15 | return cls 16 | 17 | return inner 18 | 19 | 20 | @apply_getsetstate(EncodedFile) 21 | class _EncodedFile: 22 | def __getstate__(self): 23 | assert isinstance(self, EncodedFile) 24 | current_position = self.buffer.seek(0, 1) 25 | self.buffer.seek(0) 26 | value = self.buffer.read() 27 | self.buffer.seek(current_position, 0) 28 | return {"value": value, "encoding": self.encoding} 29 | 30 | def __reduce__(self): 31 | state = self.__getstate__() 32 | return self.__class__, (BytesIO(state["value"]), state["encoding"]) 33 | 34 | def __reduce_ex__(self, protocol): 35 | _ = protocol 36 | return self.__reduce__() 37 | -------------------------------------------------------------------------------- /tests/util/pytest_raises.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | 5 | import pytest 6 | 7 | from pydiverse.pipedag import ConfigContext 8 | 9 | 10 | @contextlib.contextmanager 11 | def swallowing_raises(*args, **kwargs): 12 | with ConfigContext.get().evolve(swallow_exceptions=True): 13 | with pytest.raises(*args, **kwargs) as raises: 14 | yield raises 15 | -------------------------------------------------------------------------------- /tests/util/spy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copy 4 | import unittest.mock 5 | 6 | from pydiverse.pipedag.core.task import Task, TaskGetItem 7 | from 
pydiverse.pipedag.materialize.core import MaterializingTask 8 | 9 | 10 | class PipedagMock: 11 | def __init__(self, mock: unittest.mock.Mock): 12 | self.mock = mock 13 | self._last_call_count = mock.call_count 14 | 15 | def reset_call_count(self): 16 | self._last_call_count = self.mock.call_count 17 | 18 | def _calls_since_last_time(self): 19 | delta = self.mock.call_count - self._last_call_count 20 | self.reset_call_count() 21 | return delta 22 | 23 | def _assert_call_count(self, n): 24 | __tracebackhide__ = True 25 | m = self._calls_since_last_time() 26 | if n == m: 27 | return 28 | name = self.mock.mock.__dict__["_mock_name"] 29 | msg = ( 30 | f"Expected function '{name}' to have been called {n} times, but it has" 31 | f" been called {m} times ({self.mock.call_count} times in total)." 32 | ) 33 | raise AssertionError(msg) 34 | 35 | def assert_not_called(self): 36 | __tracebackhide__ = True 37 | self._assert_call_count(0) 38 | 39 | def assert_called_once(self): 40 | __tracebackhide__ = True 41 | self._assert_call_count(1) 42 | 43 | def assert_called(self, times): 44 | __tracebackhide__ = True 45 | self._assert_call_count(times) 46 | 47 | 48 | def spy_task(mocker, task) -> PipedagMock: 49 | if isinstance(task, TaskGetItem): 50 | task = task.task 51 | if isinstance(task, MaterializingTask): 52 | task.fn = copy.copy(task.fn) 53 | spy = mocker.spy(task.fn, "fn") 54 | elif isinstance(task, Task): 55 | task_fn = task.fn 56 | 57 | def fn(*args, **kwargs): 58 | return task_fn(*args, **kwargs) 59 | 60 | task.fn = fn 61 | spy = mocker.spy(task, "fn") 62 | else: 63 | raise TypeError("Expected object of type Task or TaskGetItem") 64 | 65 | spy.mock.__dict__["_mock_name"] = task.name 66 | return PipedagMock(spy) 67 | -------------------------------------------------------------------------------- /tests/util/sql.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copy 4 | 5 | import sqlalchemy as sa 6 | 7 | from pydiverse.pipedag.backend import BaseTableStore 8 | from pydiverse.pipedag.context import ConfigContext 9 | 10 | 11 | def select_as(value, as_): 12 | return sa.select(sa.literal(value).label(as_)) 13 | 14 | 15 | def sql_table_expr(cols: dict): 16 | num_values = {len(vals) for vals in cols.values()} 17 | assert len(num_values) == 1 18 | 19 | queries = [] 20 | num_values = num_values.pop() 21 | for i in range(num_values): 22 | literals = [] 23 | for col, vals in cols.items(): 24 | literals.append(sa.literal(vals[i]).label(col)) 25 | 26 | queries.append(sa.select(*literals)) 27 | 28 | return sa.union_all(*queries) 29 | 30 | 31 | def compile_sql(query): 32 | engine = ConfigContext.get().store.table_store.engine 33 | return str(query.compile(engine, compile_kwargs={"literal_binds": True})) 34 | 35 | 36 | def get_config_with_table_store( 37 | base_cfg: ConfigContext, table_store_class: type[BaseTableStore] 38 | ): 39 | instance = base_cfg.instance_name 40 | flow = base_cfg.flow_name 41 | cfg = ConfigContext.new( 42 | copy.deepcopy(base_cfg._config_dict), base_cfg.pipedag_name, flow, instance 43 | ) 44 | cfg._config_dict["table_store"]["class"] = table_store_class 45 | # this actually instantiates the table store 46 | table_store = cfg.store.table_store 47 | assert type(table_store) == table_store_class 48 | return cfg 49 | --------------------------------------------------------------------------------