├── .git-blame-ignore-revs
├── .gitattributes
├── .github
│   ├── CODEOWNERS
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   ├── scripts
│   │   └── docker_compose_ready.sh
│   └── workflows
│       ├── nightly_tests.yml
│       ├── release.yml
│       ├── test.yml
│       ├── tests.yml
│       └── update-lockfiles.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── LICENSE
├── README.md
├── docker-compose.yaml
├── docker_db2.env_list
├── docs
│   ├── Makefile
│   ├── make.bat
│   ├── package
│   │   └── README.md
│   └── source
│       ├── best_practices.md
│       ├── changelog.md
│       ├── conf.py
│       ├── database_testing.md
│       ├── examples.md
│       ├── examples
│       │   ├── best_practices_inline.md
│       │   ├── best_practices_instances.md
│       │   ├── best_practices_sql.md
│       │   ├── color_legend.svg
│       │   ├── environment.yml
│       │   ├── group_and_visualize.ipynb
│       │   ├── group_and_visualize.md
│       │   ├── group_and_visualize01.svg
│       │   ├── group_and_visualize02.svg
│       │   ├── group_and_visualize03.svg
│       │   ├── group_and_visualize04.svg
│       │   ├── group_and_visualize05.svg
│       │   ├── group_and_visualize06.svg
│       │   ├── imperative_materialization.md
│       │   ├── interactive_development.md
│       │   ├── multi_instance_pipeline.md
│       │   ├── multi_instance_pipeline.zip
│       │   ├── raw_sql.md
│       │   ├── raw_sql.zip
│       │   ├── realistic_pipeline.md
│       │   ├── realistic_pipeline.zip
│       │   ├── simple_pipeline.md
│       │   ├── simple_pipeline01.svg
│       │   ├── stage_validation.md
│       │   └── stage_validation.svg
│       ├── index.md
│       ├── license.md
│       ├── quickstart.md
│       ├── reference
│       │   ├── api.rst
│       │   ├── cli.md
│       │   └── config.md
│       └── table_backends.md
├── example
│   ├── run_pipeline.py
│   ├── simple_pipeline.py
│   ├── stage_validation.py
│   ├── visualization.py
│   └── visualization_legend.py
├── example_imperative
│   ├── failing_example.py
│   └── run_pipeline.py
├── example_interactive
│   ├── failing_flow_after_successful_debugging.py
│   └── run_tasks_interactively.py
├── example_postgres
│   ├── docker-compose.yaml
│   ├── pipedag.yaml
│   └── run_pipeline.py
├── pipedag.yaml
├── pixi.lock
├── pixi.toml
├── pyproject.toml
├── pytest.ini
├── src
│   └── pydiverse
│       ├── .gitignore
│       └── pipedag
│           ├── __init__.py
│           ├── _typing.py
│           ├── backend
│           │   ├── __init__.py
│           │   ├── blob.py
│           │   ├── lock
│           │   │   ├── __init__.py
│           │   │   ├── base.py
│           │   │   ├── database.py
│           │   │   ├── filelock.py
│           │   │   ├── nolock.py
│           │   │   └── zookeeper.py
│           │   └── table
│           │       ├── __init__.py
│           │       ├── base.py
│           │       ├── cache
│           │       │   ├── __init__.py
│           │       │   ├── base.py
│           │       │   └── parquet.py
│           │       ├── dict.py
│           │       ├── sql
│           │       │   ├── __init__.py
│           │       │   ├── ddl.py
│           │       │   ├── dialects
│           │       │   │   ├── __init__.py
│           │       │   │   ├── duckdb.py
│           │       │   │   ├── ibm_db2.py
│           │       │   │   ├── mssql.py
│           │       │   │   ├── postgres.py
│           │       │   │   └── snowflake.py
│           │       │   ├── hooks.py
│           │       │   ├── reflection.py
│           │       │   └── sql.py
│           │       └── util
│           │           ├── __init__.py
│           │           └── dtype.py
│           ├── container
│           │   └── __init__.py
│           ├── context
│           │   ├── __init__.py
│           │   ├── context.py
│           │   ├── run_context.py
│           │   └── trace_hook.py
│           ├── core
│           │   ├── __init__.py
│           │   ├── config.py
│           │   ├── flow.py
│           │   ├── group_node.py
│           │   ├── result.py
│           │   ├── stage.py
│           │   └── task.py
│           ├── debug
│           │   └── __init__.py
│           ├── engine
│           │   ├── __init__.py
│           │   ├── base.py
│           │   ├── dask.py
│           │   ├── prefect.py
│           │   └── sequential.py
│           ├── errors
│           │   └── __init__.py
│           ├── management
│           │   ├── __init__.py
│           │   ├── cli.py
│           │   └── commands
│           │       ├── __init__.py
│           │       ├── clear_metadata.py
│           │       └── delete_schemas.py
│           ├── materialize
│           │   ├── __init__.py
│           │   ├── cache.py
│           │   ├── core.py
│           │   ├── debug.py
│           │   ├── details.py
│           │   ├── metadata.py
│           │   └── store.py
│           └── util
│               ├── __init__.py
│               ├── computation_tracing.py
│               ├── deep_map.py
│               ├── deep_merge.py
│               ├── disposable.py
│               ├── hashing.py
│               ├── import_.py
│               ├── ipc.py
│               ├── json.py
│               ├── naming.py
│               └── structlog.py
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── fixtures
    │   ├── __init__.py
    │   └── instances.py
    ├── parallelize
    │   ├── README.md
    │   ├── __init__.py
    │   ├── hooks.py
    │   ├── plugin.py
    │   ├── sesson.py
    │   ├── util.py
    │   └── worker.py
    ├── test_cache
    │   ├── test_auto_version.py
    │   ├── test_basic_cache_invalidation.py
    │   ├── test_ignore_cache_function.py
    │   └── test_local_table_cache.py
    ├── test_compression.py
    ├── test_core.py
    ├── test_dask.py
    ├── test_flows
    │   ├── complex_config_flows
    │   │   ├── pipedag_anchor.yaml
    │   │   ├── pipedag_complex.yaml
    │   │   ├── postgres_password.yaml
    │   │   ├── test_instance_selection.py
    │   │   └── test_locking_instances.py
    │   ├── raw_sql_scripts
    │   │   ├── mssql
    │   │   │   ├── create_db_helpers.sql
    │   │   │   ├── prep
    │   │   │   │   ├── entity_checks.sql
    │   │   │   │   └── more_tables.sql
    │   │   │   └── raw
    │   │   │       └── raw_views.sql
    │   │   ├── mssql_pytsql
    │   │   │   ├── create_db_helpers.sql
    │   │   │   ├── prep
    │   │   │   │   ├── entity_checks.sql
    │   │   │   │   └── more_tables.sql
    │   │   │   └── raw
    │   │   │       └── raw_views.sql
    │   │   └── mssql_pytsql_isolate
    │   │       ├── create_db_helpers.sql
    │   │       ├── prep
    │   │       │   ├── entity_checks.sql
    │   │       │   └── more_tables.sql
    │   │       └── raw
    │   │           └── raw_views.sql
    │   ├── sql_scripts
    │   │   ├── script1-db2.sql
    │   │   ├── script1.sql
    │   │   └── script2.sql
    │   ├── test_example.py
    │   ├── test_flow.py
    │   ├── test_raw_sql_pipeline.py
    │   ├── test_simple_flow.py
    │   ├── test_source_invalidation.py
    │   └── test_sql_text_node.py
    ├── test_indexes.py
    ├── test_input_stage_versions.py
    ├── test_inputs.py
    ├── test_lock_manager.py
    ├── test_materialize.py
    ├── test_materializing_task.py
    ├── test_raw_sql
    │   ├── scripts
    │   │   ├── mssql
    │   │   │   ├── create_tables
    │   │   │   │   └── simple_tables.sql
    │   │   │   └── schema_swap
    │   │   │       ├── check_objects.sql
    │   │   │       └── create_objects.sql
    │   │   └── postgres
    │   │       └── create_tables
    │   │           └── simple_tables.sql
    │   ├── test_raw_sql_input.py
    │   ├── test_raw_sql_schema_swap.py
    │   └── util.py
    ├── test_run_group_node.json
    ├── test_run_group_node.py
    ├── test_run_subflow.py
    ├── test_sql_ddl.py
    ├── test_sql_dialect
    │   ├── scripts
    │   │   ├── lock
    │   │   ├── simple_nicknames.sql
    │   │   └── simple_table_spaces.sql
    │   ├── test_ibm_db2.py
    │   └── test_postgres.py
    ├── test_table_hooks
    │   ├── lock
    │   ├── test_dtype_pandas.py
    │   ├── test_dtype_polars.py
    │   ├── test_dtype_pyarrow.py
    │   ├── test_dtype_sqlalchemy.py
    │   ├── test_ibis.py
    │   ├── test_pandas_hook.py
    │   ├── test_pdtransform.py
    │   ├── test_polars.py
    │   ├── test_sql_table_reference.py
    │   └── test_tidypolars.py
    ├── test_unicode.py
    ├── test_util.py
    └── util
        ├── __init__.py
        ├── baseline.py
        ├── dask_patch.py
        ├── pytest_raises.py
        ├── spy.py
        ├── sql.py
        ├── tasks_library.py
        └── tasks_library_imperative.py
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # Turn into installable package
2 | af97118ac7596c7b83abf5c4739451bf70fefa18
3 | # Reformat using black
4 | a8fc1a37386d867759f6526f159e8f586bdaedc3
5 | # Ruff
6 | e14623fce7efea34513b2efa2701ccf59b4df559
7 | 7eed813324e91c98052faa7177fdbf4320fb1ee1
8 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 |
3 | *.{diff,patch} binary
4 |
5 | *.{py,yaml,yml,sh} text eol=lf
6 | *.bat text eol=crlf
7 |
8 | pixi.lock merge=binary linguist-language=YAML linguist-generated=true
9 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @pydiverse/code-owners
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
5 |
6 | # Checklist
7 |
8 | - [ ] Added a `docs/source/changelog.md` entry
9 | - [ ] Added/updated documentation in `docs/source/`
10 | - [ ] Added/updated examples in `docs/source/examples.md`
11 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: /
5 | schedule:
6 | interval: monthly
7 | groups:
8 | gh-actions:
9 | patterns:
10 | - "*"
11 |
--------------------------------------------------------------------------------
/.github/scripts/docker_compose_ready.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script checks if all the services defined in our docker compose file
3 | # are up and running.
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | running_services=$(docker compose ps --services --status running)
9 |
10 | if [[ "$running_services" =~ "postgres" ]]; then
11 | docker compose logs postgres 2>&1 | grep "database system is ready to accept connections" > /dev/null
12 | fi
13 |
14 | if [[ "$running_services" =~ "mssql" ]]; then
15 | docker compose logs mssql 2>&1 | grep "SQL Server is now ready for client connections" > /dev/null
16 | fi
17 |
18 | if [[ "$running_services" =~ "ibm_db2" ]]; then
19 | docker compose logs ibm_db2 2>&1 | grep "Setup has completed" > /dev/null
20 | fi
21 |
22 | if [[ "$running_services" =~ "zoo" ]]; then
23 | echo ruok | nc localhost 2181 > /dev/null
24 | fi
25 |
--------------------------------------------------------------------------------
/.github/workflows/nightly_tests.yml:
--------------------------------------------------------------------------------
1 | name: Nightly Tests
2 |
3 | on:
4 | schedule:
5 | - cron: "0 2 * * *"
6 | workflow_dispatch:
7 |
8 | jobs:
9 | check:
10 | runs-on: ubuntu-latest
11 | name: Check latest commit
12 | outputs:
13 | should-run: ${{ steps.should-run.outputs.should-run }}
14 | steps:
15 | - uses: actions/checkout@v4
16 |
17 | - name: check if latest commit is within 24 hrs
18 | id: should-run
19 | continue-on-error: true
20 | if: ${{ github.event_name == 'schedule' }}
21 |         run: test -z $(git rev-list --after="24 hours" ${{ github.sha }}) && echo "::set-output name=should-run::false"
22 |
23 | os_test:
24 | name: OS Test
25 | needs: [check]
26 | if: ${{ needs.check.outputs.should-run != 'false' }}
27 | strategy:
28 | matrix:
29 | os:
30 | - ubuntu-latest
31 | - macos-latest
32 | - windows-latest
33 | environment:
34 | - py312
35 | - py311
36 | - py39
37 | - py39pdsa1
38 | uses: ./.github/workflows/test.yml
39 | with:
40 | os: ${{ matrix.os }}
41 | environment: ${{ matrix.environment }}
42 | docker-services: |
43 | postgres
44 | zoo
45 | pytest-arguments: --mssql -m mssql --polars --ibis --pdtransform
46 |
47 | library_version_test:
48 | name: Library Version Test
49 | needs: [check]
50 | if: ${{ needs.check.outputs.should-run != 'false' }}
51 | strategy:
52 | matrix:
53 | os:
54 | - ubuntu-latest
55 | - macos-latest
56 | - windows-latest
57 | environment:
58 | - py312all
59 | - py311all
60 | - py310all
61 | - py39pdsa1all
62 | - py311pdsa1all
63 | uses: ./.github/workflows/test.yml
64 | with:
65 | os: ${{ matrix.os }}
66 | environment: ${{ matrix.environment }}
67 | docker-services: |
68 | postgres
69 | zoo
70 | pytest-arguments: --mssql --ibm_db2 --snowflake --polars --pdtransform
71 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | tags:
8 | - '*.*.*'
9 | pull_request:
10 |
11 | jobs:
12 | build:
13 | name: Build Package
14 | runs-on: ubuntu-latest
15 | steps:
16 | - name: Checkout branch
17 | uses: actions/checkout@v4
18 |
19 | - name: Set up pixi
20 | uses: prefix-dev/setup-pixi@v0.8.1
21 | with:
22 | environments: release
23 |
24 | - name: Ensure tag matches version
25 | if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') }}
26 | run: |
27 | version="$(pixi exec -s go-yq -- yq .project.version pyproject.toml)"
28 | tag="${{ github.ref_name }}"
29 | if [ "$version" != "$tag" ]; then
30 | echo "Tag $tag does not match version $version"
31 | exit 1
32 | fi
33 |
34 | - name: Build
35 | run: pixi run -e release hatch build
36 |
37 | - name: Check build
38 | run: pixi run -e release twine check dist/*
39 |
40 | - name: List files
41 | run: ls -l dist/
42 |
43 | - name: Upload package
44 | uses: actions/upload-artifact@v4
45 | with:
46 | name: artifact
47 | path: dist/*
48 |
49 | release:
50 | name: Publish Package
51 | if: startsWith(github.ref, 'refs/tags/')
52 | needs: [build]
53 | runs-on: ubuntu-latest
54 | permissions:
55 | id-token: write
56 | contents: write
57 | environment: pypi
58 | steps:
59 | - uses: actions/download-artifact@v4
60 | with:
61 | name: artifact
62 | path: dist
63 | - name: Publish package on PyPi
64 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
65 | with:
66 | # the twine version in the container is outdated
67 | # and results in a false positive
68 | verify-metadata: false
69 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | on:
2 | workflow_call:
3 | inputs:
4 | os:
5 | required: true
6 | type: string
7 | environment:
8 | required: true
9 | type: string
10 | docker-services:
11 | required: false
12 | type: string
13 | pytest-arguments:
14 | required: false
15 | type: string
16 | workers:
17 | default: 4
18 | type: number
19 | timeout-minutes:
20 | default: 30
21 | type: number
22 | secrets:
23 | SNOWFLAKE_PASSWORD:
24 | required: false
25 | SNOWFLAKE_ACCOUNT:
26 | required: false
27 | SNOWFLAKE_USER:
28 | required: false
29 |
30 | jobs:
31 | test:
32 | name: pytest
33 | runs-on: ${{ inputs.os }}
34 | timeout-minutes: ${{ inputs.timeout-minutes }}
35 | steps:
36 | - uses: actions/checkout@v4
37 |
38 | - name: Setup Pixi
39 | uses: prefix-dev/setup-pixi@v0.8.1
40 | with:
41 | environments: ${{ inputs.environment }}
42 |
43 | - name: Start Docker Compose
44 | if: ${{ inputs.docker-services != '' }}
45 | uses: isbang/compose-action@e5813a5909aca4ae36058edae58f6e52b9c971f8
46 | with:
47 | compose-file: docker-compose.yaml
48 | services: ${{ inputs.docker-services }}
49 |
50 | - name: Install Microsoft ODBC
51 | if: ${{ contains(inputs.docker-services, 'mssql') }}
52 | run: sudo ACCEPT_EULA=Y apt-get install -y msodbcsql18
53 |
54 | - name: Wait for Docker Servers
55 | if: ${{ inputs.docker-services != '' }}
56 | run: |
57 | until bash ./.github/scripts/docker_compose_ready.sh; do
58 | sleep 1
59 | done
60 |
61 | - name: Run tests
62 | env:
63 | SNOWFLAKE_PASSWORD: ${{ secrets.SNOWFLAKE_PASSWORD }}
64 | SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWFLAKE_ACCOUNT }}
65 | SNOWFLAKE_USER: ${{ secrets.SNOWFLAKE_USER }}
66 | run: |
67 | pixi run -e ${{ inputs.environment }} pytest tests ${RUNNER_DEBUG:+-v} --color=yes --workers=${{ inputs.workers }} ${{ inputs.pytest-arguments }}
68 |
--------------------------------------------------------------------------------
/.github/workflows/update-lockfiles.yml:
--------------------------------------------------------------------------------
1 | name: Update lockfiles
2 |
3 | permissions:
4 | contents: write
5 | pull-requests: write
6 |
7 | on:
8 | workflow_dispatch:
9 | schedule:
10 | - cron: 0 5 1 * *
11 |
12 | jobs:
13 | pixi-update:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: actions/checkout@v4
17 | - name: Set up pixi
18 | uses: prefix-dev/setup-pixi@v0.8.1
19 | with:
20 | run-install: false
21 | - name: Update lockfiles
22 | run: |
23 | set -euo pipefail
24 | pixi update --json | pixi exec pixi-diff-to-markdown >> diff.md
25 | - name: Create pull request
26 | uses: peter-evans/create-pull-request@v6
27 | with:
28 | token: ${{ secrets.GITHUB_TOKEN }}
29 | commit-message: Update pixi lockfile
30 | title: Update pixi lockfile
31 | body-path: diff.md
32 | branch: update-pixi
33 | base: main
34 | labels: pixi
35 | delete-branch: true
36 | add-paths: pixi.lock
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | .envrc
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | .asv
29 | pip-wheel-metadata
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | /.pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 | docs/api/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # dotenv
88 | .env
89 |
90 | # virtualenv
91 | .venv
92 | venv/
93 | ENV/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | # pycharm
109 | /.idea/
110 |
111 |
112 | # experiments
113 | private_*
114 |
115 | # mlflow
116 | mlruns
117 |
118 | # vscode
119 | .vscode
120 |
121 | # direnv
122 | .envrc
123 |
124 | # baseline update files
125 | *.updated.json
126 |
127 | # pixi
128 | .pixi
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 23.3.0
4 | hooks:
5 | - id: black
6 | language_version: python3.9
7 | - repo: https://github.com/charliermarsh/ruff-pre-commit
8 | rev: v0.0.270
9 | hooks:
10 | - id: ruff
11 | - repo: https://github.com/asottile/pyupgrade
12 | rev: v3.3.1
13 | hooks:
14 | - id: pyupgrade
15 | args:
16 | - --py39-plus
17 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | version: 2
5 | build:
6 | os: ubuntu-22.04
7 | tools:
8 | python: mambaforge-latest
9 | commands:
10 | - mamba install -c conda-forge -c nodefaults pixi
11 | - pixi run -e docs postinstall
12 | - pixi run -e docs docs
13 | - pixi run -e docs readthedocs
14 | sphinx:
15 | configuration: docs/source/conf.py
16 | formats:
17 | - pdf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2022, pydiverse
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | postgres:
3 | image: postgres
4 | environment:
5 | POSTGRES_USER: sa
6 | POSTGRES_PASSWORD: Pydiverse23
7 | ports:
8 | - "6543:5432"
9 | mssql:
10 | image: mcr.microsoft.com/azure-sql-edge
11 | environment:
12 | ACCEPT_EULA: Y
13 | SA_PASSWORD: PydiQuant27
14 | ports:
15 | - "1433:1433"
16 | zoo:
17 | image: zookeeper
18 | environment:
19 | ZOO_4LW_COMMANDS_WHITELIST: ruok
20 | ZOO_MAX_CLIENT_CNXNS: 100
21 | ports:
22 | - "2181:2181"
23 | ibm_db2:
24 | platform: linux/x86_64
25 | image: icr.io/db2_community/db2
26 | privileged: true
27 | environment:
28 | LICENSE: accept
29 | DB2INSTANCE: db2inst1
30 | DB2INST1_PASSWORD: password
31 | DBNAME: testdb
32 | UPDATEAVAIL: NO
33 | ports:
34 | - 50000:50000
35 |
--------------------------------------------------------------------------------
/docker_db2.env_list:
--------------------------------------------------------------------------------
1 | LICENSE=accept
2 | DB2INSTANCE=db2inst1
3 | DB2INST1_PASSWORD=password
4 | DBNAME=testdb
5 | BLU=false
6 | ENABLE_ORACLE_COMPATIBILITY=false
7 | UPDATEAVAIL=NO
8 | TO_CREATE_SAMPLEDB=false
9 | REPODB=false
10 | IS_OSXFS=false
11 | PERSISTENT_HOME=true
12 | HADR_ENABLED=false
13 | ETCD_ENDPOINT=
14 | ETCD_USERNAME=
15 | ETCD_PASSWORD=
16 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
22 | livehtml:
23 | sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) --watch ../src $(O)
24 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/best_practices.md:
--------------------------------------------------------------------------------
1 | # Best Practices for data pipelines
2 |
3 | The Python community puts great emphasis on enabling users to stitch together a few code snippets that run as a Python file
4 | or Jupyter notebook. In practice, however, projects trying to extract significant business impact from data analytics
5 | quickly reach a size where more sophisticated code organization is needed. On the one hand, this relates to software
6 | engineering principles like modularization, unit/integration testing, IDE support, and CI/CD. On the other hand, data processing
7 | steps are best organized as a pipeline or graph of tasks. These data pipelines (see the minimal sketch at the end of this
8 | page) are the focus of the following best practice suggestions:
9 |
10 | * [moving from raw SQL via handwritten SELECT statements to programmatic SQL](/examples/best_practices_sql)
11 | * [multiple instances: full_fresh, full_stable, mini_stable, midi_stable](/examples/best_practices_instances)
12 | * [inline views, CTEs, and subqueries](/examples/best_practices_inline)
13 |
14 | ```{toctree}
15 | /examples/best_practices_sql
16 | /examples/best_practices_instances
17 | /examples/best_practices_inline
18 | ```
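19 | 
20 | As a minimal sketch of this pipeline-of-tasks idea (a condensed variant of `example/simple_pipeline.py`;
21 | names are illustrative):
22 | 
23 | ```python
24 | import pandas as pd
25 | import sqlalchemy as sa
26 | 
27 | from pydiverse.pipedag import Flow, Stage, materialize
28 | 
29 | 
30 | @materialize(version="1.0")
31 | def input_table():
32 |     # eager task: returns a DataFrame that pipedag materializes as a table
33 |     return pd.DataFrame({"id": [1, 2, 3], "age": [20, 40, 60]})
34 | 
35 | 
36 | @materialize(lazy=True, input_type=sa.Table)
37 | def select_adults(t):
38 |     # lazy task: returns a SELECT statement that is executed in the database
39 |     return sa.select(t.c.id, t.c.age).where(t.c.age >= 18)
40 | 
41 | 
42 | with Flow("flow") as flow:
43 |     with Stage("inputs"):
44 |         table = input_table()
45 |     with Stage("features"):
46 |         adults = select_adults(table)
47 | 
48 | # flow.run() executes the graph (it needs a pipedag config, see the quickstart)
49 | ```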
--------------------------------------------------------------------------------
/docs/source/examples.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | pydiverse.pipedag already has quite a rich set of features, so we provide some examples showing typical use cases.
4 |
5 | * [Quickstart example](/quickstart)
6 | * [Simple pipeline](/examples/simple_pipeline)
7 | * [Working with a real database](/database_testing)
8 | * [Imperative materialization / Materializing Subqueries](/examples/imperative_materialization)
9 | * [Interactive development](/examples/interactive_development)
10 | * [Grouping Tasks/Stages and visualization](/examples/group_and_visualize)
11 | * [Stage validation before schema swap](/examples/stage_validation)
12 | * [Slightly more realistic pipeline](/examples/realistic_pipeline)
13 | * [Introduction to the vectorization principle with some example pipelines](https://github.com/Quantco/vectorization-tutorial/blob/main/README.md)
14 | * [Multiple instances: full, mini, midi](/examples/multi_instance_pipeline)
15 | * [Raw SQL example](/examples/raw_sql)
16 | * [Best practices / moving from raw SQL via handwritten SELECT statements to programmatic SQL](/examples/best_practices_sql)
17 | * [Best practices / multiple instances: full_fresh, full_stable, mini_stable, midi_stable](/examples/best_practices_instances)
18 | * [Best practices / inline views, CTEs, and subqueries](/examples/best_practices_inline)
19 |
20 | ```{toctree}
21 | /quickstart
22 | /examples/simple_pipeline
23 | /database_testing
24 | /examples/imperative_materialization
25 | /examples/interactive_development
26 | /examples/group_and_visualize
27 | /examples/stage_validation
28 | /examples/realistic_pipeline
29 | /examples/multi_instance_pipeline
30 | /examples/raw_sql
31 | ```
--------------------------------------------------------------------------------
/docs/source/examples/best_practices_instances.md:
--------------------------------------------------------------------------------
1 | # Best practices: multiple instances: full_fresh, full_stable, mini_stable, midi_stable
2 |
3 | This story expands on the [multi_instance_pipeline example](multi_instance_pipeline) storyline.
4 |
5 | In general, data pipelines process a considerable amount of information, be it tables with 100k to 100 million rows
6 | or even billions. Thus processing times will be many minutes or hours. At the same time, iteration speed of software
7 | development on the pipeline is key: the pipeline transforms the data in a way that increases understanding, and from
8 | better understanding come changes to the code in the data pipeline.
9 |
10 | As a consequence, you should not just have one data pipeline. You should always have at least two little siblings of any
11 | pipeline:
12 | * mini: The minimal amount of data that allows the pipeline code to technically run through.
13 | * midi: A somewhat reasonable selection of data which reaches a high level of code coverage, triggers most edge cases the
14 | pipeline code is concerned with, and may be sampled in a way that allows for statistically sound conclusions, be it with
15 | reduced statistical prediction power or higher error margins. If all goals cannot be met with one subset of the input
16 | data, more pipeline instances may be needed.
17 |
18 | Another concern is that some purposes require fresh data; however, for understanding data and
19 | developing statistically significant models, it is actually rather harmful to have changing data and changing code at the
20 | same time. If you train your model on 1-3 years' worth of data, then adding the latest days or weeks does not provide
21 | much value. Thus it may even be beneficial to have separate pipelines working on fresh data and on stable data.
22 |
23 | The prototypical setup of pipeline instances with different sizes and different freshness is:
24 | - full fresh pipeline (sources raw input layer)
25 | - full stable pipeline (feeds from full fresh raw input layer)
26 | - midi stable pipeline (feeds from full stable raw input layer and filters)
27 | - mini stable pipeline (feeds from full stable raw input layer and filters)
28 |
29 | Filtering between pipeline instances is nice because it guarantees that all stable pipelines stay in sync,
30 | capturing the same data version. It is also nice because generic filtering technology can be developed independently of
31 | where data is sourced from. In the future, this code could also be provided by a separate pydiverse library.
32 |
33 | For developing and testing the source loading technology, it might also be nice to keep an additional instance which,
34 | however, should not be used for developing the actual pipeline:
35 | - mini fresh: uses the same loading technology as the full fresh pipeline, but only loads a minimal amount of data.
36 |
37 | For the full stable pipeline, it is important that the data is not changing. This can be achieved by switching the
38 | option `cache_validation: mode:` to "assert_no_fresh_input" for this pipeline once the data is loaded and should be
39 | kept stable (see the sketch at the end of this page). On the one hand, this ignores cache functions given in the
40 | `@materialize()` decorator; on the other hand, it fails if a task with a cache function changes, since such a change
41 | might bring in data from external sources.
41 |
42 | An example showing how to implement this can be found here: [](/examples/multi_instance_pipeline).
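43 | 
44 | A minimal sketch of the relevant part of a `pipedag.yaml` for such instances (instance names are
45 | illustrative; see the multi_instance_pipeline example above for a complete configuration):
46 | 
47 | ```yaml
48 | instances:
49 |   full_fresh:
50 |     # sources the raw input layer from external systems
51 |     instance_id: full_fresh
52 |   full_stable:
53 |     # feeds from the full fresh raw input layer; freeze it once the data is loaded:
54 |     instance_id: full_stable
55 |     cache_validation:
56 |       mode: "assert_no_fresh_input"
57 | ```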
--------------------------------------------------------------------------------
/docs/source/examples/environment.yml:
--------------------------------------------------------------------------------
1 | name: pipedag-howto-jupyter
2 | channels:
3 | - conda-forge
4 | - nodefaults
5 | dependencies:
6 | - pydiverse-pipedag
7 | - duckdb
8 | - duckdb-engine
9 | - ipython
10 | - jupyter
11 |
--------------------------------------------------------------------------------
/docs/source/examples/group_and_visualize01.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/examples/group_and_visualize02.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/examples/group_and_visualize03.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/examples/group_and_visualize04.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/examples/group_and_visualize05.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/examples/group_and_visualize06.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/examples/multi_instance_pipeline.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/docs/source/examples/multi_instance_pipeline.zip
--------------------------------------------------------------------------------
/docs/source/examples/raw_sql.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/docs/source/examples/raw_sql.zip
--------------------------------------------------------------------------------
/docs/source/examples/realistic_pipeline.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/docs/source/examples/realistic_pipeline.zip
--------------------------------------------------------------------------------
/docs/source/examples/simple_pipeline01.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/examples/stage_validation.svg:
--------------------------------------------------------------------------------
[SVG image: content not included]
--------------------------------------------------------------------------------
/docs/source/license.md:
--------------------------------------------------------------------------------
1 | # License
2 |
3 | ```{literalinclude} ../../LICENSE
4 | :language: none
5 | ```
--------------------------------------------------------------------------------
/docs/source/reference/api.rst:
--------------------------------------------------------------------------------
1 | ***
2 | API
3 | ***
4 |
5 | Public
6 | ======
7 |
8 | .. py:module:: pydiverse.pipedag
9 |
10 | .. autoclass:: Flow
11 | :members:
12 | :inherited-members:
13 | :special-members: __getitem__
14 |
15 | .. autoclass:: Stage
16 | :members:
17 | :inherited-members:
18 | :special-members: __getitem__
19 |
20 | .. autodecorator:: materialize
21 |
22 | .. autodecorator:: input_stage_versions
23 |
24 | .. autodata:: AUTO_VERSION
25 |
26 | .. autoclass:: Table
27 |
28 | .. autoclass:: RawSql
29 | :members:
30 | :special-members: __iter__, __getitem__, __contains__
31 |
32 | .. autoclass:: Blob
33 |
34 | .. autoclass:: GroupNode
35 |
36 | .. autoclass:: VisualizationStyle
37 |
38 | .. autoclass:: Schema
39 | :members:
40 |
41 | .. autoclass:: Result
42 | :members:
43 |
44 | .. autoclass:: PipedagConfig
45 | :inherited-members:
46 |
47 | .. autoclass:: ConfigContext
48 | :inherited-members:
49 |
50 | .. autoclass:: StageLockContext
51 | :inherited-members:
52 |
53 |
54 | Related Classes
55 | ===============
56 |
57 | .. autoclass:: pydiverse.pipedag.materialize.core.UnboundMaterializingTask(__overload__)
58 | .. autoclass:: pydiverse.pipedag.materialize.core.MaterializingTask(__overload__)
59 | :members: get_output_from_store
60 | :special-members: __getitem__
61 | .. autoclass:: pydiverse.pipedag.materialize.core.MaterializingTaskGetItem(__overload__)
62 | :members: get_output_from_store
63 | :special-members: __getitem__
64 |
65 | Backend Classes
66 | ===============
67 |
68 | Table Store
69 | -----------
70 | .. autoclass:: pydiverse.pipedag.backend.table.SQLTableStore
71 |
72 | SQLTableStore Dialects
73 | ^^^^^^^^^^^^^^^^^^^^^^
74 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.PostgresTableStore
75 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.DuckDBTableStore
76 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.MSSqlTableStore
77 | .. autoclass:: pydiverse.pipedag.backend.table.sql.dialects.IBMDB2TableStore
78 |
79 | Local Table Cache
80 | ^^^^^^^^^^^^^^^^^
81 | .. autoclass:: pydiverse.pipedag.backend.table.cache.ParquetTableCache
82 |
83 | Blob Store
84 | ----------
85 | .. autoclass:: pydiverse.pipedag.backend.blob.FileBlobStore
86 |
87 | Lock Manager
88 | ------------
89 | .. autoclass:: pydiverse.pipedag.backend.lock.DatabaseLockManager
90 | .. autoclass:: pydiverse.pipedag.backend.lock.ZooKeeperLockManager
91 | .. autoclass:: pydiverse.pipedag.backend.lock.FileLockManager
92 | .. autoclass:: pydiverse.pipedag.backend.lock.NoLockManager
93 |
94 | Orchestration Engine
95 | --------------------
96 | .. autoclass:: pydiverse.pipedag.engine.SequentialEngine
97 | .. autoclass:: pydiverse.pipedag.engine.DaskEngine
98 |
99 | .. py:class:: PrefectEngine
100 | :canonical: pydiverse.pipedag.engine.prefect.PrefectEngine
101 |
102 | Alias for either
103 |     :class:`PrefectOneEngine <pydiverse.pipedag.engine.prefect.PrefectOneEngine>` or
104 |     :class:`PrefectTwoEngine <pydiverse.pipedag.engine.prefect.PrefectTwoEngine>`
105 | depending on the version of Prefect that is installed.
106 |
107 | .. autoclass:: pydiverse.pipedag.engine.prefect.PrefectOneEngine
108 | .. autoclass:: pydiverse.pipedag.engine.prefect.PrefectTwoEngine
109 |
110 | Special Table Types
111 | -------------------
112 |
113 | .. autoclass:: pydiverse.pipedag.materialize.container.ExternalTableReference
114 |
--------------------------------------------------------------------------------
/docs/source/reference/cli.md:
--------------------------------------------------------------------------------
1 | # Command Line Utility
2 |
3 | Pipedag comes with a command line utility called `pipedag-manage` to help with some common pipedag related management operations.
4 | These are all the available commands:
5 |
6 | ```{eval-rst}
7 | .. click:: pydiverse.pipedag.management.cli:cli
8 | :prog: pipedag-manage
9 | :nested: full
10 | ```
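11 | 
12 | For example, to list the available subcommands (judging from `management/commands/`, these cover
13 | clearing metadata and deleting schemas):
14 | 
15 | ```bash
16 | pipedag-manage --help
17 | ```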
--------------------------------------------------------------------------------
/docs/source/table_backends.md:
--------------------------------------------------------------------------------
1 | # Table Backends
2 |
3 | We currently support only one battle-tested table backend:
4 |
5 | - [](#pydiverse.pipedag.backend.table.SQLTableStore)
6 |
7 | ## [](#pydiverse.pipedag.backend.table.SQLTableStore)
8 |
9 | This backend is highly flexible in terms of database dialects and task implementation styles for which it can
10 | materialize/dematerialize tables. Internally, this is abstracted via table hooks such as:
11 |
12 | ```python
13 | @SQLTableStore.register_table()
14 | class SQLAlchemyTableHook(TableHook[SQLTableStore]):
15 | ```
16 |
17 | Each hook needs to implement the following functions:
18 |
19 | ```python
20 | def can_materialize(cls, type_) -> bool:
21 | def can_retrieve(cls, type_) -> bool:
22 | def materialize(cls, store: SQLTableStore, table: Table, stage_name):
23 | def retrieve(cls, store, table, stage_name, as_type: type):
24 | def lazy_query_str(cls, store, obj) -> str:
25 | ```
26 |
27 | The SQLTableStore currently supports the following SQL databases/dialects:
28 |
29 | - Postgres
30 | - Snowflake
31 | - Microsoft SQL Server/TSQL
32 | - IBM DB2 (LUW)
33 | - DuckDB (rather used for testing so far)
34 | - Every dialect unknown to pipedag is treated like a Postgres database (issues are likely)
35 |
36 | Example connection strings:
37 | - Postgres: `postgresql://user:password@localhost:5432/{instance_id}`
38 | - Snowflake: `snowflake://{$SNOWFLAKE_USER}:{$SNOWFLAKE_PASSWORD}@{$SNOWFLAKE_ACCOUNT}/database_name/DBO?warehouse=warehouse_name&role=access_role`
39 | - Microsoft SQL Server: `mssql+pyodbc://user:password@127.0.0.1:1433/{instance_id}?driver=ODBC+Driver+18+for+SQL+Server&encrypt=no`
40 | - IBM DB2: `db2+ibm_db://db2inst1:password@localhost:50000/testdb`, `schema_prefix: "{instance_id}_"`
41 | - DuckDB: `duckdb:////tmp/pipedag/{instance_id}/db.duckdb`
42 |
43 | See [Database Testing](database_testing.md) for an example of how to spin up a database for testing.
44 |
45 | SQLTableStore supports the following `input_type` arguments to the {py:func}`@materialize `
46 | decorator out-of-the-box:
47 |
48 | - `sqlalchemy.Table` (see [https://www.sqlalchemy.org/](https://www.sqlalchemy.org/); recommended with `lazy=True`;
49 | can also be used for composing handwritten SQL strings)
50 | - `pydiverse.transform.eager.PandasTableImpl` (see
51 | [https://pydiversetransform.readthedocs.io/en/latest/](https://pydiversetransform.readthedocs.io/en/latest/);
52 | recommended with manual version bumping and `version="X.Y.Z"`)
53 | - `pydiverse.transform.lazy.SQLTableImpl` (
54 | see [https://pydiversetransform.readthedocs.io/en/latest/](https://pydiversetransform.readthedocs.io/en/latest/);
55 | recommended with `lazy=True`)
56 | - `ibis.Table` (see [https://ibis-project.org/](https://ibis-project.org/); recommended with `lazy=True`)
57 | - `tidypolars.Tibble` (see [https://github.com/markfairbanks/tidypolars](https://github.com/markfairbanks/tidypolars);
58 | recommended with `lazy=True`)
59 | - `pandas.DataFrame` (see [https://pandas.pydata.org/](https://pandas.pydata.org/); recommended with manual version
60 | bumping and `version="X.Y.Z"`)
61 | - `polars.DataFrame` (see [https://pola.rs/](https://pola.rs/); recommended with manual version bumping
62 | and `version="X.Y.Z"`)
63 | - `polars.LazyFrame` (see [https://pola.rs/](https://pola.rs/); recommended with `version=AUTO_VERSION`)
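64 | 
65 | For illustration, a lazy task consuming its inputs with `input_type=sa.Table` might look like this
66 | (a sketch along the lines of `example/run_pipeline.py`; names are illustrative):
67 | 
68 | ```python
69 | import sqlalchemy as sa
70 | 
71 | from pydiverse.pipedag import Table, materialize
72 | 
73 | 
74 | @materialize(lazy=True, input_type=sa.Table)
75 | def join_inputs(input1: sa.sql.expression.Alias, input2: sa.sql.expression.Alias):
76 |     # with lazy=True, the rendered SELECT statement serves as the cache key
77 |     query = sa.select(
78 |         (input1.c.x * 5).label("x5"),
79 |         input2.c.a,
80 |     ).select_from(input1.outerjoin(input2, input2.c.x == input1.c.x))
81 |     return Table(query, name="joined_out")
82 | ```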
--------------------------------------------------------------------------------
/example/run_pipeline.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import tempfile
4 |
5 | import pandas as pd
6 | import sqlalchemy as sa
7 |
8 | from pydiverse.pipedag import Flow, Stage, Table, materialize
9 | from pydiverse.pipedag.context import StageLockContext
10 | from pydiverse.pipedag.core.config import create_basic_pipedag_config
11 | from pydiverse.pipedag.util.structlog import setup_logging
12 |
13 |
14 | @materialize(lazy=True)
15 | def lazy_task_1():
16 | return sa.select(
17 | sa.literal(1).label("x"),
18 | sa.literal(2).label("y"),
19 | )
20 |
21 |
22 | @materialize(lazy=True, input_type=sa.Table)
23 | def lazy_task_2(input1: sa.sql.expression.Alias, input2: sa.sql.expression.Alias):
24 | query = sa.select(
25 | (input1.c.x * 5).label("x5"),
26 | input2.c.a,
27 | ).select_from(input1.outerjoin(input2, input2.c.x == input1.c.x))
28 |
29 | return Table(query, name="task_2_out", primary_key=["a"])
30 |
31 |
32 | @materialize(lazy=True, input_type=sa.Table)
33 | def lazy_task_3(input1: sa.sql.expression.Alias):
34 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}")
35 |
36 |
37 | @materialize(lazy=True, input_type=sa.Table)
38 | def lazy_task_4(input1: sa.sql.expression.Alias):
39 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}")
40 |
41 |
42 | @materialize(nout=2, version="1.0.0")
43 | def eager_inputs():
44 | dfA = pd.DataFrame(
45 | {
46 | "a": [0, 1, 2, 4],
47 | "b": [9, 8, 7, 6],
48 | }
49 | )
50 | dfB = pd.DataFrame(
51 | {
52 | "a": [2, 1, 0, 1],
53 | "x": [1, 1, 2, 2],
54 | }
55 | )
56 | return Table(dfA, "dfA"), Table(dfB, "dfB_%%")
57 |
58 |
59 | @materialize(version="1.0.0", input_type=pd.DataFrame)
60 | def eager_task(tbl1: pd.DataFrame, tbl2: pd.DataFrame):
61 | return tbl1.merge(tbl2, on="x")
62 |
63 |
64 | def main():
65 | with tempfile.TemporaryDirectory() as temp_dir:
66 | cfg = create_basic_pipedag_config(
67 | f"duckdb:///{temp_dir}/db.duckdb",
68 | disable_stage_locking=True, # This is special for duckdb
69 | # Attention: If uncommented, stage and task names might be sent to the
70 | # following URL. You can self-host kroki if you like:
71 | # https://docs.kroki.io/kroki/setup/install/
72 | # kroki_url="https://kroki.io",
73 | ).get("default")
74 | with cfg:
75 | with Flow() as f:
76 | with Stage("stage_1"):
77 | lazy_1 = lazy_task_1()
78 | a, b = eager_inputs()
79 |
80 | with Stage("stage_2"):
81 | lazy_2 = lazy_task_2(lazy_1, b)
82 | lazy_3 = lazy_task_3(lazy_2)
83 | eager = eager_task(lazy_1, b)
84 |
85 | with Stage("stage_3"):
86 | lazy_4 = lazy_task_4(lazy_2)
87 | _ = lazy_3, lazy_4, eager # unused terminal output tables
88 |
89 | # Run flow
90 | result = f.run()
91 | assert result.successful
92 |
93 | # Run in a different way for testing
94 | with StageLockContext():
95 | result = f.run()
96 | assert result.successful
97 | assert result.get(lazy_1, as_type=pd.DataFrame)["x"][0] == 1
98 |
99 |
100 | if __name__ == "__main__":
101 | setup_logging() # you can setup the logging and/or structlog libraries as you wish
102 | main()
103 |
--------------------------------------------------------------------------------
/example/simple_pipeline.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import sqlalchemy as sa
5 |
6 | from pydiverse.pipedag import Flow, Stage, materialize
7 |
8 |
9 | # Define the different tasks our flow consists of
10 | @materialize(version="1.0", nout=2)
11 | def input_tables():
12 | names = pd.DataFrame(
13 | {
14 | "id": [1, 2, 3],
15 | "name": ["Alice", "Bob", "Charlie"],
16 | }
17 | )
18 |
19 | ages = pd.DataFrame(
20 | {
21 | "id": [1, 2, 3],
22 | "age": [20, 40, 60],
23 | }
24 | )
25 |
26 | return names, ages
27 |
28 |
29 | @materialize(lazy=True, input_type=sa.Table)
30 | def join_tables(names, ages):
31 | return sa.select(names.c.id, names.c.name, ages.c.age).join_from(
32 | names, ages, names.c.id == ages.c.id
33 | )
34 |
35 |
36 | @materialize(input_type=pd.DataFrame)
37 | def print_dataframe(df):
38 | print(df)
39 |
40 |
41 | def main():
42 | # Define how the different tasks should be wired
43 | with Flow("flow") as flow:
44 | with Stage("inputs"):
45 | names, ages = input_tables()
46 |
47 | with Stage("features"):
48 | joined_table = join_tables(names, ages)
49 | print_dataframe(joined_table)
50 |
51 | # # In case you provide a pipedag.yaml, you can run the flow as simple as:
52 | # flow.run()
53 |
54 | # run flow with a duckdb configuration in a random temporary directory (this is
55 | # easier to get started)
56 | import tempfile
57 |
58 | from pydiverse.pipedag.core.config import create_basic_pipedag_config
59 |
60 | with tempfile.TemporaryDirectory() as temp_dir:
61 | cfg = create_basic_pipedag_config(
62 | f"duckdb:///{temp_dir}/db.duckdb",
63 | disable_stage_locking=True, # This is special for duckdb
64 | ).get("default")
65 | # Execute the flow
66 | flow.run(config=cfg)
67 |
68 |
69 | if __name__ == "__main__":
70 | from pydiverse.pipedag.util.structlog import setup_logging
71 |
72 | setup_logging() # you can setup the logging and/or structlog libraries as you wish
73 | main()
74 |
--------------------------------------------------------------------------------
/example/visualization.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import tempfile
4 |
5 | from pydiverse.pipedag import Flow, GroupNode, Stage, VisualizationStyle, materialize
6 | from pydiverse.pipedag.core.config import create_basic_pipedag_config
7 | from pydiverse.pipedag.util.structlog import setup_logging
8 |
9 |
10 | @materialize
11 | def any_task():
12 | return 1
13 |
14 |
15 | @materialize
16 | def task_within_group():
17 | return 2
18 |
19 |
20 | @materialize
21 | def task_within_group2(input1: int):
22 | return input1 + 1
23 |
24 |
25 | def main():
26 | with tempfile.TemporaryDirectory() as temp_dir:
27 | cfg = create_basic_pipedag_config(
28 | f"duckdb:///{temp_dir}/db.duckdb",
29 | disable_stage_locking=True, # This is special for duckdb
30 | # Attention: stage and task names might be sent to the
31 | # following URL. You can self-host kroki if you like:
32 | # https://docs.kroki.io/kroki/setup/install/
33 | kroki_url="https://kroki.io",
34 | ).get("default")
35 | with cfg:
36 | with Flow() as flow:
37 | with Stage("stage1"):
38 | _ = any_task()
39 | with GroupNode(
40 | "group1",
41 | ordering_barrier=True,
42 | style=VisualizationStyle(
43 | hide_content=True, box_color_always="#ccccff"
44 | ),
45 | ):
46 | task1 = task_within_group()
47 | _ = task_within_group2(task1)
48 | _ = any_task()
49 |
50 | # Run flow
51 | result = flow.run()
52 | assert result.successful
53 |
54 | # you can also visualize the flow explicitly:
55 | # kroki_url = result.visualize_url()
56 | # result.visualize()
57 |
58 |
59 | if __name__ == "__main__":
60 | setup_logging() # you can setup the logging and/or structlog libraries as you wish
61 | main()
62 |
--------------------------------------------------------------------------------
/example/visualization_legend.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import tempfile
4 |
5 | from pydiverse.pipedag import Flow, GroupNode, Stage, VisualizationStyle, materialize
6 | from pydiverse.pipedag.core.config import create_basic_pipedag_config
7 | from pydiverse.pipedag.util.structlog import setup_logging
8 |
9 |
10 | @materialize
11 | def failed():
12 | raise AssertionError("This task is supposed to fail")
13 |
14 |
15 | @materialize(version=None)
16 | def completed_but_cache_invalid():
17 | return 1
18 |
19 |
20 | @materialize(version="1.0")
21 | def cache_valid():
22 | return 2
23 |
24 |
25 | @materialize(version="1.0")
26 | def cache_valid2():
27 | return 3
28 |
29 |
30 | @materialize
31 | def skipped(out):
32 | return out + 1
33 |
34 |
35 | def main():
36 | with tempfile.TemporaryDirectory() as temp_dir:
37 | cfg = (
38 | create_basic_pipedag_config(
39 | f"duckdb:///{temp_dir}/db.duckdb",
40 | disable_stage_locking=True, # This is special for duckdb
41 | # Attention: stage and task names might be sent to the
42 | # following URL. You can self-host kroki if you like:
43 | # https://docs.kroki.io/kroki/setup/install/
44 | kroki_url="https://kroki.io",
45 | fail_fast=False,
46 | )
47 | .get("default")
48 | .evolve(swallow_exceptions=True)
49 | )
50 | with cfg:
51 | with Flow() as flow:
52 | with Stage("stage1"):
53 | _ = completed_but_cache_invalid()
54 | _ = cache_valid()
55 | with Stage("stage2"):
56 | out = failed()
57 | with Stage("stage3"):
58 | _ = skipped(out)
59 | with GroupNode(
60 | "group_none_cache_valid",
61 | style=VisualizationStyle(hide_content=True),
62 | ):
63 | _ = completed_but_cache_invalid()
64 | with GroupNode(
65 | "group_any_cache_valid",
66 | style=VisualizationStyle(hide_content=True),
67 | ):
68 | _ = completed_but_cache_invalid()
69 | _ = cache_valid()
70 | with GroupNode(
71 | "group_all_cache_valid",
72 | style=VisualizationStyle(hide_content=True),
73 | ):
74 | # avoid memoization (not counted as cache valid)
75 | _ = cache_valid2()
76 | with GroupNode(
77 | "group_any_failed", style=VisualizationStyle(hide_content=True)
78 | ):
79 | _ = completed_but_cache_invalid()
80 | out = failed()
81 | with GroupNode(
82 | "group_all_skipped", style=VisualizationStyle(hide_content=True)
83 | ):
84 | _ = skipped(out)
85 |
86 | # Run flow
87 | result = flow.run()
88 | assert not result.successful
89 |
90 | # Run flow again for cache validity
91 | result = flow.run()
92 | assert not result.successful
93 |
94 | # you can also visualize the flow explicitly:
95 | # kroki_url = result.visualize_url()
96 | # result.visualize()
97 |
98 |
99 | if __name__ == "__main__":
100 | setup_logging() # you can setup the logging and/or structlog libraries as you wish
101 | main()
102 |
--------------------------------------------------------------------------------
/example_imperative/failing_example.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import tempfile
4 |
5 | import sqlalchemy as sa
6 |
7 | from pydiverse.pipedag import Flow, Stage, Table, materialize
8 | from pydiverse.pipedag.core.config import create_basic_pipedag_config
9 | from pydiverse.pipedag.util.structlog import setup_logging
10 |
11 |
12 | @materialize(lazy=True)
13 | def lazy_task_1():
14 | return Table(sa.text("")).materialize()
15 |
16 |
17 | def main():
18 | with tempfile.TemporaryDirectory() as temp_dir:
19 | cfg = create_basic_pipedag_config(
20 | f"duckdb:///{temp_dir}/db.duckdb",
21 | disable_stage_locking=True, # This is special for duckdb
22 | # Attention: If uncommented, stage and task names might be sent to the
23 | # following URL. You can self-host kroki if you like:
24 | # https://docs.kroki.io/kroki/setup/install/
25 | # kroki_url="https://kroki.io",
26 | ).get("default")
27 | with cfg:
28 | with Flow() as f:
29 | with Stage("stage_1"):
30 | lazy_task_1()
31 |
32 | # Run flow
33 | f.run()
34 |
35 |
36 | if __name__ == "__main__":
37 | setup_logging() # you can setup the logging and/or structlog libraries as you wish
38 | main()
39 |
--------------------------------------------------------------------------------
/example_interactive/failing_flow_after_successful_debugging.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import tempfile
5 |
6 | import sqlalchemy as sa
7 | from sqlalchemy.exc import ProgrammingError
8 |
9 | from pydiverse.pipedag import Flow, Stage, Table, materialize
10 | from pydiverse.pipedag.core.config import create_basic_pipedag_config
11 | from pydiverse.pipedag.util.structlog import setup_logging
12 |
13 |
14 | @materialize(lazy=True)
15 | def lazy_task_1():
16 | try:
17 | tbl = Table(sa.text("SELECT-TYPO 1"), name="tbl").materialize()
18 | except ProgrammingError:
19 | # This error is expected
20 | logger = logging.getLogger(__name__ + "-lazy_task_1")
21 | logger.info("Caught expected error", exc_info=True)
22 |
23 | # now we succeed, but are still not done, yet
24 | tbl = Table(sa.text("SELECT 'not-done-yet' as a"), name="tbl").materialize()
25 |
26 | # this will create another two tables but they are not returned and won't switch to
27 | # debug mode
28 | Table(sa.text("SELECT 3 as a")).materialize()
29 | Table(sa.text("SELECT 4 as a"), name="tbl2").materialize()
30 |
31 | # now, we succeed with fixing `tbl` and automatically switch in debug mode
32 | tbl = Table(sa.text("SELECT 1 as a"), name="tbl").materialize()
33 |
34 | # we can also keep a table object:
35 | tbl_obj = Table(sa.text("SELECT 'not-done-yet' as a"))
36 | tbl_obj.materialize()
37 |
38 | # this will also automatically switch to debug mode
39 | tbl_obj.obj = sa.text("SELECT 1 as a")
40 | tbl_obj.materialize()
41 |
42 | # However, now the flow will stop because cache invalidation cannot deal with debug
43 | # mode
44 | return tbl
45 |
46 |
47 | def main():
48 | with tempfile.TemporaryDirectory() as temp_dir:
49 | cfg = create_basic_pipedag_config(
50 | f"duckdb:///{temp_dir}/db.duckdb",
51 | disable_stage_locking=True, # This is special for duckdb
52 | # Attention: If uncommented, stage and task names might be sent to the
53 | # following URL. You can self-host kroki if you like:
54 | # https://docs.kroki.io/kroki/setup/install/
55 | # kroki_url="https://kroki.io",
56 | ).get("default")
57 | with cfg:
58 | with Flow() as f:
59 | with Stage("stage_1"):
60 | lazy_task_1()
61 |
62 | # Run flow
63 | f.run()
64 |
65 |
66 | if __name__ == "__main__":
67 | setup_logging() # you can setup the logging and/or structlog libraries as you wish
68 | main()
69 |
--------------------------------------------------------------------------------
/example_postgres/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.9"
2 | services:
3 | postgres:
4 | image: postgres
5 | environment:
6 | POSTGRES_USER: sa
7 | POSTGRES_PASSWORD: Pydiverse23
8 | ports:
9 | - "6543:5432"
10 |
--------------------------------------------------------------------------------
/example_postgres/pipedag.yaml:
--------------------------------------------------------------------------------
1 | instances:
2 | __any__:
3 | network_interface: "127.0.0.1"
4 | auto_table:
5 | - "pandas.DataFrame"
6 | - "sqlalchemy.sql.expression.TextClause"
7 | - "sqlalchemy.sql.expression.Selectable"
8 |
9 | fail_fast: true
10 | instance_id: pipedag_default
11 |
12 | # Attention: For disable_kroki: false, stage and task names might be sent to the kroki_url.
13 | # You can self-host kroki if you like:
14 | # https://docs.kroki.io/kroki/setup/install/
15 | disable_kroki: true
16 | kroki_url: "https://kroki.io"
17 |
18 | table_store:
19 | class: "pydiverse.pipedag.backend.table.SQLTableStore"
20 | args:
21 | url: "postgresql://sa:Pydiverse23@127.0.0.1:6543/{instance_id}"
22 | create_database_if_not_exists: True
23 |
24 | print_materialize: true
25 | print_sql: true
26 |
27 | local_table_cache:
28 | store_input: true
29 | store_output: true
30 | use_stored_input_as_cache: true
31 | class: "pydiverse.pipedag.backend.table.cache.ParquetTableCache"
32 | args:
33 | base_path: "/tmp/pipedag/table_cache"
34 |
35 | blob_store:
36 | class: "pydiverse.pipedag.backend.blob.FileBlobStore"
37 | args:
38 | base_path: "/tmp/pipedag/blobs"
39 |
40 | lock_manager:
41 | class: "pydiverse.pipedag.backend.lock.DatabaseLockManager"
42 |
43 | orchestration:
44 | class: "pydiverse.pipedag.engine.SequentialEngine"
45 |
--------------------------------------------------------------------------------
/example_postgres/run_pipeline.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import sqlalchemy as sa
5 |
6 | from pydiverse.pipedag import Flow, Stage, Table, materialize
7 | from pydiverse.pipedag.context import StageLockContext
8 | from pydiverse.pipedag.util.structlog import setup_logging
9 |
10 |
11 | @materialize(lazy=True)
12 | def lazy_task_1():
13 | return sa.select(
14 | sa.literal(1).label("x"),
15 | sa.literal(2).label("y"),
16 | )
17 |
18 |
19 | @materialize(lazy=True, input_type=sa.Table)
20 | def lazy_task_2(input1: sa.sql.expression.Alias, input2: sa.sql.expression.Alias):
21 | query = sa.select(
22 | (input1.c.x * 5).label("x5"),
23 | input2.c.a,
24 | ).select_from(input1.outerjoin(input2, input2.c.x == input1.c.x))
25 |
26 | return Table(query, name="task_2_out", primary_key=["a"])
27 |
28 |
29 | @materialize(lazy=True, input_type=sa.Table)
30 | def lazy_task_3(input1: sa.sql.expression.Alias):
31 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}")
32 |
33 |
34 | @materialize(lazy=True, input_type=sa.Table)
35 | def lazy_task_4(input1: sa.sql.expression.Alias):
36 | return sa.text(f"SELECT * FROM {input1.original.schema}.{input1.original.name}")
37 |
38 |
39 | @materialize(nout=2, version="1.0.0")
40 | def eager_inputs():
41 | dfA = pd.DataFrame(
42 | {
43 | "a": [0, 1, 2, 4],
44 | "b": [9, 8, 7, 6],
45 | }
46 | )
47 | dfB = pd.DataFrame(
48 | {
49 | "a": [2, 1, 0, 1],
50 | "x": [1, 1, 2, 2],
51 | }
52 | )
53 | return Table(dfA, "dfA"), Table(dfB, "dfB_%%")
54 |
55 |
56 | @materialize(version="1.0.0", input_type=pd.DataFrame)
57 | def eager_task(tbl1: pd.DataFrame, tbl2: pd.DataFrame):
58 | return tbl1.merge(tbl2, on="x")
59 |
60 |
61 | def main():
62 | with Flow() as f:
63 | with Stage("stage_1"):
64 | lazy_1 = lazy_task_1()
65 | a, b = eager_inputs()
66 |
67 | with Stage("stage_2"):
68 | lazy_2 = lazy_task_2(lazy_1, b)
69 | lazy_3 = lazy_task_3(lazy_2)
70 | eager = eager_task(lazy_1, b)
71 |
72 | with Stage("stage_3"):
73 | lazy_4 = lazy_task_4(lazy_2)
74 | _ = lazy_3, lazy_4, eager # unused terminal output tables
75 |
76 | # Run flow
77 | result = f.run()
78 | assert result.successful
79 |
80 | # Run in a different way for testing
81 | with StageLockContext():
82 | result = f.run()
83 | assert result.successful
84 | assert result.get(lazy_1, as_type=pd.DataFrame)["x"][0] == 1
85 |
86 |
87 | if __name__ == "__main__":
88 | setup_logging() # you can set up the logging and/or structlog libraries as you wish
89 | main()
90 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pydiverse-pipedag"
3 | version = "0.9.10"
4 | description = "A pipeline orchestration library executing tasks within one Python session. It takes care of SQL table (de)materialization, caching, and cache invalidation. Blob storage is supported as well, for example for storing model files."
5 | authors = [
6 | { name = "QuantCo, Inc." },
7 | { name = "Nicolas Camenisch", email = "garnele007@gmail.com" },
8 | { name = "Martin Trautmann", email = "windiana@users.sf.net" },
9 | ]
10 | license = { file = "LICENSE" }
11 | readme = "docs/package/README.md"
12 | requires-python = ">=3.9"
13 |
14 | classifiers = [
15 | "Development Status :: 3 - Alpha",
16 | "Intended Audience :: Developers",
17 | "Intended Audience :: Science/Research",
18 | "Programming Language :: SQL",
19 | "Topic :: Database",
20 | ]
21 |
22 | dependencies = [
23 | "pandas>=1.4.3",
24 | "SQLAlchemy>=1.4.39",
25 | "typing-extensions>=4.1.0",
26 | "networkx>=2.8",
27 | "attrs>=22.1.0",
28 | "structlog>=22.1.0",
29 | "pynng>=0.7.1",
30 | "msgpack>=1.0.4",
31 | "packaging>=21.3",
32 | "python-box>=6.1.0",
33 | "PyYAML>=6.0",
34 | "pyarrow>=11.0.0",
35 | "cryptography>=41.0.1",
36 | "pydot>=1.4.2",
37 | "click>=8.1.3",
38 | "pyparsing>=3.0",
39 | ]
40 |
41 | [tool.hatch.build.targets.wheel]
42 | packages = ["src/pydiverse"]
43 |
44 | [project.scripts]
45 | pipedag-manage = "pydiverse.pipedag.management.cli:cli"
46 |
47 | [tool.ruff]
48 | select = ["F", "E", "UP", "W", "I001", "I002", "B", "A"]
49 | ignore = ["B028"]
50 | extend-exclude = ["docs/*"]
51 | ignore-init-module-imports = true
52 | fix = true
53 | target-version = "py38"
54 |
55 | [tool.ruff.per-file-ignores]
56 | "__init__.py" = ["F401", "F403"]
57 | "src/pydiverse/pipedag/backend/table/sql/ddl.py" = ["F811"]
58 | "tests/*" = ["F403", "F405"]
59 |
60 | [tool.ruff.isort]
61 | known-first-party = ["pydiverse"]
62 | required-imports = ["from __future__ import annotations"]
63 |
64 | [build-system]
65 | requires = ["hatchling"]
66 | build-backend = "hatchling.build"
67 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | pythonpath = src
3 | testpaths = tests
4 |
5 | markers =
6 | postgres: a test that requires postgres [SQLTableStore]
7 | mssql: a test that requires mssql [SQLTableStore]
8 | ibm_db2: a test that requires ibm_db2 [SQLTableStore]
9 | duckdb: a test that requires duckdb [SQLTableStore]
10 | snowflake: a test that requires snowflake [SQLTableStore]
11 |
12 | pdtransform: a test that requires pydiverse-transform [TableHook]
13 | ibis: a test that requires ibis [TableHook]
14 | polars: a test that requires polars/tidypolars [TableHook]
15 |
16 | dask: a test that requires dask [DaskEngine]
17 | prefect: a test that requires prefect [PrefectEngine]
18 |
19 | instances: marker used to run a test with different instances
20 | skip_instances: marker used to skip running a test for a list of instances
21 |
22 | parallelize: parallelize this test
23 |
24 | slow1: fastest of the slow tests (the slow markers simulate different degrees of slowness)
25 | slow2: slower tests
26 | slow3: even slower tests
27 | slow4: even much slower tests
28 | slow5: slowest tests
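29 |
30 | # Illustrative usage (not part of the config itself): markers can be combined
31 | # on the command line, e.g. to run only the postgres tests and skip the slowest:
32 | #   pytest -m "postgres and not slow5"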
--------------------------------------------------------------------------------
/src/pydiverse/.gitignore:
--------------------------------------------------------------------------------
1 | /__init__.py
2 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .container import (
4 | Blob,
5 | ExternalTableReference,
6 | RawSql,
7 | Schema,
8 | Table,
9 | )
10 | from .context import ConfigContext, StageLockContext
11 | from .core import (
12 | Flow,
13 | GroupNode,
14 | PipedagConfig,
15 | Result,
16 | Stage,
17 | Task,
18 | VisualizationStyle,
19 | )
20 | from .materialize import (
21 | input_stage_versions,
22 | materialize,
23 | )
24 | from .materialize.core import AUTO_VERSION
25 |
26 | __all__ = [
27 | "Flow",
28 | "Stage",
29 | "materialize",
30 | "input_stage_versions",
31 | "AUTO_VERSION",
32 | "Table",
33 | "RawSql",
34 | "Blob",
35 | "GroupNode",
36 | "VisualizationStyle",
37 | "Schema",
38 | "Result",
39 | "PipedagConfig",
40 | "ConfigContext",
41 | "StageLockContext",
42 | ]
43 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/_typing.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING, Callable, TypeVar, Union
4 |
5 | if TYPE_CHECKING:
6 | from pydiverse.pipedag import Blob, Table
7 | from pydiverse.pipedag.backend.table.base import BaseTableStore, TableHookResolver
8 |
9 |
10 | def decorator_hint(decorator: Callable) -> Callable:
11 | # Used to fix incorrect type hints in pycharm
12 | return decorator
13 |
14 |
15 | T = TypeVar("T")
16 | CallableT = TypeVar("CallableT", bound=Callable)
17 | StoreT = TypeVar("StoreT", bound="BaseTableStore")
18 | TableHookResolverT = TypeVar("TableHookResolverT", bound="TableHookResolver")
19 |
20 | # Materializable
21 | MPrimitives = Union[int, float, bool, str]
22 | MTypes = Union["Table", "Blob"]
23 |
24 | BaseMaterializable = Union[MPrimitives, MTypes]
25 | Materializable = Union[
26 | BaseMaterializable,
27 | dict[str, "Materializable"],
28 | list["Materializable"],
29 | tuple["Materializable", ...],
30 | ]
31 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .blob import *
4 | from .lock import *
5 | from .table import *
6 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/lock/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .base import BaseLockManager, LockState
4 | from .database import DatabaseLockManager
5 | from .filelock import FileLockManager
6 | from .nolock import NoLockManager
7 | from .zookeeper import ZooKeeperLockManager
8 |
9 | __all__ = [
10 | "BaseLockManager",
11 | "LockState",
12 | "NoLockManager",
13 | "FileLockManager",
14 | "ZooKeeperLockManager",
15 | "DatabaseLockManager",
16 | ]
17 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/lock/filelock.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import warnings
5 | from pathlib import Path
6 | from typing import Any
7 |
8 | from pydiverse.pipedag import ConfigContext, Stage
9 | from pydiverse.pipedag.backend.lock.base import BaseLockManager, Lockable, LockState
10 | from pydiverse.pipedag.errors import LockError
11 | from pydiverse.pipedag.util import normalize_name, requires
12 |
13 | try:
14 | import filelock as fl
15 | except ImportError as e:
16 | warnings.warn(str(e), ImportWarning)
17 | fl = None
18 |
19 |
20 | @requires(fl, ImportError("FileLockManager requires 'filelock' to be installed."))
21 | class FileLockManager(BaseLockManager):
22 | """Lock manager that uses lock files
23 |
24 | For details on how exactly the file locking is implemented, check out the
25 | `filelock documentation`_.
26 |
27 | :param base_path:
28 | A path to a folder where the lock files should get stored.
29 | To differentiate between different instances, the ``instance_id`` will
30 | automatically be appended to the provided path.
31 |
32 | .. _filelock documentation: https://py-filelock.readthedocs.io/en/latest/index.html
33 | """
34 |
35 | @classmethod
36 | def _init_conf_(cls, config: dict[str, Any]):
37 | instance_id = normalize_name(ConfigContext.get().instance_id)
38 | base_path = Path(config["base_path"]) / instance_id
39 | return cls(base_path)
40 |
41 | def __init__(self, base_path: str | Path):
42 | super().__init__()
43 | self.base_path = Path(base_path).absolute()
44 | self.locks: dict[Lockable, fl.BaseFileLock] = {}
45 |
46 | os.makedirs(self.base_path, exist_ok=True)
47 |
48 | @property
49 | def supports_stage_level_locking(self):
50 | return True
51 |
52 | def acquire(self, lockable: Lockable):
53 | if lockable not in self.locks:
54 | lock_path = self.lock_path(lockable)
55 | self.locks[lockable] = fl.FileLock(lock_path)
56 |
57 | lock = self.locks[lockable]
58 | if not lock.is_locked:
59 | self.logger.info(f"Locking '{lockable}'")
60 | lock.acquire()
61 | self.set_lock_state(lockable, LockState.LOCKED)
62 |
63 | def release(self, lockable: Lockable):
64 | if lockable not in self.locks:
65 | raise LockError(f"No lock '{lockable}' found.")
66 |
67 | lock = self.locks[lockable]
68 | lock.release()
69 | if not lock.is_locked:
70 | self.logger.info(f"Unlocking '{lockable}'")
71 | del self.locks[lockable]
72 | self.set_lock_state(lockable, LockState.UNLOCKED)
73 |
74 | def lock_path(self, lock: Lockable) -> Path:
75 | if isinstance(lock, Stage):
76 | return self.base_path / (lock.name + ".lock")
77 | elif isinstance(lock, str):
78 | return self.base_path / (lock + ".lock")
79 | else:
80 | raise NotImplementedError(
81 | f"Can't lock object of type '{type(lock).__name__}'"
82 | )
83 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/lock/nolock.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pydiverse.pipedag.backend.lock.base import BaseLockManager, Lockable, LockState
4 |
5 |
6 | class NoLockManager(BaseLockManager):
7 | """
8 | This lock manager doesn't do any locking and only serves as a placeholder
9 | for an actual lock manager when testing something locally.
10 |
11 | .. Warning::
12 | This lock manager is not intended for use in a production environment.
13 | Using a lock manager is essential for preventing data corruption.
14 | """
15 |
16 | @property
17 | def supports_stage_level_locking(self):
18 | return True
19 |
20 | def acquire(self, lockable: Lockable):
21 | self.set_lock_state(lockable, LockState.LOCKED)
22 |
23 | def release(self, lockable: Lockable):
24 | self.set_lock_state(lockable, LockState.UNLOCKED)
25 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/table/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from . import cache
4 | from .base import BaseTableStore
5 | from .dict import DictTableStore
6 | from .sql import SQLTableStore
7 |
8 | __all__ = [
9 | "BaseTableStore",
10 | "DictTableStore",
11 | "SQLTableStore",
12 | ]
13 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/table/cache/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .base import BaseTableCache
4 | from .parquet import ParquetTableCache
5 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/table/cache/base.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC, abstractmethod
4 |
5 | import structlog
6 |
7 | from pydiverse.pipedag import Stage
8 | from pydiverse.pipedag._typing import T
9 | from pydiverse.pipedag.backend.table.base import TableHookResolver
10 | from pydiverse.pipedag.container import Table
11 | from pydiverse.pipedag.context import RunContext
12 | from pydiverse.pipedag.materialize.core import MaterializingTask
13 | from pydiverse.pipedag.util import Disposable
14 |
15 |
16 | class BaseTableCache(ABC, TableHookResolver, Disposable):
17 | def __init__(
18 | self,
19 | store_input: bool = True,
20 | store_output: bool = False,
21 | use_stored_input_as_cache: bool = True,
22 | ):
23 | super().__init__()
24 |
25 | self.logger = structlog.get_logger(logger_name=type(self).__name__)
26 |
27 | self.should_store_input = store_input
28 | self.should_store_output = store_output
29 | self.should_use_stored_input_as_cache = use_stored_input_as_cache
30 |
31 | def setup(self):
32 | """Setup function
33 |
34 | This function gets called at the beginning of a flow run.
35 | In contrast to the __init__ method, the setup method only gets called
36 | after a lock has been acquired, which prevents race conditions.
37 | """
38 |
39 | def init_stage(self, stage: Stage):
40 | """Initialize a stage
41 |
42 | Gets called before any table is attempted to be stored in the stage.
43 | """
44 |
45 | @abstractmethod
46 | def clear_cache(self, stage: Stage):
47 | """Delete the cache for a specific stage"""
48 |
49 | def store_table(self, table: Table, task: MaterializingTask):
50 | if self.should_store_output:
51 | return self._store_table(table, task)
52 |
53 | def store_input(self, table: Table, task: MaterializingTask):
54 | if self.should_store_input:
55 | return self._store_table(table, task)
56 |
57 | def _store_table(self, table: Table, task: MaterializingTask | None) -> bool:
58 | """
59 | :return: bool flag indicating if storing was successful
60 | """
61 | try:
62 | hook = self.get_m_table_hook(type(table.obj))
63 | except TypeError:
64 | return False
65 |
66 | if not RunContext.get().should_store_table_in_cache(table):
67 | # Prevent multiple tasks writing at the same time
68 | return False
69 |
70 | try:
71 | hook.materialize(self, table, table.stage.transaction_name)
72 | except TypeError:
73 | return False
74 | return True
75 |
76 | def retrieve_table_obj(
77 | self,
78 | table: Table,
79 | as_type: type[T],
80 | for_auto_versioning: bool = False,
81 | ) -> T:
82 | assert not for_auto_versioning
83 |
84 | if not self.should_use_stored_input_as_cache:
85 | return None
86 | if not self._has_table(table, as_type):
87 | return None
88 | return self._retrieve_table_obj(table, as_type)
89 |
90 | def _retrieve_table_obj(self, table: Table, as_type: type[T]) -> T:
91 | try:
92 | hook = self.get_r_table_hook(as_type)
93 | obj = hook.retrieve(self, table, table.stage.name, as_type)
94 | self.logger.info("Retrieved table from local table cache", table=table)
95 | return obj
96 | except Exception as e:
97 | self.logger.warning(
98 | "Failed to retrieve table from local table cache",
99 | table=table,
100 | cause=str(e),
101 | )
102 | return None
103 |
104 | @abstractmethod
105 | def _has_table(self, table: Table, as_type: type) -> bool:
106 | """Check if the given table is in the cache"""
107 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/table/sql/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .sql import SQLTableStore
4 |
5 | __all__ = [
6 | "SQLTableStore",
7 | ]
8 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/table/sql/dialects/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .duckdb import DuckDBTableStore
4 | from .ibm_db2 import IBMDB2TableStore
5 | from .mssql import MSSqlTableStore
6 | from .postgres import PostgresTableStore
7 | from .snowflake import SnowflakeTableStore
8 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/table/sql/dialects/snowflake.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import time
4 | import warnings
5 | from typing import Literal
6 |
7 | from pydiverse.pipedag.backend.table.sql.hooks import (
8 | IbisTableHook,
9 | )
10 | from pydiverse.pipedag.backend.table.sql.sql import SQLTableStore
11 |
12 | try:
13 | import snowflake
14 | except ImportError as e:
15 | warnings.warn(str(e), ImportWarning)
16 | snowflake = None
17 |
18 |
19 | class SnowflakeTableStore(SQLTableStore):
20 | """
21 | SQLTableStore that supports `Snowflake`_.
22 |
23 | Takes the same arguments as
24 | :py:class:`SQLTableStore <pydiverse.pipedag.backend.table.SQLTableStore>`
25 | """
26 |
27 | _dialect_name = "snowflake"
28 |
29 | def _default_isolation_level(self) -> str | None:
30 | return None # "READ UNCOMMITTED" does not exist in Snowflake
31 |
32 | def optional_pause_for_db_transactionality(
33 | self,
34 | prev_action: Literal[
35 | "table_drop",
36 | "table_create",
37 | "schema_drop",
38 | "schema_create",
39 | "schema_rename",
40 | ],
41 | ):
42 | _ = prev_action
43 | # The snowflake backend has transactionality problems with very quick
44 | # DROP/CREATE or RENAME activities for both schemas and tables
45 | # which happen in testing.
46 | time.sleep(2)
47 |
48 | def _init_database(self):
49 | create_database = self.engine_url.database.split("/")[0]
50 | with self.engine.connect() as conn:
51 | if not [
52 | x.name
53 | for x in conn.exec_driver_sql("SHOW DATABASES").mappings().all()
54 | if x.name.upper() == create_database.upper()
55 | ]:
56 | self._init_database_with_database(
57 | "snowflake",
58 | disable_exists_check=True,
59 | create_database=create_database,
60 | )
61 |
62 |
63 | try:
64 | import ibis
65 | except ImportError:
66 | ibis = None
67 |
68 |
69 | @SnowflakeTableStore.register_table(ibis)
70 | class IbisTableHook(IbisTableHook):
71 | @classmethod
72 | def _conn(cls, store: SnowflakeTableStore):
73 | return ibis.snowflake._from_url(store.engine_url)
74 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/backend/table/util/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .dtype import DType, PandasDTypeBackend
4 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/context/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pydiverse.pipedag.context.context import (
4 | ConfigContext,
5 | DAGContext,
6 | StageLockContext,
7 | TaskContext,
8 | )
9 | from pydiverse.pipedag.context.run_context import (
10 | FinalTaskState,
11 | RunContext,
12 | RunContextServer,
13 | )
14 |
15 | __all__ = [
16 | "DAGContext",
17 | "TaskContext",
18 | "ConfigContext",
19 | "RunContext",
20 | "RunContextServer",
21 | "StageLockContext",
22 | "FinalTaskState",
23 | ]
24 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/core/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .config import PipedagConfig
4 | from .flow import Flow, Subflow
5 | from .group_node import GroupNode, VisualizationStyle
6 | from .result import Result
7 | from .stage import Stage
8 | from .task import Task, UnboundTask
9 |
10 | __all__ = [
11 | "Flow",
12 | "Subflow",
13 | "PipedagConfig",
14 | "Result",
15 | "Stage",
16 | "GroupNode",
17 | "VisualizationStyle",
18 | "UnboundTask",
19 | "Task",
20 | ]
21 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/debug/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pydiverse.pipedag.materialize.debug import materialize_table
4 |
5 | __all__ = ["materialize_table"]
6 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/engine/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .base import OrchestrationEngine
4 | from .dask import DaskEngine
5 |
6 | # don't import prefect engines by default because importing prefect messes with
7 | # the initialization of the logging library
8 | # from .prefect import PrefectEngine, PrefectOneEngine, PrefectTwoEngine
9 | from .sequential import SequentialEngine
10 |
11 | __all__ = [
12 | "OrchestrationEngine",
13 | # "PrefectEngine",
14 | # "PrefectOneEngine",
15 | # "PrefectTwoEngine",
16 | "SequentialEngine",
17 | "DaskEngine",
18 | ]
19 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/engine/base.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC, abstractmethod
4 | from typing import TYPE_CHECKING
5 |
6 | from pydiverse.pipedag import ExternalTableReference, Task
7 | from pydiverse.pipedag.util import Disposable
8 |
9 | if TYPE_CHECKING:
10 | from pydiverse.pipedag.core import Result, Subflow
11 |
12 |
13 | class OrchestrationEngine(Disposable, ABC):
14 | """Flow orchestration engine base class"""
15 |
16 | @abstractmethod
17 | def run(
18 | self,
19 | flow: Subflow,
20 | ignore_position_hashes: bool = False,
21 | inputs: dict[Task, ExternalTableReference] | None = None,
22 | **kwargs,
23 | ) -> Result:
24 | """Execute a flow
25 |
26 | :param flow: the pipedag flow to execute
27 | :param ignore_position_hashes:
28 | If ``True``, the position hashes of tasks are not checked
29 | when retrieving the inputs of a task from the cache.
30 | This simplifies the execution of subgraphs if you don't care whether
31 | inputs to that subgraph are cache invalid. It allows multiple
32 | modifications of the graph before the next run updates the cache.
33 | Attention: This may break automatic cache invalidation.
34 | Moreover, for this to work, any task producing an input
35 | for the chosen subgraph may never be used more
36 | than once per stage.
37 | :param kwargs: Optional keyword arguments. How they get used is
38 | engine specific.
39 | :return: A result instance wrapping the flow execution result.
40 | """
41 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/engine/sequential.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING
4 |
5 | from pydiverse.pipedag import ExternalTableReference, Table, Task
6 | from pydiverse.pipedag.context import ConfigContext, RunContext
7 | from pydiverse.pipedag.core.result import Result
8 | from pydiverse.pipedag.engine.base import (
9 | OrchestrationEngine,
10 | )
11 |
12 | if TYPE_CHECKING:
13 | from pydiverse.pipedag.core import Subflow
14 |
15 |
16 | class SequentialEngine(OrchestrationEngine):
17 | """Most basic orchestration engine that just executes all tasks sequentially."""
18 |
19 | def run(
20 | self,
21 | flow: Subflow,
22 | ignore_position_hashes: bool = False,
23 | inputs: dict[Task, ExternalTableReference] | None = None,
24 | **run_kwargs,
25 | ):
26 | run_context = RunContext.get()
27 | config_context = ConfigContext.get()
28 |
29 | failed_tasks = set() # type: set[Task]
30 | results = {}
31 | exception = None
32 | inputs = inputs if inputs is not None else {}
33 |
34 | try:
35 | for task in flow.get_tasks():
36 | try:
37 | if not (set(task.input_tasks) & failed_tasks):
38 | task_inputs = {
39 | **{
40 | in_id: results[in_t]
41 | for in_id, in_t in task.input_tasks.items()
42 | if in_t in results and in_t not in inputs
43 | },
44 | **{
45 | in_id: Table(inputs[in_t])
46 | for in_id, in_t in task.input_tasks.items()
47 | if in_t in inputs
48 | },
49 | }
50 |
51 | results[task] = task.run(
52 | inputs=task_inputs,
53 | run_context=run_context,
54 | config_context=config_context,
55 | ignore_position_hashes=ignore_position_hashes,
56 | )
57 | else:
58 | failed_tasks.add(task)
59 | except Exception as e:
60 | if config_context.fail_fast:
61 | raise e
62 | if config_context._swallow_exceptions:
63 | exception = e
64 | failed_tasks.add(task)
65 | else:
66 | raise e
67 |
68 | except Exception as e:
69 | if config_context.fail_fast:
70 | raise e
71 | exception = e
72 |
73 | return Result.init_from(
74 | subflow=flow,
75 | underlying=results,
76 | successful=(exception is None),
77 | task_values=results,
78 | exception=exception,
79 | )
80 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/errors/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | class FlowError(Exception):
5 | """
6 | Exception raised when there is an issue with the flow definition.
7 | """
8 |
9 |
10 | class StageError(Exception):
11 | """
12 | Exception raised when something is wrong with the stage.
13 | """
14 |
15 |
16 | class GroupNodeError(Exception):
17 | """
18 | Exception raised when something is wrong with the group node.
19 | """
20 |
21 |
22 | class CacheError(Exception):
23 | """
24 | Exception raised if something couldn't be retrieved from the cache.
25 | """
26 |
27 |
28 | class LockError(Exception):
29 | """
30 | Exception raised if something goes wrong while locking, for example if
31 | a lock expires before it has been released.
32 | """
33 |
34 |
35 | class DuplicateNameError(ValueError):
36 | """
37 | Exception raised if an object that is supposed to have a unique name doesn't.
38 | """
39 |
40 |
41 | class IPCError(Exception):
42 | """
43 | Exception raised when inter process communication fails.
44 | """
45 |
46 |
47 | class RemoteProcessError(IPCError):
48 | """
49 | Exception raised if an exception occurred in the remote IPC process.
50 | """
51 |
52 |
53 | class DisposedError(Exception):
54 | """
55 | Exception raised when an object has been disposed, but some attributes are
56 | being accessed nevertheless.
57 | """
58 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/src/pydiverse/pipedag/management/__init__.py
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/management/cli.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import importlib
4 | import pkgutil
5 | from pathlib import Path
6 |
7 | import click
8 |
9 |
10 | @click.group()
11 | def cli():
12 | pass
13 |
14 |
15 | def find_commands():
16 | commands_dir = Path(__file__).parent / "commands"
17 | return [
18 | name
19 | for _, name, ispkg in pkgutil.iter_modules([str(commands_dir)])
20 | if not ispkg and not name.startswith("_")
21 | ]
22 |
23 |
24 | def load_command(command: str):
25 | importlib.import_module(f"pydiverse.pipedag.management.commands.{command}")
26 |
27 |
28 | def dynamically_load_commands():
29 | for command in find_commands():
30 | load_command(command)
31 |
32 |
33 | dynamically_load_commands()
34 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/src/pydiverse/pipedag/management/commands/__init__.py
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/management/commands/clear_metadata.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import click
4 |
5 | from pydiverse.pipedag import PipedagConfig
6 | from pydiverse.pipedag.backend.table import SQLTableStore
7 | from pydiverse.pipedag.backend.table.sql.ddl import DropSchema
8 | from pydiverse.pipedag.management.cli import cli
9 |
10 |
11 | @cli.command()
12 | @click.option(
13 | "--config",
14 | "config_path",
15 | type=str,
16 | help="path of the pipedag config file to use",
17 | )
18 | @click.option(
19 | "--instance",
20 | required=True,
21 | type=str,
22 | prompt=True,
23 | help="name of the instance to load from the config file",
24 | )
25 | @click.option(
26 | "--flow",
27 | type=str,
28 | help="name of the flow to load from the config file",
29 | )
30 | @click.option(
31 | "--per-user",
32 | is_flag=True,
33 | default=False,
34 | )
35 | @click.confirmation_option(
36 | prompt=(
37 | "Are you sure that you want to clear all metadata? "
38 | "This action can't be undone."
39 | )
40 | )
41 | def clear_metadata(
42 | config_path: str | None,
43 | instance: str,
44 | flow: str | None,
45 | per_user: bool,
46 | ):
47 | """Clears all pipedag metadata."""
48 |
49 | if config_path:
50 | pipedag_config = PipedagConfig(path=config_path)
51 | else:
52 | pipedag_config = PipedagConfig.default
53 |
54 | config = pipedag_config.get(
55 | instance=instance,
56 | flow=flow,
57 | per_user=per_user,
58 | )
59 |
60 | with config:
61 | table_store: SQLTableStore = config.store.table_store
62 |
63 | assert isinstance(
64 | table_store, SQLTableStore
65 | ), "clear-metadata only supported for SQLTableStore"
66 |
67 | drop_schema = DropSchema(
68 | table_store.metadata_schema,
69 | if_exists=True,
70 | cascade=True,
71 | engine=table_store.engine,
72 | )
73 |
74 | table_store.execute(drop_schema)
75 |
76 | click.echo("Did clear all metadata.")
77 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/management/commands/delete_schemas.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import click
4 | import sqlalchemy as sa
5 |
6 | from pydiverse.pipedag import PipedagConfig
7 | from pydiverse.pipedag.backend.table import SQLTableStore
8 | from pydiverse.pipedag.backend.table.sql.ddl import DropSchema
9 | from pydiverse.pipedag.container import Schema
10 | from pydiverse.pipedag.management.cli import cli
11 |
12 |
13 | @cli.command()
14 | @click.option(
15 | "--config",
16 | "config_path",
17 | type=str,
18 | help="path of the pipedag config file to use",
19 | )
20 | @click.option(
21 | "--instance",
22 | required=True,
23 | type=str,
24 | prompt=True,
25 | help="name of the instance to load from the config file",
26 | )
27 | @click.option(
28 | "--flow",
29 | type=str,
30 | help="name of the flow to load from the config file",
31 | )
32 | @click.option(
33 | "--per-user",
34 | is_flag=True,
35 | default=False,
36 | )
37 | @click.option(
38 | "--yes",
39 | is_flag=True,
40 | help="Confirm the action without prompting.",
41 | )
42 | def delete_schemas(
43 | config_path: str | None,
44 | instance: str,
45 | flow: str | None,
46 | per_user: bool,
47 | yes: bool,
48 | ):
49 | """
50 | Delete all schemas associated with an instance.
51 |
52 | Only works with SQLTableStore.
53 | """
54 |
55 | if config_path:
56 | pipedag_config = PipedagConfig(path=config_path)
57 | else:
58 | pipedag_config = PipedagConfig.default
59 |
60 | config = pipedag_config.get(
61 | instance=instance,
62 | flow=flow,
63 | per_user=per_user,
64 | )
65 |
66 | with config:
67 | table_store: SQLTableStore = config.store.table_store
68 |
69 | assert isinstance(
70 | table_store, SQLTableStore
71 | ), "delete-schemas only supported for SQLTableStore"
72 |
73 | prefix = table_store.schema_prefix
74 | suffix = table_store.schema_suffix
75 |
76 | inspector = sa.inspect(table_store.engine)
77 | schema_names = inspector.get_schema_names()
78 | schema_names = [
79 | schema
80 | for schema in schema_names
81 | if schema.startswith(prefix) and schema.endswith(suffix)
82 | ]
83 |
84 | if len(schema_names) == 0:
85 | click.echo("No matching schemas found. Aborting.")
86 | exit()
87 |
88 | database = table_store.engine_url.database
89 | click.echo(f"Found the following schemas (in database '{database}'):")
90 | for schema in schema_names:
91 | click.echo(f"- {schema}")
92 |
93 | if not yes:
94 | click.confirm(
95 | "Are you sure you want to continue? "
96 | "This will delete all the schemas listed above. "
97 | "This action can't be undone.",
98 | abort=True,
99 | )
100 |
101 | schemas = [Schema(name, "", "") for name in schema_names]
102 | for schema in schemas:
103 | drop_schema = DropSchema(schema, cascade=True, engine=table_store.engine)
104 | table_store.execute(drop_schema)
105 |
106 | click.echo("Did delete all schemas.")
107 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/materialize/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .core import input_stage_versions, materialize
4 |
5 | __all__ = [
6 | "materialize",
7 | "input_stage_versions",
8 | ]
9 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/materialize/cache.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import itertools
4 | from dataclasses import dataclass
5 | from functools import cached_property
6 | from typing import TYPE_CHECKING
7 |
8 | from pydiverse.pipedag.util.hashing import stable_hash
9 |
10 | if TYPE_CHECKING:
11 | from pydiverse.pipedag import Table
12 | from pydiverse.pipedag.materialize.core import MaterializingTask
13 |
14 |
15 | class ImperativeMaterializationState:
16 | def __init__(self):
17 | # every imperatively materialized table is an assumed dependency of
18 | # subsequently materialized tables of the same task
19 | self.assumed_dependencies: set[Table] = set()
20 | # Table(...).materialize() returns dematerialized objects. We need to find the
21 | # corresponding Table objects for handing returned objects over to consumer
22 | # tasks.
23 | self.object_lookup: dict[int, Table] = {}
24 | self.table_ids: set[int] = set()
25 | self.auto_suffix_counter = itertools.count()
26 |
27 | def add_table_lookup(self, obj, table: Table):
28 | self.assumed_dependencies.add(table)
29 | self.object_lookup[id(obj)] = table
30 | self.table_ids.add(id(table))
31 |
32 |
33 | @dataclass(frozen=True)
34 | class TaskCacheInfo:
35 | task: MaterializingTask
36 | input_hash: str
37 | cache_fn_hash: str
38 | cache_key: str
39 | assert_no_materialization: bool
40 | force_task_execution: bool
41 |
42 | @cached_property
43 | def imperative_materialization_state(self):
44 | """State used by Table.materialize()"""
45 | return ImperativeMaterializationState()
46 |
47 |
48 | def task_cache_key(task: MaterializingTask, input_hash: str, cache_fn_hash: str):
49 | """Cache key used to judge cache validity of the current task output.
50 |
51 | Also referred to as `task_hash`.
52 |
53 | For lazy objects, this hash isn't used to judge cache validity; instead, it
54 | serves as an identifier to reference a specific task run. This can be the case
55 | if a task is determined to be cache-valid and the lazy query string is also
56 | the same, but the task_hash is different from a previous run. Then we can
57 | compute this combined_cache_key from the task's cache metadata to determine
58 | which lazy object to use as cache.
59 |
60 | :param task: task for which the cache key is computed
61 | :param input_hash: hash used for checking whether task is cache invalid due
62 | to changing input.
63 | :param cache_fn_hash: same as input_hash but for external inputs which need
64 | manual cache invalidation function.
65 | :return: The hash / cache key (str).
66 | """
67 |
68 | return stable_hash(
69 | "TASK",
70 | task.name,
71 | task.version,
72 | input_hash,
73 | cache_fn_hash,
74 | )
75 |
76 |
77 | def lazy_table_cache_key(task_hash: str, query_hash: str):
78 | return stable_hash(
79 | "LAZY_TABLE",
80 | task_hash,
81 | query_hash,
82 | )
83 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/materialize/metadata.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import datetime
4 | from dataclasses import dataclass
5 |
6 |
7 | @dataclass
8 | class TaskMetadata:
9 | """Metadata associated with a task
10 |
11 | This metadata object contains all the necessary information that is
12 | needed for determining if a task has already been executed with the
13 | same inputs, and all the information that is needed to reconstruct
14 | the output.
15 | """
16 |
17 | name: str
18 | stage: str
19 | version: str | None
20 | timestamp: datetime.datetime
21 | run_id: str
22 | position_hash: str
23 | input_hash: str
24 | cache_fn_hash: str
25 | output_json: str
26 |
27 |
28 | @dataclass
29 | class LazyTableMetadata:
30 | """Metadata associated with a 'lazy table'
31 |
32 | This class is only provided for convenience for those table store
33 | backends that implement the `lazy` option for the `store_table` method.
34 |
35 | The `query_hash` is a hash of the query string that produced this table.
36 | The `task_hash` is the combined hash of the task that produced this table.
37 |
38 | The `name` and `stage` values are used to retrieve the appropriate
39 | table from the cache.
40 |
41 | Attention: `task_hash` is sometimes taken from cache and thus is not guaranteed
42 | to refer to the `task_hash` that corresponds to the currently executed task.
43 | Instead, it refers to the task that originally produced this object.
44 | """
45 |
46 | name: str
47 | stage: str
48 | query_hash: str
49 | task_hash: str
50 |
51 |
52 | @dataclass
53 | class RawSqlMetadata:
54 | """Metadata associated with raw sql statements
55 |
56 | The `query_hash` is a hash of the raw sql string.
57 | The `task_hash` is the combined hash of the task that produced the statement.
58 |
59 | The `prev_objects` and `stage` values are used to retrieve the appropriate
60 | tables from the cache.
61 |
62 | Attention: `task_hash` is sometimes taken from cache and thus is not guaranteed
63 | to refer to the `task_hash` that corresponds to the currently executed task.
64 | Instead, it refers to the task that originally produced this object.
65 | """
66 |
67 | prev_objects: list[str]
68 | new_objects: list[str]
69 | stage: str
70 | query_hash: str
71 | task_hash: str
72 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/util/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .deep_map import deep_map
4 | from .deep_merge import deep_merge
5 | from .disposable import Disposable
6 | from .import_ import requires
7 | from .naming import normalize_name, safe_name
8 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/util/deep_map.py:
--------------------------------------------------------------------------------
1 | """Generic deep map or mutation operations.
2 |
3 | Heavily inspired by the builtin copy module of python:
4 | https://github.com/python/cpython/blob/main/Lib/copy.py
5 | """
6 | from __future__ import annotations
7 |
8 | from typing import Callable
9 |
10 | _nil = []
11 |
12 |
13 | def deep_map(x, fn: Callable, memo=None):
14 | if memo is None:
15 | memo = {}
16 |
17 | d = id(x)
18 | y = memo.get(d, _nil)
19 | if y is not _nil:
20 | return y
21 |
22 | cls = type(x)
23 |
24 | if cls == list:
25 | y = _deep_map_list(x, fn, memo)
26 | elif cls == tuple:
27 | y = _deep_map_tuple(x, fn, memo)
28 | elif cls == dict:
29 | y = _deep_map_dict(x, fn, memo)
30 | else:
31 | y = fn(x)
32 |
33 | # If y is x (i.e. x is its own copy), don't memoize.
34 | if y is not x:
35 | memo[d] = y
36 | _keep_alive(x, memo) # Make sure x lives at least as long as d
37 |
38 | return y
39 |
40 |
41 | def _deep_map_list(x, fn, memo):
42 | y = []
43 | append = y.append
44 | for a in x:
45 | append(deep_map(a, fn, memo))
46 | return fn(y)
47 |
48 |
49 | def _deep_map_tuple(x, fn, memo):
50 | y = [deep_map(a, fn, memo) for a in x]
51 | # We're not going to put the tuple in the memo, but it's still important we
52 | # check for it, in case the tuple contains recursive mutable structures.
53 | try:
54 | return memo[id(x)]
55 | except KeyError:
56 | pass
57 | for k, j in zip(x, y):
58 | if k is not j:
59 | y = tuple(y)
60 | break
61 | else:
62 | y = x
63 | return fn(y)
64 |
65 |
66 | def _deep_map_dict(x, fn, memo):
67 | y = {}
68 | memo[id(x)] = y
69 | for key, value in x.items():
70 | y[deep_map(key, fn, memo)] = deep_map(value, fn, memo)
71 | return fn(y)
72 |
73 |
74 | def _keep_alive(x, memo):
75 | """Keeps a reference to the object x in the memo.
76 | Because we remember objects by their id, we have
77 | to assure that possibly temporary objects are kept
78 | alive by referencing them.
79 | We store a reference at the id of the memo, which should
80 | normally not be used unless someone tries to deepcopy
81 | the memo itself...
82 | """
83 | try:
84 | memo[id(memo)].append(x)
85 | except KeyError:
86 | # aha, this is the first one :-)
87 | memo[id(memo)] = [x]
88 |
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/util/deep_merge.py:
--------------------------------------------------------------------------------
1 | """Generic deep update function for nested dictionaries.
2 |
3 | Seems to be solved already in various ways (do we want an extra dependency for pydantic.deep_update?)
4 | https://stackoverflow.com/questions/3232943/update-value-of-a-nested-dictionary-of-varying-depth
5 | But for snippets, license restrictions exist:
6 | https://www.ictrecht.nl/en/blog/what-is-the-license-status-of-stackoverflow-code-snippets
7 | """ # noqa: E501
8 | from __future__ import annotations
9 |
10 | from collections.abc import Iterable, Mapping
11 |
12 | from box import Box
13 |
14 |
15 | def deep_merge(x, y, check_enum=False):
16 | if type(x) != type(y) and not (isinstance(x, Mapping) and isinstance(y, Mapping)):
17 | raise TypeError(
18 | f"deep_merge failed due to type mismatch '{x}' (type: {type(x)}) vs. '{y}'"
19 | f" (type: {type(y)})"
20 | )
21 |
22 | if isinstance(x, Box):
23 | z = Box(_deep_merge_dict(x, y), frozen_box=True)
24 | elif isinstance(x, Mapping):
25 | z = _deep_merge_dict(x, y)
26 | elif isinstance(x, Iterable) and not isinstance(x, str):
27 | z = _deep_merge_iterable(x, y)
28 | else:
29 | z = y # update
30 |
31 | return z
32 |
33 |
34 | def _deep_merge_iterable(x: Iterable, y: Iterable):
35 | # Merging lists is not trivial.
36 | # There are a few different strategies: replace, unique, append, intersection, ...
37 | return y
38 | # return [*x, *y]
39 | # return [deep_merge(a, b) for a, b in zip(x, y)]
40 |
41 |
42 | def _deep_merge_dict(x: Mapping, y: Mapping):
43 | z = dict(x)
44 | for key in x:
45 | if key in y:
46 | if y[key] is None:
47 | # this is a special case but we have no other way in yaml to express
48 | # the deletion of fields from a dictionary in an override config
49 | del z[key]
50 | else:
51 | z[key] = deep_merge(x[key], y[key])
52 | z.update({key: value for key, value in y.items() if key not in x})
53 | return z
54 |
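55 | # Illustrative behaviour (derived from the rules above):
56 | #   deep_merge({"a": 1, "b": {"c": 2}}, {"b": {"c": 3}}) -> {"a": 1, "b": {"c": 3}}
57 | #   deep_merge({"a": 1, "b": 2}, {"b": None})            -> {"a": 1}  (None deletes)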
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/util/disposable.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pydiverse.pipedag.errors import DisposedError
4 |
5 |
6 | class Disposable:
7 | def __getattribute__(self, name):
8 | try:
9 | object.__getattribute__(self, "_Disposable__disposed")
10 | obj_type = object.__getattribute__(self, "__class__")
11 | raise DisposedError(f"Object of type {obj_type} has already been disposed.")
12 | except AttributeError:
13 | pass
14 |
15 | return object.__getattribute__(self, name)
16 |
17 | def __setattr__(self, key, value):
18 | try:
19 | object.__getattribute__(self, "_Disposable__disposed")
20 | obj_type = object.__getattribute__(self, "__class__")
21 | raise DisposedError(f"Object of type {obj_type} has already been disposed.")
22 | except AttributeError:
23 | pass
24 |
25 | return object.__setattr__(self, key, value)
26 |
27 | def dispose(self):
28 | object.__setattr__(self, "_Disposable__disposed", True)
29 |
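30 | # Illustrative behaviour: once dispose() has been called, any attribute access
31 | # or assignment on the instance raises DisposedError.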
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/util/hashing.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import base64
4 | import hashlib
5 |
6 |
7 | def stable_hash(*args: str) -> str:
8 | """Compute a hash over a set of strings
9 |
10 | :param args: Some strings from which to compute the cache key
11 | :return: A sha256 base32 digest, trimmed to 20 char length
12 | """
13 |
14 | combined_hash = hashlib.sha256(b"PIPEDAG")
15 | for arg in args:
16 | arg_bytes = str(arg).encode("utf8")
17 | arg_bytes_len = len(arg_bytes).to_bytes(length=8, byteorder="big")
18 |
19 | combined_hash.update(arg_bytes_len)
20 | combined_hash.update(arg_bytes)
21 |
22 | # Only take first 20 characters of base32 digest (100 bits). This
23 | # provides 50 bits of collision resistance, which is more than enough.
24 | # To illustrate: If you were to generate 1k hashes per second,
25 | # you still would have to wait over 800k years until you encounter
26 | # a collision.
27 |
28 | # NOTE: Can't use base64 because it contains lower and upper case
29 | # letters; identifiers in pipedag are all lowercase
30 | hash_digest = combined_hash.digest()
31 | hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
32 | return hash_str[:20]
33 |
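34 | # Illustrative properties (derived from the implementation above):
35 | #   stable_hash("TASK", "x") == stable_hash("TASK", "x")  (deterministic)
36 | #   len(stable_hash("TASK", "x")) == 20                   (trimmed base32 digest)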
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/util/naming.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import itertools
4 |
5 |
6 | def normalize_name(name: str) -> str:
7 | """Normalizes an identifier
8 |
9 | All names in PipeDAG are case-insensitive and can't contain any
10 | slashes. This helper function does exactly this conversion.
11 | """
12 | if name is not None:
13 | return name.casefold().strip().replace("/", "_")
14 |
15 |
16 | def safe_name(name: str) -> str:
17 | """Converts an identifier to one that is lowercase, ascii only
18 |
19 | Some backends might only support a limited set of characters for
20 | identifiers. This generic function provides a mechanism for making
21 | a name safe (at least in most cases) by encoding non-ASCII characters
22 | using punycode.
23 |
24 | :param name: The identifier / name to make safe
25 | :return: The safe name
26 | """
27 | name = normalize_name(name)
28 | name = name.encode("punycode").decode("ascii")
29 | return name
30 |
31 |
32 | class NameDisambiguator:
33 | """State object for creating non-colliding names
34 |
35 | This object is used inside `TableHook.retrieve` to prevent SQLAlchemy name collision issues.
36 | """
37 |
38 | def __init__(self):
39 | self.used_names = set()
40 | self.counter = itertools.count()
41 |
42 | def get_name(self, name: str | None) -> str:
43 | new_name = name
44 | while new_name in self.used_names:
45 | new_name = f"alias_{next(self.counter)}"
46 |
47 | self.used_names.add(new_name)
48 | return new_name
49 |
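50 | # Illustrative examples (not part of the module):
51 | #   normalize_name("My/Stage ") -> "my_stage"
52 | #   safe_name("münchen")        -> "mnchen-3ya"  (punycode encoding of the umlaut)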
--------------------------------------------------------------------------------
/src/pydiverse/pipedag/util/structlog.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import sys
5 | import textwrap
6 | from io import StringIO
7 |
8 | import structlog
9 | from structlog.typing import EventDict, WrappedLogger
10 |
11 |
12 | class StructlogHandler(logging.Handler):
13 | """
14 | Stdlib logging handler that feeds all events back into structlog
15 |
16 | Can't be used with a structlog logger_factory that uses the logging library,
17 | otherwise logging would result in an infinite loop.
18 | """
19 |
20 | def __init__(self, *args, **kw):
21 | super().__init__(*args, **kw)
22 | self._log = structlog.get_logger()
23 |
24 | def emit(self, record):
25 | msg = self.format(record)
26 | self._log.log(record.levelno, msg, logger=record.name)
27 |
28 |
29 | class PipedagConsoleRenderer(structlog.dev.ConsoleRenderer):
30 | """
31 | Custom subclass of the structlog ConsoleRenderer that allows rendering
32 | specific values in the event dict on separate lines.
33 | """
34 |
35 | def __init__(self, *args, **kwargs):
36 | self._render_keys = kwargs.pop("render_keys", [])
37 | super().__init__(*args, **kwargs)
38 |
39 | def __call__(self, logger: WrappedLogger, name: str, event_dict: EventDict):
40 | render_objects = {}
41 | for key in self._render_keys:
42 | obj = event_dict.pop(key, None)
43 | if obj is not None:
44 | render_objects[key] = obj
45 |
46 | result = super().__call__(logger, name, event_dict)
47 | sio = StringIO()
48 | sio.write(result)
49 |
50 | for key, obj in render_objects.items():
51 | string_rep = str(obj)
52 | sio.write(
53 | "\n"
54 | + " ["
55 | + self._styles.kv_key
56 | + key
57 | + self._styles.reset
58 | + "]"
59 | + "\n"
60 | + textwrap.indent(string_rep, prefix=" " + self._styles.kv_value)
61 | + self._styles.reset
62 | )
63 |
64 | return sio.getvalue()
65 |
66 |
67 | def setup_logging(
68 | log_level=logging.INFO,
69 | log_stream=sys.stderr,
70 | timestamp_format="%Y-%m-%d %H:%M:%S.%f",
71 | ):
72 | """Configures structlog and logging with sane defaults."""
73 |
74 | # Redirect all logs submitted to logging to structlog
75 | logging.basicConfig(
76 | format="%(message)s",
77 | level=log_level,
78 | handlers=[StructlogHandler()],
79 | )
80 |
81 | # Configure structlog
82 | structlog.configure(
83 | processors=[
84 | structlog.contextvars.merge_contextvars,
85 | structlog.processors.StackInfoRenderer(),
86 | structlog.dev.set_exc_info,
87 | structlog.processors.add_log_level,
88 | structlog.processors.TimeStamper(timestamp_format),
89 | PipedagConsoleRenderer(
90 | render_keys=["query", "table_obj", "task", "table", "detail"]
91 | ),
92 | ],
93 | wrapper_class=structlog.make_filtering_bound_logger(log_level),
94 | logger_factory=structlog.PrintLoggerFactory(log_stream),
95 | cache_logger_on_first_use=True,
96 | )
97 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/__init__.py
--------------------------------------------------------------------------------
/tests/fixtures/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/fixtures/__init__.py
--------------------------------------------------------------------------------
/tests/fixtures/instances.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from itertools import chain
4 |
5 | import pytest
6 |
7 | from pydiverse.pipedag import PipedagConfig
8 |
9 | __all__ = [
10 | "DATABASE_INSTANCES",
11 | "ORCHESTRATION_INSTANCES",
12 | "ALL_INSTANCES",
13 | "with_instances",
14 | "skip_instances",
15 | ]
16 |
17 |
18 | # Pytest markers associated with specific instance name
19 | INSTANCE_MARKS = {
20 | # Database Instances
21 | "postgres": pytest.mark.postgres,
22 | "postgres_unlogged": pytest.mark.postgres,
23 | "mssql": pytest.mark.mssql,
24 | "mssql_pytsql": pytest.mark.mssql,
25 | "ibm_db2": pytest.mark.ibm_db2,
26 | "ibm_db2_avoid_schema": pytest.mark.ibm_db2,
27 | "ibm_db2_materialization_details": pytest.mark.ibm_db2,
28 | "duckdb": pytest.mark.duckdb,
29 | "snowflake": pytest.mark.snowflake,
30 | # Local Table Cache Instances
31 | "local_table_cache": pytest.mark.postgres,
32 | "local_table_cache_inout": pytest.mark.postgres,
33 | "local_table_cache_inout_numpy": pytest.mark.postgres,
34 | "local_table_store": pytest.mark.postgres,
35 | # Orchestration Instances
36 | "dask_engine": [pytest.mark.dask, pytest.mark.postgres],
37 | "prefect_engine": [pytest.mark.prefect, pytest.mark.postgres],
38 | }
39 |
40 | # Collection of instances that represent different database technologies
41 | DATABASE_INSTANCES = (
42 | "postgres",
43 | "mssql",
44 | "ibm_db2",
45 | "duckdb",
46 | )
47 |
48 | ORCHESTRATION_INSTANCES = (
49 | "dask_engine",
50 | "prefect_engine",
51 | )
52 |
53 | # Extended collection of instances
54 | ALL_INSTANCES = (
55 | "postgres",
56 | "postgres_unlogged",
57 | "mssql",
58 | "mssql_pytsql",
59 | "ibm_db2",
60 | "ibm_db2_avoid_schema",
61 | "ibm_db2_materialization_details",
62 | "duckdb",
63 | "snowflake",
64 | "local_table_cache",
65 | )
66 |
67 |
68 | def with_instances(*instances, **kwargs):
69 | """Decorator to run a test with a specific set of instances
70 |
71 | :param instances: Names of the instances to use.
72 | :param kwargs: keyword arguments passed to PipedagConfig.default.get()
73 | """
74 | return pytest.mark.instances(*flatten(instances), **kwargs)
75 |
76 |
77 | def skip_instances(*instances):
78 | """Decorator to skip running a test with a specific set of instances"""
79 | return pytest.mark.skip_instances(*flatten(instances))
80 |
81 |
82 | def flatten(it):
83 | """Flatten an iterable"""
84 | if isinstance(it, (list, tuple)):
85 | yield from chain(*map(flatten, it))
86 | else:
87 | yield it
88 |
89 |
90 | # FIXTURE IMPLEMENTATION
91 |
92 |
93 | @pytest.fixture(autouse=True, scope="function", name="run_with_instance")
94 | def fixture_run_with_instance(request):
95 | """Fixture that runs test with different config instances"""
96 | if hasattr(request, "param"):
97 | instance, kwargs = request.param
98 | config = PipedagConfig.default.get(instance=instance, **kwargs)
99 | with config:
100 | yield instance
101 | else:
102 | yield None
103 |
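
A minimal sketch of how these helpers are combined in this test suite (both patterns appear verbatim in files below, e.g. tests/test_compression.py): `with_instances` can parametrize a whole module via `pytestmark` or decorate a single test, and `skip_instances` removes individual instances again.

    from tests.fixtures.instances import DATABASE_INSTANCES, skip_instances, with_instances

    # run every test in this module against all database instances ...
    pytestmark = [with_instances(DATABASE_INSTANCES)]

    # ... but skip one backend for this particular test (illustrative choice)
    @skip_instances("duckdb")
    def test_example():
        ...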
--------------------------------------------------------------------------------
/tests/parallelize/README.md:
--------------------------------------------------------------------------------
1 | # Pipedag Parallelize
2 |
3 | This is a pytest plugin similar to `pytest-xdist` that allows executing tests in parallel.
4 | To prevent two tests that run on the same instance from corrupting each other's data,
5 | it allows grouping tests together using the `pytest_parallelize_group_items` hook.
6 | Tests that have been grouped together run sequentially on the same worker.
7 | Different groups run in parallel on different workers.
8 |
9 | To specify the number of workers, use the `--workers` argument.
10 |
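
A sketch of what an implementation of this hook in a `conftest.py` could look like; the grouping key is illustrative, and the convention that the hook returns a mapping of group name to items is an assumption, not something this README specifies:

    import pytest

    @pytest.hookimpl
    def pytest_parallelize_group_items(config, items):
        groups = {}
        for item in items:
            # hypothetical grouping key: the test file, so tests from one
            # file never run concurrently on different workers
            groups.setdefault(item.nodeid.split("::")[0], []).append(item)
        return groups

The worker count can be given on the command line (`pytest --workers=4` or `--workers=auto`) or, equivalently, via the `workers` ini option.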
--------------------------------------------------------------------------------
/tests/parallelize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/parallelize/__init__.py
--------------------------------------------------------------------------------
/tests/parallelize/hooks.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 |
6 | @pytest.hookspec()
7 | def pytest_parallelize_group_items(config, items):
8 |     """Group collected test items; each group runs sequentially on one worker."""
9 |
--------------------------------------------------------------------------------
/tests/parallelize/plugin.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from .sesson import Session
6 | from .util import parse_config
7 |
8 |
9 | def pytest_addoption(parser):
10 | workers_help = (
11 | "Set the max num of workers (aka processes) to start "
12 | "(int or 'auto' - one per core)"
13 | )
14 |
15 | group = parser.getgroup("parallelize")
16 | group.addoption("--workers", dest="workers", help=workers_help)
17 | parser.addini("workers", workers_help)
18 |
19 |
20 | @pytest.hookimpl(trylast=True)
21 | def pytest_configure(config):
22 | workers = parse_config(config, "workers")
23 | if config.option.collectonly or not workers:
24 | return
25 |
26 | config.pluginmanager.register(Session(config), "parallelize-session")
27 |
28 | try:
29 |         # Patch _jb_pytest_runner to support parallel execution of tests
30 | # when using the PyCharm IDE
31 | from _jb_runner_tools import set_parallel_mode
32 |
33 | set_parallel_mode()
34 | except ImportError:
35 | pass
36 |
37 |
38 | @pytest.hookimpl
39 | def pytest_addhooks(pluginmanager):
40 | from . import hooks
41 |
42 | pluginmanager.add_hookspecs(hooks)
43 |
--------------------------------------------------------------------------------
/tests/parallelize/util.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | def parse_config(config, name):
5 |     """Return option `name` from the parsed CLI options; fall back to the ini value if the attribute is absent."""
6 |     return getattr(config.option, name, config.getini(name))
--------------------------------------------------------------------------------
/tests/parallelize/worker.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC
4 | from multiprocessing import Queue
5 |
6 | import pytest
7 | from _pytest.config import Config
8 |
9 |
10 | def start_worker(
11 | worker_id: int, work_queue: Queue, msg_queue: Queue, args: list, option_dict: dict
12 | ):
13 | option_dict["plugins"].append("no:terminal")
14 | config = Config.fromdictargs(option_dict, args)
15 | config.args = args
16 |
17 | from typing import TextIO
18 |
19 | class DontPrint(TextIO, ABC):
20 | def write(*_):
21 | pass
22 |
23 |     # TODO: find a way to fix pytest's assertion introspection for asserts raised in threads
24 |     # The following code was meant to do this, but it prevents tests from running at all.
25 | # # register dummy terminal reporter since it is needed by pytest even with
26 | # # plugins:"no:terminal" option
27 | # terminal_reporter = TerminalReporter(config, DontPrint())
28 | # config.pluginmanager.register(terminal_reporter, "terminalreporter")
29 |
30 | # Remove workers option to prevent triggering main plugin
31 | config.option.workers = None
32 |
33 | worker = Worker(config, worker_id, work_queue, msg_queue)
34 | config.pluginmanager.register(worker)
35 | config.hook.pytest_cmdline_main(config=config)
36 |
37 |
38 | class Worker:
39 | def __init__(self, config, worker_id: int, work_queue: Queue, msg_queue: Queue):
40 | super().__init__()
41 |
42 | self.config = config
43 | self.worker_id = worker_id
44 | self.work_queue = work_queue
45 | self.msg_queue = msg_queue
46 |
47 | self.session_items = {}
48 |
49 | def send(self, msg, **kwargs):
50 | kwargs["worker_id"] = self.worker_id
51 | self.msg_queue.put((msg, kwargs))
52 |
53 | @pytest.hookimpl
54 | def pytest_sessionstart(self, session):
55 | self.send("sessionstart")
56 |
57 | @pytest.hookimpl
58 | def pytest_sessionfinish(self, session):
59 | self.send("sessionfinish")
60 |
61 | @pytest.hookimpl
62 | def pytest_runtest_logstart(self, nodeid, location):
63 | self.send("logstart", nodeid=nodeid, location=location)
64 |
65 | @pytest.hookimpl
66 | def pytest_runtest_logfinish(self, nodeid, location):
67 | self.send("logfinish", nodeid=nodeid, location=location)
68 |
69 | @pytest.hookimpl
70 | def pytest_runtest_logreport(self, report):
71 | data = self.config.hook.pytest_report_to_serializable(
72 | config=self.config,
73 | report=report,
74 | )
75 | self.send("logreport", report=data)
76 |
77 | @pytest.hookimpl
78 | def pytest_runtestloop(self, session):
79 | self.session_items = {item.nodeid: item for item in session.items}
80 |
81 | should_terminate = False
82 | while not should_terminate:
83 | command, args = self.work_queue.get()
84 | should_terminate = self.process_one_item(session, command, args)
85 | return True
86 |
87 | def process_one_item(self, session, command, args):
88 | if command == "STOP":
89 | return True
90 |
91 | if command == "GROUP":
92 | group_name, node_ids = args
93 | items = [self.session_items[node_id] for node_id in node_ids]
94 |
95 | self.send("DEBUG_start_group", group_name=group_name)
96 | for i, item in enumerate(items):
97 | next_item = items[i + 1] if i + 1 < len(items) else None
98 | self.run_one_test(session, item, next_item)
99 |
100 | return False
101 |
102 | def run_one_test(self, session, item, next_item):
103 | self.send("DEBUG_start_test", nodeid=item.nodeid)
104 | item.ihook.pytest_runtest_protocol(item=item, nextitem=next_item)
105 | if session.shouldfail:
106 | raise session.Failed(session.shouldfail)
107 | if session.shouldstop:
108 | raise session.Interrupted(session.shouldstop)
109 |
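
For reference, the work-queue protocol consumed by the `pytest_runtestloop` implementation above can be read off `process_one_item`: each queue entry is a `(command, args)` tuple, where `"GROUP"` carries a group name plus the node ids to run, and `"STOP"` (whose args are ignored) terminates the loop. A hypothetical coordinator-side sketch, assuming `work_queue` is the same `multiprocessing.Queue` passed to `start_worker`:

    from multiprocessing import Queue

    work_queue: Queue = Queue()
    # enqueue one group of tests, then tell the worker to shut down
    work_queue.put(("GROUP", ("group_0", ["tests/test_a.py::test_1", "tests/test_a.py::test_2"])))
    work_queue.put(("STOP", None))  # args are ignored for STOP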
--------------------------------------------------------------------------------
/tests/test_cache/test_auto_version.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import pytest
5 |
6 | from pydiverse.pipedag import AUTO_VERSION, Blob, Flow, Stage, Table
7 | from pydiverse.pipedag.container import RawSql
8 | from pydiverse.pipedag.materialize.core import materialize
9 | from tests.fixtures.instances import with_instances
10 | from tests.util import swallowing_raises
11 |
12 | pytestmark = [with_instances("postgres"), with_instances("local_table_store")]
13 |
14 |
15 | # Specific backends have tests in the test_table_hooks folder
16 |
17 |
18 | def test_lazy_incompatible_with_auto_version():
19 | with pytest.raises(ValueError):
20 |
21 | @materialize(input_type=pd.DataFrame, version=AUTO_VERSION, lazy=True)
22 | def task():
23 | ...
24 |
25 |
26 | def test_missing_input_type_auto_version():
27 | with pytest.raises(ValueError):
28 |
29 | @materialize(version=AUTO_VERSION)
30 | def task():
31 | ...
32 |
33 |
34 | @with_instances("postgres")
35 | def test_auto_version_illegal_return_types():
36 | @materialize(input_type=pd.DataFrame, version=AUTO_VERSION)
37 | def blob():
38 | return Blob(1), Table(pd.DataFrame())
39 |
40 | @materialize(input_type=pd.DataFrame, version=AUTO_VERSION)
41 | def raw_sql():
42 | return RawSql("..."), Table(pd.DataFrame())
43 |
44 | with Flow() as f:
45 | with Stage("auto_version"):
46 | _blob = blob()
47 | _raw_sql = raw_sql()
48 |
49 | with swallowing_raises(ValueError, match="Blob"):
50 | f.run(_blob)
51 |
52 | with swallowing_raises(ValueError, match="RawSql"):
53 | f.run(_raw_sql)
54 |
55 |
56 | def test_auto_version_not_supported():
57 | import sqlalchemy as sa
58 |
59 | @materialize(input_type=sa.Table, version=AUTO_VERSION)
60 | def not_supported():
61 | return Table(pd.DataFrame({"x": [1, 2, 3, 4]}))
62 |
63 | with Flow() as f:
64 | with Stage("auto_version"):
65 | _ = not_supported()
66 |
67 | with swallowing_raises(TypeError, match="Auto versioning not supported"):
68 | f.run()
69 |
70 |
71 | # TODO: Currently we only test that auto versioning actually works,
72 | #       and that the task gets called the expected number of times
73 | #       in the polars hook tests.
74 | #       Once we have support for auto versioning with pandas, we
75 | #       might also want to put some tests into this file.
76 |
--------------------------------------------------------------------------------
/tests/test_cache/test_local_table_cache.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import sqlalchemy as sa
5 |
6 | from pydiverse.pipedag import *
7 | from tests.fixtures.instances import with_instances
8 |
9 |
10 | @with_instances(
11 | "local_table_cache",
12 | "local_table_cache_inout",
13 | "local_table_cache_inout_numpy",
14 | "local_table_store",
15 | )
16 | def test_local_table_cache(mocker):
17 | input_val_ = 0
18 |
19 | @materialize()
20 | def input_val():
21 | return input_val_
22 |
23 | @materialize(version="1.0")
24 | def select_pandas(x):
25 | # Supported by local caching
26 | return Table(pd.DataFrame({"x": [x]}), "pandas")
27 |
28 | @materialize(lazy=True)
29 | def select_sql(x):
30 | # Not supported by local caching
31 | return Table(sa.select(sa.literal(x).label("x")), "sql")
32 |
33 | @materialize(version="1.0", input_type=pd.DataFrame)
34 | def sink(*args):
35 | for arg in args:
36 | assert arg["x"][0] == input_val_
37 |
38 | with Flow() as f:
39 | with Stage("stage"):
40 | x = input_val()
41 |
42 | s_pandas = select_pandas(x)
43 | s_sql = select_sql(x)
44 |
45 | _ = sink(s_pandas, s_sql)
46 |
47 | # Initial run to invalidate cache
48 | input_val_ = -1
49 | f.run()
50 | input_val_ = 0
51 |
52 | # Spy Setup
53 | config_context = ConfigContext.get()
54 | local_table_cache = config_context.store.local_table_cache
55 |
56 | si = int(local_table_cache.should_store_input)
57 | so = int(local_table_cache.should_store_output)
58 | siac = int(local_table_cache.should_use_stored_input_as_cache)
59 |
60 | store_table_spy = mocker.spy(local_table_cache, "store_table")
61 | store_input_spy = mocker.spy(local_table_cache, "store_input")
62 | _store_table_spy = mocker.spy(local_table_cache, "_store_table")
63 |
64 | retrieve_table_obj_spy = mocker.spy(local_table_cache, "retrieve_table_obj")
65 | _retrieve_table_obj_spy = mocker.spy(local_table_cache, "_retrieve_table_obj")
66 |
67 |     # First run with spies attached
68 | f.run()
69 |
70 | expected_retrieve_table_obj = 2
71 | expected_successful_retrieve_table_obj = 1 * so * siac # pandas
72 | expected_store_table = 2
73 | expected_store_input = 2 - expected_successful_retrieve_table_obj
74 |
75 | assert store_table_spy.call_count == expected_store_table
76 | assert store_input_spy.call_count == expected_store_input
77 | assert retrieve_table_obj_spy.call_count == expected_retrieve_table_obj
78 |
79 | assert _store_table_spy.call_count == (expected_store_input * si) + (
80 | expected_store_table * so
81 | )
82 | assert _retrieve_table_obj_spy.call_count == expected_successful_retrieve_table_obj
83 |
84 | # Second Run
85 | store_table_spy.reset_mock()
86 | store_input_spy.reset_mock()
87 | _store_table_spy.reset_mock()
88 | retrieve_table_obj_spy.reset_mock()
89 | _retrieve_table_obj_spy.reset_mock()
90 |
91 | f.run()
92 |
93 | # Everything should be cache valid, thus no task should get executed.
94 | assert store_table_spy.call_count == 0
95 | assert store_input_spy.call_count == 0
96 | assert retrieve_table_obj_spy.call_count == 0
97 |
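
To make the expected-count arithmetic above concrete, a worked example under the assumption that an instance enables all three cache flags (whether a given instance actually does is configured in pipedag.yaml and not shown here):

    si, so, siac = 1, 1, 1                           # all flags enabled
    successful_retrieve = 1 * so * siac              # = 1; only the pandas table is retrievable
    store_input = 2 - successful_retrieve            # = 1; inputs not served from cache get stored
    private_store_table = store_input * si + 2 * so  # = 3 calls to _store_table in total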
--------------------------------------------------------------------------------
/tests/test_compression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 | import sqlalchemy as sa
5 |
6 | from pydiverse.pipedag import ConfigContext, Flow, Stage, Table, materialize
7 | from pydiverse.pipedag.backend.table.sql.dialects import (
8 | IBMDB2TableStore,
9 | MSSqlTableStore,
10 | )
11 |
12 | # Parameterize all tests in this file with several instance_id configurations
13 | from tests.fixtures.instances import (
14 | DATABASE_INSTANCES,
15 | skip_instances,
16 | with_instances,
17 | )
18 | from tests.util import tasks_library as m
19 |
20 | pytestmark = [with_instances(DATABASE_INSTANCES)]
21 |
22 |
23 | @pytest.mark.parametrize(
24 | "task, stage_materialization_details",
25 | [
26 | (m.simple_table_compressed_one_method, "adaptive_value_compression"),
27 | (m.simple_table_compressed_two_methods, "adaptive_value_compression"),
28 | (m.simple_dataframe_compressed_one_method, "adaptive_value_compression"),
29 | (m.simple_dataframe_compressed_two_methods, "adaptive_value_compression"),
30 | (m.simple_table_default_compressed, "adaptive_value_compression"),
31 | (m.simple_dataframe_uncompressed, None),
32 | ],
33 | )
34 | @with_instances(DATABASE_INSTANCES, "ibm_db2_materialization_details")
35 | @skip_instances("ibm_db2")
36 | def test_compression(task, stage_materialization_details):
37 | @materialize(input_type=sa.Table, lazy=False)
38 | def get_compression_attributes(table: sa.sql.expression.Alias):
39 | query = f"""
40 | SELECT COMPRESSION, ROWCOMPMODE FROM SYSCAT.TABLES
41 | WHERE TABSCHEMA = '{table.original.schema.upper()}'
42 | AND TABNAME = '{table.original.name.upper()}'
43 | """
44 | return Table(sa.text(query), f"compression_attributes_{table.name}")
45 |
46 | with Flow("flow") as f:
47 | with Stage("stage", materialization_details=stage_materialization_details):
48 | comp_exp_x, x = task()
49 | config = ConfigContext.get()
50 | store = config.store.table_store
51 | if isinstance(store, IBMDB2TableStore):
52 | comp_x = get_compression_attributes(x)
53 | m.assert_table_equal(comp_exp_x, comp_x)
54 |
55 | m.assert_table_equal(x, x)
56 |
57 | for _ in range(3):
58 | if (
59 | not isinstance(store, (MSSqlTableStore, IBMDB2TableStore))
60 | and task != m.simple_dataframe_uncompressed
61 | ):
62 | with pytest.raises(
63 | ValueError,
64 | match="To silence this exception set"
65 | " strict_materialization_details=False",
66 | ):
67 | assert f.run().successful
68 | else:
69 | assert f.run().successful
70 |
--------------------------------------------------------------------------------
/tests/test_dask.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import io
4 | import pickle
5 | from io import BytesIO
6 |
7 | import dask
8 | import structlog
9 | from _pytest.capture import EncodedFile
10 |
11 |
12 | class A(io.TextIOWrapper):
13 | def __getstate__(self):
14 | return "a"
15 |
16 | def __reduce__(self):
17 | return A, (BytesIO(b"hello"),)
18 |
19 | def __reduce_ex__(self, protocol):
20 | _ = protocol
21 | return self.__reduce__()
22 |
23 |
24 | def test_that_io_wrapper_is_pickleable():
25 | pickle.dumps(A(BytesIO(b"hello")))
26 |
27 |
28 | def test_that_encoded_file_is_picklable():
29 | pickle.dumps(EncodedFile(BytesIO(b"hello"), "utf-8"))
30 |
31 |
32 | def test_dask_structlog_configuration_does_not_prevent_pickling():
33 | def bind_run():
34 | structlog_config = structlog.get_config()
35 |
36 | def run(parent_futures, **kwargs):
37 | _ = parent_futures
38 |
39 | structlog.configure(**structlog_config)
40 |
41 | return 1
42 |
43 | run.__name__ = "hi"
44 | return dask.delayed(run, pure=False)
45 |
46 | results = [bind_run()(parent_futures=[])]
47 | kw = {
48 | "traverse": True,
49 | "optimize_graph": False,
50 | "scheduler": "processes",
51 | "num_workers": 8,
52 | "chunksize": 1,
53 | }
54 |
55 | dask.compute(results, **kw)
56 |
--------------------------------------------------------------------------------
/tests/test_flows/complex_config_flows/postgres_password.yaml:
--------------------------------------------------------------------------------
1 | username: sa
2 | password: Pydiverse23
3 | host: 127.0.0.1
4 | port: 6543
5 |
--------------------------------------------------------------------------------
/tests/test_flows/complex_config_flows/test_locking_instances.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from pydiverse.pipedag.context import StageLockContext
6 | from pydiverse.pipedag.core.config import PipedagConfig
7 | from tests.test_flows.complex_config_flows.test_instance_selection import (
8 | cfg_file_path,
9 | check_result,
10 | get_flow,
11 | )
12 |
13 | _ = cfg_file_path
14 |
15 |
16 | @pytest.mark.parametrize("instance", ["lock_zookeeper", "lock_file"])
17 | def test_lock_manager_instances(cfg_file_path, instance):
18 |     # At this point, an instance is chosen from a multi-pipedag-instance
19 |     # configuration file
20 | pipedag_config = PipedagConfig(cfg_file_path)
21 | cfg = pipedag_config.get(instance=instance)
22 |
23 | flow, out1, out2 = get_flow(cfg.attrs, pipedag_config)
24 |
25 | with StageLockContext():
26 | result = flow.run(config=cfg)
27 | check_result(result, out1, out2)
28 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql/create_db_helpers.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | /*
4 | Section: Procedures
5 | */
6 | IF OBJECT_ID(N'{{out_schema}}.CREATEALLDATES', N'P') IS NOT NULL DROP PROCEDURE {{out_schema}}.CREATEALLDATES;
7 | GO
8 | CREATE PROCEDURE {{out_schema}}.CREATEALLDATES
9 | (
10 | @StartDate AS DATE, @EndDate AS DATE
11 | ) AS
12 | DECLARE @Current AS DATE = DATEADD(DD, 0, @StartDate); DROP TABLE IF EXISTS ##alldates CREATE TABLE ##alldates (
13 | dt DATE PRIMARY KEY
14 | ) WHILE @Current <= @EndDate BEGIN
15 | INSERT INTO ##alldates
16 | VALUES (@Current);
17 | SET @Current = DATEADD(DD, 1, @Current) -- add 1 to current day
18 | END
19 | GO
20 |
21 |
22 | /*
23 | Section: Functions
24 | */
25 | IF OBJECT_ID(N'{{out_schema}}.get_db_sampling_factor', N'FN') IS NOT NULL DROP FUNCTION {{out_schema}}.get_db_sampling_factor;
26 | GO
27 | CREATE FUNCTION {{out_schema}}.get_db_sampling_factor () RETURNS INT AS
28 | BEGIN
29 | DECLARE @sampling_rate INT;
30 | SELECT @sampling_rate = ISNULL(TRY_CAST(RIGHT(DB_NAME(), LEN(DB_NAME()) - CHARINDEX('_m', DB_NAME()) - 1) AS INT),
31 | 1 -- fallback: take full sample
32 | );
33 | RETURN @sampling_rate
34 | END;
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql/prep/entity_checks.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 | DROP TABLE IF EXISTS {{out_schema}}.table01
3 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Create table')
4 | GO
5 | CREATE TABLE {{out_schema}}.table01 (
6 | entity VARCHAR(17) NOT NULL
7 | , reason VARCHAR(50) NOT NULL
8 | PRIMARY KEY (entity, reason)
9 | )
10 |
11 |
12 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Missing')
13 | GO
14 | INSERT INTO {{out_schema}}.table01 WITH (TABLOCKX)
15 | SELECT DISTINCT raw01.entity entity
16 | , 'Missing in raw01' reason
17 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
18 | LEFT JOIN (
19 | SELECT DISTINCT entity
20 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
21 | ) raw01x
22 | ON raw01.entity = raw01x.entity
23 | WHERE raw01.end_date = '9999-01-01'
24 | AND raw01x.entity IS NULL
25 |
26 |
27 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - more missing in raw01')
28 | GO
29 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX)
30 | SELECT
31 | raw01.entity entity
32 | , 'missing' reason
33 | FROM {{in_schema}}.raw01 raw01 WITH(NOLOCK)
34 | GROUP BY raw01.entity
35 | HAVING MAX(raw01.end_date) < '9999-01-01'
36 |
37 |
38 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Inconsistency correction')
39 | GO
40 | WITH entity_ids AS (
41 | SELECT DISTINCT raw01.entity entity
42 | FROM {{in_schema}}.raw01 raw01 WITH (NOLOCK)
43 | INNER JOIN ( -- filter
44 | SELECT entity
45 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
46 | WHERE end_date = '9999-01-01'
47 | ) raw01_final
48 | ON raw01.entity = raw01_final.entity
49 | WHERE 1=1
50 | )
51 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX)
52 | SELECT x.entity
53 | , 'Inconsistency correction' reason
54 | FROM entity_ids x
55 | INNER JOIN entity_ids y
56 | ON x.entity = y.entity
57 | WHERE x.entity <> y.entity
58 | GROUP BY x.entity
59 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql/prep/more_tables.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | /*
4 | SECTION: raw01A
5 | */
6 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01A')
7 | GO
8 | CREATE TABLE {{out_schema}}.raw01A(
9 | entity VARCHAR(17) NOT NULL
10 | , start_date DATE NOT NULL
11 | , end_date DATE NOT NULL
12 | , PRIMARY KEY(entity, start_date)
13 | )
14 | INSERT INTO {{out_schema}}.raw01A WITH(TABLOCKX)
15 | SELECT apgs.entity entity
16 | , apgs.start_date start_date
17 | , apgs.end_date end_date
18 | FROM (
19 | SELECT entity
20 | , start_date
21 | , end_date
22 | FROM {{in_schema}}.raw01 apgs WITH(NOLOCK)
23 | ) apgs
24 | INNER JOIN (
25 | SELECT DISTINCT entity
26 | FROM {{in_schema}}.raw01 WITH(NOLOCK)
27 | ) base
28 | ON apgs.entity = base.entity
29 | CREATE INDEX raw_start_date ON {{out_schema}}.raw01A (start_date DESC)
30 | CREATE INDEX raw_start_date_end_date ON {{out_schema}}.raw01A (end_date, start_date DESC)
31 | GO
32 | SELECT 'äöüßéç' as string_col INTO {{out_schema}}.special_chars
33 | GO
34 | CREATE TABLE {{out_schema}}.special_chars2 (
35 | id TINYINT NOT NULL PRIMARY KEY,
36 | string_col VARCHAR(60) NOT NULL
37 | )
38 | INSERT INTO {{out_schema}}.special_chars2 (id, string_col) VALUES
39 | (1, 'äöüßéç')
40 | GO
41 | -- check that both strings match and have length 7 with NOT NULL constraint
42 | CREATE TABLE {{out_schema}}.special_chars_join (
43 | string_col VARCHAR(60) NOT NULL,
44 | string_col2 VARCHAR(60) NOT NULL,
45 | string_col3 VARCHAR(60) NOT NULL
46 | )
47 | INSERT INTO {{out_schema}}.special_chars_join
48 | SELECT a.string_col, b.string_col, c.string_col
49 | FROM {{out_schema}}.special_chars a
50 | FULL OUTER JOIN {{out_schema}}.special_chars2 b ON a.string_col = b.string_col
51 | FULL OUTER JOIN {{out_schema}}.special_chars2 c ON a.string_col = c.string_col
52 | and len(a.string_col) = 6 and len(c.string_col) = 6
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql/raw/raw_views.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 | {{helper_schema}}.CREATEALLDATES '2022-01-01', '2023-01-01'
3 | SELECT * INTO {{out_schema}}.dummy_dates FROM ##alldates
4 | GO
5 | SELECT 1000000 as entity_nr, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO {{out_schema}}.schema00_raw01_table
6 | GO
7 | SELECT '1' as mod_type, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO {{out_schema}}.filter_table
8 | GO
9 |
10 | /*
11 | SECTION: SAMPLING
12 | */
13 | GO
14 | DECLARE @START BIGINT = 0 + (SELECT CAST(MIN(entity_nr) AS BIGINT) FROM {{out_schema}}.schema00_raw01_table);
15 | DECLARE @END BIGINT = (SELECT CAST(MAX(entity_nr) AS BIGINT) FROM {{out_schema}}.schema00_raw01_table);
16 | DECLARE @STEP INT = {{helper_schema}}.get_db_sampling_factor();
17 | DROP TABLE IF EXISTS {{out_schema}}.sample_entities;
18 | WITH L0 AS (SELECT c FROM (SELECT 1 UNION ALL SELECT 1) AS D(c)), -- 2^1
19 | L1 AS (SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B), -- 2^2
20 | L2 AS (SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B), -- 2^4
21 | L3 AS (SELECT 1 AS c FROM L2 AS A CROSS JOIN L2 AS B), -- 2^8
22 | L4 AS (SELECT 1 AS c FROM L3 AS A CROSS JOIN L3 AS B), -- 2^16
23 | L5 AS (SELECT 1 AS c FROM L4 AS A CROSS JOIN L4 AS B), -- 2^32
24 | Nums AS (SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS k FROM L5)
25 | SELECT k * @STEP + @START AS nr
26 | INTO {{out_schema}}.sample_entities
27 | FROM nums
28 | WHERE k <= (@END - @START) / @STEP
29 | CREATE UNIQUE CLUSTERED INDEX nr_index ON {{out_schema}}.sample_entities (nr) WITH ( FILLFACTOR = 100, DATA_COMPRESSION = ROW );
30 |
31 |
32 | /*
33 | SECTION: Raw-Tables
34 | */
35 | GO
36 | PRINT (CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01')
37 | DROP VIEW IF EXISTS {{out_schema}}.raw01
38 | GO
39 | CREATE VIEW {{out_schema}}.raw01
40 | AS
41 | SELECT entity_nr entity
42 | , start_date start_date
43 | , end_date end_date
44 | FROM {{out_schema}}.schema00_raw01_table WITH (NOLOCK)
45 | INNER JOIN sample_entities WITH (NOLOCK)
46 | ON entity_nr = sample_entities.nr
47 |
48 |
49 | /*
50 | SECTION: Reference tables
51 | */
52 |
53 | GO
54 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.fm_mod_type')
55 | DROP VIEW IF EXISTS {{out_schema}}.fm_mod_type
56 | GO
57 | CREATE VIEW {{out_schema}}.fm_mod_type
58 | AS
59 | SELECT mod_type x_inv_type
60 | , start_date start_date
61 | , end_date end_date
62 | FROM {{out_schema}}.filter_table WITH(NOLOCK)
63 | GO
64 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql/create_db_helpers.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 | USE {{out_database}}
3 | GO
4 |
5 |
6 | /*
7 | Section: Procedures
8 | */
9 | IF OBJECT_ID(N'dbo.CREATEALLDATES', N'P') IS NOT NULL DROP PROCEDURE dbo.CREATEALLDATES;
10 | GO
11 | CREATE PROCEDURE CREATEALLDATES
12 | (
13 | @StartDate AS DATE, @EndDate AS DATE
14 | ) AS
15 | DECLARE @Current AS DATE = DATEADD(DD, 0, @StartDate); DROP TABLE IF EXISTS ##alldates CREATE TABLE ##alldates (
16 | dt DATE PRIMARY KEY
17 | ) WHILE @Current <= @EndDate BEGIN
18 | INSERT INTO ##alldates
19 | VALUES (@Current);
20 | SET @Current = DATEADD(DD, 1, @Current) -- add 1 to current day
21 | END
22 | GO
23 |
24 |
25 | /*
26 | Section: Functions
27 | */
28 | IF OBJECT_ID(N'dbo.get_db_sampling_factor', N'FN') IS NOT NULL DROP FUNCTION get_db_sampling_factor;
29 | GO
30 | CREATE FUNCTION dbo.get_db_sampling_factor () RETURNS INT AS
31 | BEGIN
32 | DECLARE @sampling_rate INT;
33 | SELECT @sampling_rate = ISNULL(TRY_CAST(RIGHT(DB_NAME(), LEN(DB_NAME()) - CHARINDEX('_m', DB_NAME()) - 1) AS INT),
34 | 1 -- fallback: take full sample
35 | );
36 | RETURN @sampling_rate
37 | END;
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql/prep/entity_checks.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | USE master -- no default schema
4 | GO
5 |
6 | CREATEALLDATES('2022-01-01', '2023-01-01')
7 |
8 | SELECT * INTO {{out_schema}}.dummy_dates FROM ##alldates
9 |
10 | DROP TABLE IF EXISTS {{out_schema}}.table01
11 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Create table')
12 | GO
13 | CREATE TABLE {{out_schema}}.table01 (
14 | entity VARCHAR(17) NOT NULL
15 | , reason VARCHAR(50) NOT NULL
16 | PRIMARY KEY (entity, reason)
17 | )
18 |
19 |
20 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Missing')
21 | GO
22 | INSERT INTO {{out_schema}}.table01 WITH (TABLOCKX)
23 | SELECT DISTINCT raw01.entity entity
24 | , 'Missing in raw01' reason
25 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
26 | LEFT JOIN (
27 | SELECT DISTINCT entity
28 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
29 | ) raw01x
30 | ON raw01.entity = raw01x.entity
31 | WHERE raw01.end_date = '9999-01-01'
32 | AND raw01x.entity IS NULL
33 |
34 |
35 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - more missing in raw01')
36 | GO
37 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX)
38 | SELECT
39 | raw01.entity entity
40 | , 'missing' reason
41 | FROM {{in_schema}}.raw01 raw01 WITH(NOLOCK)
42 | GROUP BY raw01.entity
43 | HAVING MAX(raw01.end_date) < '9999-01-01'
44 |
45 |
46 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Inconsistency correction')
47 | GO
48 | WITH entity_ids AS (
49 | SELECT DISTINCT raw01.entity entity
50 | FROM {{in_schema}}.raw01 raw01 WITH (NOLOCK)
51 | INNER JOIN ( -- filter
52 | SELECT entity
53 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
54 | WHERE end_date = '9999-01-01'
55 | ) raw01_final
56 | ON raw01.entity = raw01_final.entity
57 | WHERE 1=1
58 | )
59 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX)
60 | SELECT x.entity
61 | , 'Inconsistency correction' reason
62 | FROM entity_ids x
63 | INNER JOIN entity_ids y
64 | ON x.entity = y.entity
65 | WHERE x.entity <> y.entity
66 | GROUP BY x.entity
67 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql/prep/more_tables.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | USE master
4 | GO
5 |
6 | /*
7 | SECTION: raw01A
8 | */
9 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01A')
10 | GO
11 | CREATE TABLE {{out_schema}}.raw01A(
12 | entity VARCHAR(17) NOT NULL
13 | , start_date DATE NOT NULL
14 | , end_date DATE NOT NULL
15 | , PRIMARY KEY(entity, start_date)
16 | )
17 | INSERT INTO {{out_schema}}.raw01A WITH(TABLOCKX)
18 | SELECT apgs.entity entity
19 | , apgs.start_date start_date
20 | , apgs.end_date end_date
21 | FROM (
22 | SELECT entity
23 | , start_date
24 | , end_date
25 | FROM {{in_schema}}.raw01 apgs WITH(NOLOCK)
26 | ) apgs
27 | INNER JOIN (
28 | SELECT DISTINCT entity
29 | FROM {{in_schema}}.raw01 WITH(NOLOCK)
30 | ) base
31 | ON apgs.entity = base.entity
32 | CREATE INDEX raw_start_date ON {{out_schema}}.raw01A (start_date DESC)
33 | CREATE INDEX raw_start_date_end_date ON {{out_schema}}.raw01A (end_date, start_date DESC)
34 |
35 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql/raw/raw_views.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | USE {{out_database}} -- needed for views
4 | GO
5 |
6 | SELECT 1000000 as entity_nr, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.schema00_raw01_table
7 | GO
8 | SELECT '1' as mod_type, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.filter_table
9 | GO
10 |
11 | /*
12 | SECTION: SAMPLING
13 | */
14 | GO
15 | DECLARE @START BIGINT = 0 + (SELECT CAST(MIN(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table);
16 | DECLARE @END BIGINT = (SELECT CAST(MAX(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table);
17 | DECLARE @STEP INT = {{helper_schema}}.get_db_sampling_factor();
18 | DROP TABLE IF EXISTS {{out_schema}}.sample_entities;
19 | WITH L0 AS (SELECT c FROM (SELECT 1 UNION ALL SELECT 1) AS D(c)), -- 2^1
20 | L1 AS (SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B), -- 2^2
21 | L2 AS (SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B), -- 2^4
22 | L3 AS (SELECT 1 AS c FROM L2 AS A CROSS JOIN L2 AS B), -- 2^8
23 | L4 AS (SELECT 1 AS c FROM L3 AS A CROSS JOIN L3 AS B), -- 2^16
24 | L5 AS (SELECT 1 AS c FROM L4 AS A CROSS JOIN L4 AS B), -- 2^32
25 | Nums AS (SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS k FROM L5)
26 | SELECT k * @STEP + @START AS nr
27 | INTO {{out_schema}}.sample_entities
28 | FROM nums
29 | WHERE k <= (@END - @START) / @STEP
30 | CREATE UNIQUE CLUSTERED INDEX nr_index ON {{out_schema}}.sample_entities (nr) WITH ( FILLFACTOR = 100, DATA_COMPRESSION = ROW );
31 |
32 |
33 | /*
34 | SECTION: Raw-Tables
35 | */
36 | GO
37 | PRINT (CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01')
38 | DROP VIEW IF EXISTS {{out_schema_only}}.raw01
39 | GO
40 | CREATE VIEW {{out_schema_only}}.raw01
41 | AS
42 | SELECT entity_nr entity
43 | , start_date start_date
44 | , end_date end_date
45 | FROM dbo.schema00_raw01_table WITH (NOLOCK)
46 | INNER JOIN sample_entities WITH (NOLOCK)
47 | ON entity_nr = sample_entities.nr
48 |
49 |
50 | /*
51 | SECTION: Reference tables
52 | */
53 |
54 | GO
55 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.fm_mod_type')
56 | DROP VIEW IF EXISTS {{out_schema_only}}.fm_mod_type
57 | GO
58 | CREATE VIEW {{out_schema_only}}.fm_mod_type
59 | AS
60 | SELECT mod_type x_inv_type
61 | , start_date start_date
62 | , end_date end_date
63 | FROM dbo.filter_table WITH(NOLOCK)
64 | GO
65 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/create_db_helpers.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 | USE {{out_database}}
3 | GO
4 |
5 |
6 | /*
7 | Section: Procedures
8 | */
9 | IF OBJECT_ID(N'dbo.CREATEALLDATES', N'P') IS NOT NULL DROP PROCEDURE dbo.CREATEALLDATES;
10 | GO
11 | CREATE PROCEDURE CREATEALLDATES
12 | (
13 | @StartDate AS DATE, @EndDate AS DATE
14 | ) AS
15 | DECLARE @Current AS DATE = DATEADD(DD, 0, @StartDate); DROP TABLE IF EXISTS ##alldates CREATE TABLE ##alldates (
16 | dt DATE PRIMARY KEY
17 | ) WHILE @Current <= @EndDate BEGIN
18 | INSERT INTO ##alldates
19 | VALUES (@Current);
20 | SET @Current = DATEADD(DD, 1, @Current) -- add 1 to current day
21 | END
22 | GO
23 |
24 |
25 | /*
26 | Section: Functions
27 | */
28 | -- IF OBJECT_ID(N'dbo.get_db_sampling_factor', N'FN') IS NOT NULL DROP FUNCTION get_db_sampling_factor;
29 | -- GO
30 | CREATE FUNCTION dbo.get_db_sampling_factor () RETURNS INT AS
31 | BEGIN
32 | DECLARE @sampling_rate INT;
33 | SELECT @sampling_rate = ISNULL(TRY_CAST(RIGHT(DB_NAME(), LEN(DB_NAME()) - CHARINDEX('_m', DB_NAME()) - 1) AS INT),
34 | 1 -- fallback: take full sample
35 | );
36 | RETURN @sampling_rate
37 | END;
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/prep/entity_checks.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | USE master -- no default schema
4 | GO
5 |
6 | CREATEALLDATES('2022-01-01', '2023-01-01')
7 |
8 | SELECT * INTO {{out_schema}}.dummy_dates FROM ##alldates
9 |
10 | DROP TABLE IF EXISTS {{out_schema}}.table01
11 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Create table')
12 | GO
13 | CREATE TABLE {{out_schema}}.table01 (
14 | entity VARCHAR(17) NOT NULL
15 | , reason VARCHAR(50) NOT NULL
16 | PRIMARY KEY (entity, reason)
17 | )
18 |
19 |
20 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Missing')
21 | GO
22 | INSERT INTO {{out_schema}}.table01 WITH (TABLOCKX)
23 | SELECT DISTINCT raw01.entity entity
24 | , 'Missing in raw01' reason
25 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
26 | LEFT JOIN (
27 | SELECT DISTINCT entity
28 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
29 | ) raw01x
30 | ON raw01.entity = raw01x.entity
31 | WHERE raw01.end_date = '9999-01-01'
32 | AND raw01x.entity IS NULL
33 |
34 |
35 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - more missing in raw01')
36 | GO
37 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX)
38 | SELECT
39 | raw01.entity entity
40 | , 'missing' reason
41 | FROM {{in_schema}}.raw01 raw01 WITH(NOLOCK)
42 | GROUP BY raw01.entity
43 | HAVING MAX(raw01.end_date) < '9999-01-01'
44 |
45 |
46 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.table01 - Inconsistency correction')
47 | GO
48 | WITH entity_ids AS (
49 | SELECT DISTINCT raw01.entity entity
50 | FROM {{in_schema}}.raw01 raw01 WITH (NOLOCK)
51 | INNER JOIN ( -- filter
52 | SELECT entity
53 | FROM {{in_schema}}.raw01 WITH (NOLOCK)
54 | WHERE end_date = '9999-01-01'
55 | ) raw01_final
56 | ON raw01.entity = raw01_final.entity
57 | WHERE 1=1
58 | )
59 | INSERT INTO {{out_schema}}.table01 WITH(TABLOCKX)
60 | SELECT x.entity
61 | , 'Inconsistency correction' reason
62 | FROM entity_ids x
63 | INNER JOIN entity_ids y
64 | ON x.entity = y.entity
65 | WHERE x.entity <> y.entity
66 | GROUP BY x.entity
67 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/prep/more_tables.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | USE master
4 | GO
5 |
6 | /*
7 | SECTION: raw01A
8 | */
9 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01A')
10 | GO
11 | CREATE TABLE {{out_schema}}.raw01A(
12 | entity VARCHAR(17) NOT NULL
13 | , start_date DATE NOT NULL
14 | , end_date DATE NOT NULL
15 | , PRIMARY KEY(entity, start_date)
16 | )
17 | INSERT INTO {{out_schema}}.raw01A WITH(TABLOCKX)
18 | SELECT apgs.entity entity
19 | , apgs.start_date start_date
20 | , apgs.end_date end_date
21 | FROM (
22 | SELECT entity
23 | , start_date
24 | , end_date
25 | FROM {{in_schema}}.raw01 apgs WITH(NOLOCK)
26 | ) apgs
27 | INNER JOIN (
28 | SELECT DISTINCT entity
29 | FROM {{in_schema}}.raw01 WITH(NOLOCK)
30 | ) base
31 | ON apgs.entity = base.entity
32 | CREATE INDEX raw_start_date ON {{out_schema}}.raw01A (start_date DESC)
33 | CREATE INDEX raw_start_date_end_date ON {{out_schema}}.raw01A (end_date, start_date DESC)
34 |
--------------------------------------------------------------------------------
/tests/test_flows/raw_sql_scripts/mssql_pytsql_isolate/raw/raw_views.sql:
--------------------------------------------------------------------------------
1 | -- This is intentionally crazy TSQL code similar to code "found in the wild"
2 |
3 | USE {{out_database}} -- needed for views
4 | GO
5 |
6 | SELECT 1000000 as entity_nr, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.schema00_raw01_table
7 | GO
8 | SELECT '1' as mod_type, cast('1000-01-01' as DATE) as start_date, cast('9999-01-01' as DATE) as end_date INTO dbo.filter_table
9 | GO
10 |
11 | /*
12 | SECTION: SAMPLING
13 | */
14 | GO
15 | DECLARE @START BIGINT = 0 + (SELECT CAST(MIN(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table);
16 | DECLARE @END BIGINT = (SELECT CAST(MAX(entity_nr) AS BIGINT) FROM dbo.schema00_raw01_table);
17 | DECLARE @STEP INT = {{helper_schema}}.get_db_sampling_factor();
18 | DROP TABLE IF EXISTS {{out_schema}}.sample_entities;
19 | WITH L0 AS (SELECT c FROM (SELECT 1 UNION ALL SELECT 1) AS D(c)), -- 2^1
20 | L1 AS (SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B), -- 2^2
21 | L2 AS (SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B), -- 2^4
22 | L3 AS (SELECT 1 AS c FROM L2 AS A CROSS JOIN L2 AS B), -- 2^8
23 | L4 AS (SELECT 1 AS c FROM L3 AS A CROSS JOIN L3 AS B), -- 2^16
24 | L5 AS (SELECT 1 AS c FROM L4 AS A CROSS JOIN L4 AS B), -- 2^32
25 | Nums AS (SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS k FROM L5)
26 | SELECT k * @STEP + @START AS nr
27 | INTO {{out_schema}}.sample_entities
28 | FROM nums
29 | WHERE k <= (@END - @START) / @STEP
30 | CREATE UNIQUE CLUSTERED INDEX nr_index ON {{out_schema}}.sample_entities (nr) WITH ( FILLFACTOR = 100, DATA_COMPRESSION = ROW );
31 |
32 |
33 | /*
34 | SECTION: Raw-Tables
35 | */
36 | GO
37 | PRINT (CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.raw01')
38 | DROP VIEW IF EXISTS {{out_schema_only}}.raw01
39 | GO
40 | CREATE VIEW {{out_schema_only}}.raw01
41 | AS
42 | SELECT entity_nr entity
43 | , start_date start_date
44 | , end_date end_date
45 | FROM dbo.schema00_raw01_table WITH (NOLOCK)
46 | INNER JOIN sample_entities WITH (NOLOCK)
47 | ON entity_nr = sample_entities.nr
48 |
49 |
50 | /*
51 | SECTION: Reference tables
52 | */
53 |
54 | GO
55 | PRINT(CAST(GETDATE() AS VARCHAR) + ': {{out_schema}}.fm_mod_type')
56 | DROP VIEW IF EXISTS {{out_schema_only}}.fm_mod_type
57 | GO
58 | CREATE VIEW {{out_schema_only}}.fm_mod_type
59 | AS
60 | SELECT mod_type x_inv_type
61 | , start_date start_date
62 | , end_date end_date
63 | FROM dbo.filter_table WITH(NOLOCK)
64 | GO
65 |
--------------------------------------------------------------------------------
/tests/test_flows/sql_scripts/script1-db2.sql:
--------------------------------------------------------------------------------
1 | SELECT 12 AS coltab1 FROM SYSIBM.SYSDUMMY1
2 |
--------------------------------------------------------------------------------
/tests/test_flows/sql_scripts/script1.sql:
--------------------------------------------------------------------------------
1 | SELECT 12 AS coltab1
2 |
--------------------------------------------------------------------------------
/tests/test_flows/sql_scripts/script2.sql:
--------------------------------------------------------------------------------
1 | SELECT COLTAB1 + 12 AS coltab2
2 | FROM {{dependent}}
3 |
--------------------------------------------------------------------------------
/tests/test_flows/test_example.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from example.run_pipeline import main as example_flow_main
6 | from example.simple_pipeline import main as simple_pipeline_main
7 | from example.visualization import main as visualization_main
8 | from example_imperative.run_pipeline import main as example_imperative_flow_main
9 | from example_interactive.run_tasks_interactively import main as example_interactive_main
10 | from example_postgres.run_pipeline import main as example_postgres_flow_main
11 |
12 |
13 | @pytest.mark.parametrize(
14 | "fn",
15 | [
16 | example_flow_main,
17 | simple_pipeline_main,
18 | visualization_main,
19 | example_imperative_flow_main,
20 | example_postgres_flow_main,
21 | example_interactive_main,
22 | ],
23 | )
24 | def test_examples(fn):
25 | """
26 |     This test just runs the example pipelines that we provide in the example* directories
27 | """
28 |
29 | fn()
30 |
--------------------------------------------------------------------------------
/tests/test_flows/test_flow.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import sqlalchemy as sa
5 | from pandas.testing import assert_frame_equal
6 |
7 | from pydiverse.pipedag import Blob, Flow, Stage, Table, materialize
8 | from pydiverse.pipedag.context import StageLockContext
9 |
10 | dfA = pd.DataFrame(
11 | {
12 | "a": [0, 1, 2, 4],
13 | "b": [9, 8, 7, 6],
14 | }
15 | )
16 |
17 | dfB = pd.DataFrame(
18 | {
19 | "a": [2, 1, 0, 1],
20 | "x": [1, 1, 2, 2],
21 | }
22 | )
23 |
24 |
25 | @materialize(nout=2, version="1")
26 | def inputs():
27 | import time
28 |
29 | time.sleep(1)
30 | return Table(dfA, "dfA"), Table(dfB, "dfB_%%")
31 |
32 |
33 | @materialize(input_type=pd.DataFrame)
34 | def double_values(df: pd.DataFrame):
35 | return Table(df.transform(lambda x: x * 2))
36 |
37 |
38 | @materialize(input_type=sa.Table, lazy=True)
39 | def join_on_a(left: sa.sql.expression.Alias, right: sa.sql.expression.Alias):
40 | return Table(left.select().join(right, left.c.a == right.c.a))
41 |
42 |
43 | @materialize(input_type=pd.DataFrame)
44 | def list_arg(x: list[pd.DataFrame]):
45 | assert isinstance(x[0], pd.DataFrame)
46 | return Blob(x)
47 |
48 |
49 | @materialize
50 | def blob_task(x, y):
51 | return Blob(x), Blob(y)
52 |
53 |
54 | def test_simple_flow(with_blob=True):
55 | with Flow() as flow:
56 | with Stage("simple_flow_stage1"):
57 | inp = inputs()
58 | a, b = inp
59 |
60 | a2 = double_values(a)
61 | b2 = double_values(b)
62 | b4 = double_values(b2)
63 | b4 = double_values(b4)
64 | x = list_arg([a2, b, b4])
65 |
66 | with Stage("simple_flow_stage2"):
67 | joined = join_on_a(a2, b4)
68 | joined_times_2 = double_values(joined)
69 |
70 | if with_blob:
71 | v = blob_task(x, x)
72 | v = blob_task(v, v)
73 | v = blob_task(v, v)
74 |
75 | blob_tuple = blob_task(1, 2)
76 |
77 | with StageLockContext():
78 | result = flow.run() # this will use the default configuration instance=__any__
79 | assert result.successful
80 |
81 | # Check result.get works
82 | res_a = result.get(a, as_type=pd.DataFrame)
83 | res_b = result.get(b, as_type=pd.DataFrame)
84 | res_inp = result.get(inp, as_type=pd.DataFrame)
85 | res_joined = result.get(joined, as_type=pd.DataFrame)
86 | res_joined_times_2 = result.get(joined_times_2, as_type=pd.DataFrame)
87 |
88 | assert_frame_equal(res_a, dfA, check_dtype=False)
89 | assert_frame_equal(res_b, dfB, check_dtype=False)
90 | assert_frame_equal(res_inp[0], dfA, check_dtype=False)
91 | assert_frame_equal(res_inp[1], dfB, check_dtype=False)
92 | assert_frame_equal(res_joined * 2, res_joined_times_2)
93 |
94 | result.get(x)
95 | if with_blob:
96 | result.get(v)
97 | assert tuple(result.get(blob_tuple)) == (1, 2)
98 |
99 |
100 | if __name__ == "__main__":
101 | test_simple_flow()
102 |
--------------------------------------------------------------------------------
/tests/test_flows/test_simple_flow.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import sqlalchemy as sa
5 |
6 | from pydiverse.pipedag import Flow, Stage, Table, materialize
7 | from tests.fixtures.instances import (
8 | DATABASE_INSTANCES,
9 | ORCHESTRATION_INSTANCES,
10 | with_instances,
11 | )
12 |
13 |
14 | @materialize(nout=2, version="1.1")
15 | def inputs():
16 | df_a = pd.DataFrame(
17 | {
18 | "a": [0, 1, 2, 4],
19 | "b": [9, 8, 7, 6],
20 | }
21 | )
22 |
23 | df_b = pd.DataFrame(
24 | {
25 | "a": [2, 1, 0, 1],
26 | "x": [1, 1, 2, 2],
27 | }
28 | )
29 | return Table(df_a, "dfA", primary_key=["a"]), Table(df_b, "dfB")
30 |
31 |
32 | @materialize(input_type=pd.DataFrame, version="1.0")
33 | def double_values(df: pd.DataFrame):
34 | df["a"] = df["a"] * 2
35 | return Table(df)
36 |
37 |
38 | @materialize(input_type=sa.Table, lazy=True)
39 | def join_on_a(left: sa.sql.expression.Alias, right: sa.sql.expression.Alias):
40 | return Table(left.select().join(right, left.c.a == right.c.a))
41 |
42 |
43 | # noinspection PyTypeChecker
44 | def get_flow():
45 | with Flow() as flow:
46 | with Stage("simple_flow_stage1"):
47 | a, b = inputs()
48 | a2 = double_values(a)
49 |
50 | with Stage("simple_flow_stage2"):
51 | b2 = double_values(b)
52 | joined = join_on_a(a2, b2)
53 | _ = joined
54 | return flow
55 |
56 |
57 | @with_instances(DATABASE_INSTANCES, ORCHESTRATION_INSTANCES)
58 | def test_simple_flow():
59 | flow = get_flow()
60 | result = flow.run()
61 | assert result.successful
62 |
63 |
64 | if __name__ == "__main__":
65 | test_simple_flow()
66 |
--------------------------------------------------------------------------------
/tests/test_flows/test_source_invalidation.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 |
5 | from pydiverse.pipedag import Flow, Stage, Table, materialize
6 | from pydiverse.pipedag.context import StageLockContext
7 | from pydiverse.pipedag.context.context import CacheValidationMode
8 |
9 | dfA_source = pd.DataFrame(
10 | {
11 | "a": [0, 1, 2, 4],
12 | "b": [9, 8, 7, 6],
13 | }
14 | )
15 | dfA = dfA_source.copy()
16 | input_hash = hash(str(dfA))
17 |
18 |
19 | def has_new_input(dummy_arg):
20 | """Returns whether new input is available via input hash.
21 |
22 | :param dummy_arg: Argument used to test that custom cache invalidation function
23 | gets same arguments as task function
24 | :return: hash value of input (stored hash must not exactly be input hash)
25 | """
26 | assert dummy_arg == "irrelevant"
27 | global input_hash
28 | return input_hash
29 |
30 |
31 | # noinspection DuplicatedCode
32 | @materialize(nout=2, cache=has_new_input, version="1.0")
33 | def input_task(dummy_arg):
34 | global dfA
35 | return Table(dfA, "dfA"), Table(dfA, "dfB")
36 |
37 |
38 | @materialize(input_type=pd.DataFrame, version="1.0")
39 | def double_values(df: pd.DataFrame):
40 | return Table(df.transform(lambda x: x * 2))
41 |
42 |
43 | # noinspection PyTypeChecker
44 | def get_flow():
45 | with Flow("FLOW") as flow:
46 | with Stage("stage_1"):
47 | dummy_arg = "irrelevant"
48 | a, b = input_task(dummy_arg)
49 | a2 = double_values(a)
50 |
51 | with Stage("stage_2"):
52 | b2 = double_values(b)
53 | a3 = double_values(a2)
54 |
55 | return flow, b2, a3
56 |
57 |
58 | def test_source_invalidation():
59 | # trigger reload of input data
60 | global dfA
61 | global input_hash
62 |
63 | flow, out1, out2 = get_flow()
64 |
65 | with StageLockContext():
66 | result = flow.run()
67 | assert result.successful
68 |
69 | v_out1, v_out2 = result.get(out1), result.get(out2)
70 | pd.testing.assert_frame_equal(dfA_source * 2, v_out1, check_dtype=False)
71 | pd.testing.assert_frame_equal(dfA_source * 4, v_out2, check_dtype=False)
72 |
73 | # modify input without updating input hash => cached version is used
74 | dfA["a"] = 10 + dfA_source["a"]
75 |
76 | # this run should work from caches and not change outputs
77 | with StageLockContext():
78 | result = flow.run()
79 | assert result.successful
80 |
81 | v_out1, v_out2 = result.get(out1), result.get(out2)
82 | pd.testing.assert_frame_equal(dfA_source * 2, v_out1, check_dtype=False)
83 | pd.testing.assert_frame_equal(dfA_source * 4, v_out2, check_dtype=False)
84 |
85 |     # update input hash to trigger reload of new input data
86 | input_hash = hash(str(dfA))
87 |
88 | with StageLockContext():
89 | # this run should ignore fresh input at source nodes and not change outputs
90 | result = flow.run(cache_validation_mode=CacheValidationMode.IGNORE_FRESH_INPUT)
91 | assert result.successful
92 |
93 | v_out1, v_out2 = result.get(out1), result.get(out2)
94 | pd.testing.assert_frame_equal(dfA_source * 2, v_out1, check_dtype=False)
95 | pd.testing.assert_frame_equal(dfA_source * 4, v_out2, check_dtype=False)
96 |
97 | with StageLockContext():
98 | result = flow.run()
99 | assert result.successful
100 |
101 | v_out1, v_out2 = result.get(out1), result.get(out2)
102 |
103 | pd.testing.assert_frame_equal(dfA * 2, v_out1, check_dtype=False)
104 | pd.testing.assert_frame_equal(dfA * 4, v_out2, check_dtype=False)
105 |
106 |
107 | if __name__ == "__main__":
108 | test_source_invalidation()
109 |
--------------------------------------------------------------------------------
/tests/test_flows/test_sql_text_node.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import pandas as pd
6 | import sqlalchemy as sa
7 |
8 | from pydiverse.pipedag import ConfigContext, Flow, Stage, Table, materialize
9 | from tests.fixtures.instances import with_instances
10 |
11 |
12 | @materialize(input_type=sa.Table, lazy=True)
13 | def table_1(script_path: str):
14 | sql = Path(script_path).read_text(encoding="utf-8")
15 | return Table(sa.text(sql), name="table_1")
16 |
17 |
18 | @materialize(input_type=sa.Table, lazy=True)
19 | def table_2(script_path: str, dependent_table: Table):
20 | sql = (
21 | Path(script_path)
22 | .read_text(encoding="utf-8")
23 | .replace("{{dependent}}", str(dependent_table.original))
24 | )
25 | return Table(sa.text(sql), name="test_table2")
26 |
27 |
28 | @materialize(input_type=pd.DataFrame, lazy=True)
29 | def assert_result(df: pd.DataFrame):
30 | pd.testing.assert_frame_equal(
31 | df, pd.DataFrame({"coltab2": [24]}), check_dtype=False
32 | )
33 |
34 |
35 | @with_instances("postgres", "mssql", "ibm_db2", per_user=True)
36 | def test_sql_node():
37 | instance_name = ConfigContext.get().instance_name
38 |
39 | script_1_name = {
40 | "ibm_db2": "script1-db2.sql",
41 | }.get(instance_name, "script1.sql")
42 | script_2_name = "script2.sql"
43 |
44 | with Flow("FLOW") as flow:
45 | with Stage("schema1"):
46 | parent_dir = Path(__file__).parent
47 | tab1 = table_1(str(parent_dir / "sql_scripts" / script_1_name))
48 | tab2 = table_2(str(parent_dir / "sql_scripts" / script_2_name), tab1)
49 | assert_result(tab2)
50 |
51 | flow_result = flow.run()
52 | assert flow_result.successful
53 |
54 |
55 | if __name__ == "__main__":
56 | test_sql_node()
57 |
--------------------------------------------------------------------------------
/tests/test_indexes.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from pydiverse.pipedag import Flow, Stage
6 |
7 | # Parameterize all tests in this file with several instance_id configurations
8 | from tests.fixtures.instances import DATABASE_INSTANCES, with_instances
9 | from tests.util import tasks_library as m
10 | from tests.util import tasks_library_imperative as m2
11 |
12 | pytestmark = [with_instances(DATABASE_INSTANCES)]
13 |
14 |
15 | @pytest.mark.parametrize(
16 | "task",
17 | [
18 | m.simple_dataframe,
19 | m.simple_dataframe_with_pk,
20 | m.simple_dataframe_with_pk2,
21 | m.simple_dataframe_with_index,
22 | m.simple_dataframe_with_indexes,
23 | m.simple_dataframes_with_indexes,
24 | m.simple_lazy_table,
25 | m.simple_lazy_table_with_pk,
26 | m.simple_lazy_table_with_pk2,
27 | m.simple_lazy_table_with_index,
28 | m.simple_lazy_table_with_indexes,
29 | m2.simple_lazy_table,
30 | m2.simple_lazy_table_with_pk,
31 | ],
32 | )
33 | def test_materialize_table_with_indexes(task):
34 | with Flow("flow") as f:
35 | with Stage("stage"):
36 | x = task()
37 |
38 | m.assert_table_equal(x, x)
39 | m.check_pk_length(x)
40 |
41 | assert f.run().successful
42 |
--------------------------------------------------------------------------------
/tests/test_raw_sql/scripts/mssql/create_tables/simple_tables.sql:
--------------------------------------------------------------------------------
1 | SELECT 1 as x, 1 as y INTO {{out_schema}}.table_1;
2 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 2);
3 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 3);
4 | GO
5 |
6 | SELECT 1 as x, 1 as y INTO {{out_schema}}.table_2;
7 | INSERT INTO {{out_schema}}.table_2 VALUES (2, 2);
8 | INSERT INTO {{out_schema}}.table_2 VALUES (3, 3);
9 | GO
10 |
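11 | -- SELECT ... INTO creates the target table from the query's result types;
12 | -- GO splits the script into separate batches for the mssql runner.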
--------------------------------------------------------------------------------
/tests/test_raw_sql/scripts/mssql/schema_swap/check_objects.sql:
--------------------------------------------------------------------------------
1 | -- Test that table exists
2 | SELECT 1 FROM {{in_schema}}.t;
3 | GO
4 |
5 | -- Test that view exists
6 | SELECT 1 FROM {{in_schema}}.v;
7 | GO
8 |
9 | -- Test that procedure exists
10 | EXEC {{in_schema}}.p 1;
11 | GO
12 |
13 | -- Test that function exists
14 | SELECT ({{in_schema}}.f (1, 2));
15 | GO
16 |
17 |
--------------------------------------------------------------------------------
/tests/test_raw_sql/scripts/mssql/schema_swap/create_objects.sql:
--------------------------------------------------------------------------------
1 | -- Create a table
2 | SELECT 1 as x, 2 as y INTO {{out_schema}}.t;
3 | GO
4 |
5 | -- Create a view
6 | CREATE VIEW {{out_schema}}.v AS SELECT * FROM t;
7 | GO
8 |
9 | -- Create a procedure
10 | CREATE PROC {{out_schema}}.p(@id INT) AS
11 | BEGIN
12 | SELECT *
13 | FROM t
14 | WHERE x = @id
15 | END;
16 | GO
17 |
18 | -- Create a function
19 | CREATE FUNCTION {{out_schema}}.f(@x INT, @y INT)
20 | RETURNS INT
21 | AS
22 | BEGIN
23 | RETURN (@x + @y)
24 | END;
25 | GO
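26 |
27 | -- Note: the unqualified references to t inside the view and the procedure
28 | -- resolve against the module's own schema first, which is what lets these
29 | -- objects keep working after the schema swap.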
--------------------------------------------------------------------------------
/tests/test_raw_sql/scripts/postgres/create_tables/simple_tables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE {{out_schema}}.table_1 AS SELECT 1 as x, 1 as y;
2 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 2);
3 | INSERT INTO {{out_schema}}.table_1 VALUES (1, 3);
4 |
5 | CREATE TABLE {{out_schema}}.table_2 AS SELECT 1 as x, 1 as y;
6 | INSERT INTO {{out_schema}}.table_2 VALUES (2, 2);
7 | INSERT INTO {{out_schema}}.table_2 VALUES (3, 3);
8 |
--------------------------------------------------------------------------------
/tests/test_raw_sql/test_raw_sql_input.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import pandas as pd
6 |
7 | from pydiverse.pipedag import Flow, Stage, materialize
8 | from pydiverse.pipedag.context import ConfigContext
9 | from tests.fixtures.instances import with_instances
10 | from tests.test_raw_sql.util import sql_script
11 |
12 |
13 | @materialize(input_type=pd.DataFrame)
14 | def raw_sql_object(raw_sql):
15 | df_1 = raw_sql["table_1"]
16 | df_2 = raw_sql["table_2"]
17 |
18 | assert not df_1.empty
19 | assert not df_2.empty
20 |
21 |
22 | @materialize(input_type=pd.DataFrame)
23 | def raw_sql_individual_table(df_1):
24 | assert not df_1.empty
25 |
26 |
27 | @with_instances("postgres", "mssql")
28 | def test_raw_sql_task_input():
29 | instance_name = ConfigContext.get().instance_name
30 | dir_ = Path(__file__).parent / "scripts" / instance_name / "create_tables"
31 |
32 | with Flow() as f:
33 | with Stage("raw_0"):
34 | simple_tables = sql_script("simple_tables.sql", dir_)
35 |
36 | raw_sql_object(simple_tables)
37 | raw_sql_individual_table(simple_tables["table_1"])
38 | raw_sql_individual_table(simple_tables["table_2"])
39 |
40 |     # Run the flow twice; the second run exercises the cache-valid path.
41 |     f.run()
42 |     f.run()
43 |
--------------------------------------------------------------------------------
/tests/test_raw_sql/test_raw_sql_schema_swap.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | from pydiverse.pipedag import Flow, Stage
6 | from pydiverse.pipedag.context import ConfigContext, FinalTaskState
7 | from tests.fixtures.instances import with_instances
8 | from tests.test_raw_sql.util import sql_script
9 |
10 |
11 | # TODO: Extend tests for other backends
12 | @with_instances("mssql")
13 | def test_raw_sql_schema_swap():
14 |     # This test creates various objects in one schema and then checks
15 |     # that they still work correctly after the schema swap.
16 |
17 | instance_name = ConfigContext.get().instance_name
18 | dir_ = Path(__file__).parent / "scripts" / instance_name / "schema_swap"
19 |
20 | with Flow() as f:
21 | with Stage("raw_0") as raw_0:
22 | sql_1 = sql_script("create_objects.sql", dir_)
23 | with Stage("raw_1"):
24 | sql_2 = sql_script(
25 | "check_objects.sql", dir_, input_stage=raw_0, depend=[sql_1]
26 | )
27 |
28 | f.run()
29 |
30 | # Check that running the flow again results in the cache being used
31 | for _ in range(2):
32 | result = f.run()
33 | assert result.task_states[sql_1] == FinalTaskState.CACHE_VALID
34 | assert result.task_states[sql_2] == FinalTaskState.CACHE_VALID
35 |
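36 | # Background (hedged): the SQL table store materializes each stage into a
37 | # transaction schema and swaps it with the stage's main schema on commit;
38 | # objects created via raw SQL must survive that swap, which is what the
39 | # check_objects.sql script verifies from the second stage.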
--------------------------------------------------------------------------------
/tests/test_raw_sql/util.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import sqlalchemy as sa
6 |
7 | from pydiverse.pipedag import Stage, materialize
8 | from pydiverse.pipedag.container import RawSql
9 | from pydiverse.pipedag.context import ConfigContext, TaskContext
10 |
11 |
12 | @materialize(input_type=sa.Table, lazy=True)
13 | def sql_script(
14 | name: str,
15 | script_directory: Path,
16 | *,
17 | input_stage=None,
18 | depend=None,
19 | ):
20 | _ = depend # only relevant for adding additional task dependency
21 | stage = TaskContext.get().task.stage
22 |
23 | script_path = script_directory / name
24 | sql = Path(script_path).read_text(encoding="utf-8")
25 | sql = raw_sql_bind_schema(sql, "out_", stage, transaction=True)
26 | sql = raw_sql_bind_schema(sql, "in_", input_stage)
27 | return RawSql(sql)
28 |
29 |
30 | def raw_sql_bind_schema(
31 | sql, prefix: str, stage: Stage | RawSql | None, *, transaction=False
32 | ):
33 |     """Replace the {{<prefix>schema}} placeholder in sql with the stage's schema name."""
34 |     config = ConfigContext.get()
35 |     store = config.store.table_store
36 |     if stage is not None:
37 |         stage_name = stage.transaction_name if transaction else stage.name
38 |         schema_name = store.get_schema(stage_name).get()
39 |         sql = sql.replace(f"{{{{{prefix}schema}}}}", schema_name)
40 |     return sql
41 |
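42 | # Example (hedged): if sql contains "CREATE TABLE {{out_schema}}.t ..." and
43 | # the stage's transaction schema resolves to, say, "stage_raw_0__tmp", then
44 | #     raw_sql_bind_schema(sql, "out_", stage, transaction=True)
45 | # rewrites it to "CREATE TABLE stage_raw_0__tmp.t ...".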
--------------------------------------------------------------------------------
/tests/test_sql_ddl.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pydiverse.pipedag.backend.table.sql.ddl import insert_into_in_query
4 |
5 |
6 | def test_insert_into():
7 | test_pairs = {
8 | "Select 1": "Select 1 INTO a.b",
9 | "Select 1 as _from": "Select 1 as _from INTO a.b",
10 | "Select 1 as afrom": "Select 1 as afrom INTO a.b",
11 | "Select 1 WHERE TRUE": "Select 1 INTO a.b WHERE TRUE",
12 | "Select 1 GROUP\nBY x": "Select 1 INTO a.b GROUP\nBY x",
13 | "Select 1 FROM A GROUP BY x": "Select 1 INTO a.b FROM A GROUP BY x",
14 | "Select 1 UNION ALL SELECT 2": "Select 1 INTO a.b UNION ALL SELECT 2",
15 | "Select 1 From X": "Select 1 INTO a.b From X",
16 | "Select (SELECT 1 FROM Y) From X": "Select (SELECT 1 FROM Y) INTO a.b From X",
17 | "Select (SELECT (SELECT 1 FROM Z) FROM Y) From X": (
18 | "Select (SELECT (SELECT 1 FROM Z) FROM Y) INTO a.b From X"
19 | ),
20 | "Select a.[from] from a": "Select a.[from] INTO a.b from a",
21 | "Select a.[ from ] from a": "Select a.[ from ] INTO a.b from a",
22 | 'Select "from" from a': 'Select "from" INTO a.b from a',
23 | }
24 | for raw_query, expected_query in test_pairs.items():
25 | res = insert_into_in_query(raw_query, "a", "b")
26 | assert res == expected_query
27 |
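28 | # Taken together, the pairs pin down the contract of insert_into_in_query:
29 | # "INTO a.b" is inserted into the top-level SELECT just before its
30 | # FROM/WHERE/GROUP BY/UNION clause (or appended when none is present), while
31 | # FROMs inside subqueries and bracket- or quote-escaped identifiers are
32 | # left untouched.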
--------------------------------------------------------------------------------
/tests/test_sql_dialect/scripts/lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/test_sql_dialect/scripts/lock
--------------------------------------------------------------------------------
/tests/test_sql_dialect/scripts/simple_nicknames.sql:
--------------------------------------------------------------------------------
1 | BEGIN
2 | IF EXISTS (SELECT * FROM SYSCAT.WRAPPERS WHERE WRAPNAME = 'DRDA')
3 | THEN EXECUTE IMMEDIATE 'DROP WRAPPER DRDA';
4 | END IF;
5 | END|
6 | CREATE WRAPPER DRDA|
7 | CREATE SERVER remote_db TYPE DB2/LUW VERSION 11 WRAPPER DRDA
8 | AUTHORIZATION "db2inst1" PASSWORD "password" OPTIONS (
9 | HOST '127.0.0.1', PORT '50000', DBNAME 'testdb'
10 | )|
11 |
12 | CREATE NICKNAME {{out_schema}}.nick1 FOR remote_db.{{out_schema}}.{{out_table}}|
13 | CREATE NICKNAME {{out_schema}}.nick2 FOR remote_db.{{out_schema}}.{{out_table}}|
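14 |
15 | -- The statements above end with "|" instead of ";" so that the compound
16 | -- BEGIN ... END blocks can contain semicolons; the runner is expected to
17 | -- split the script on this alternative terminator before sending it to DB2.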
--------------------------------------------------------------------------------
/tests/test_sql_dialect/scripts/simple_table_spaces.sql:
--------------------------------------------------------------------------------
1 | BEGIN
2 | IF NOT EXISTS (SELECT * FROM SYSCAT.TABLESPACES WHERE TBSPACE = 'S1')
3 | THEN EXECUTE IMMEDIATE 'CREATE TABLESPACE S1';
4 | END IF;
5 | END|
6 | BEGIN
7 | IF NOT EXISTS (SELECT * FROM SYSCAT.TABLESPACES WHERE TBSPACE = 'S2')
8 | THEN EXECUTE IMMEDIATE 'CREATE TABLESPACE S2';
9 | END IF;
10 | END|
11 | BEGIN
12 | IF NOT EXISTS (SELECT * FROM SYSCAT.TABLESPACES WHERE TBSPACE = 'S3')
13 | THEN EXECUTE IMMEDIATE 'CREATE TABLESPACE S3';
14 | END IF;
15 | END|
16 |
--------------------------------------------------------------------------------
/tests/test_sql_dialect/test_postgres.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import uuid
4 |
5 | import pandas as pd
6 | import sqlalchemy as sa
7 | import structlog
8 |
9 | from pydiverse.pipedag import Flow, Stage, materialize
10 | from pydiverse.pipedag.context import ConfigContext
11 | from tests.fixtures.instances import with_instances
12 |
13 |
14 | @with_instances("postgres", "postgres_unlogged")
15 | def test_postgres_unlogged():
16 | @materialize(version="1.0.0")
17 | def dataframe(manual_invalidate):
18 | _ = manual_invalidate
19 | return pd.DataFrame({"x": [1]})
20 |
21 | @materialize(lazy=True)
22 | def sql_table(manual_invalidate):
23 | _ = manual_invalidate
24 | return sa.select(sa.literal(1).label("x"))
25 |
26 | @materialize(input_type=sa.Table)
27 | def get_relpersistence(table: sa.sql.expression.Alias):
28 | return sa.text(
29 | """
30 | SELECT relpersistence
31 | FROM pg_class
32 | LEFT JOIN pg_namespace ON pg_class.relnamespace = pg_namespace.oid
33 | WHERE nspname = :schema
34 | AND relname = :name
35 | """
36 | ).bindparams(
37 | schema=str(table.original.schema),
38 | name=str(table.original.name),
39 | )
40 |
41 | @materialize(input_type=pd.DataFrame)
42 | def assert_relpersistence(df: pd.DataFrame):
43 | relpersistence = (
44 | "u"
45 | if ConfigContext.get()
46 | .store.table_store.materialization_details["__any__"]
47 | .unlogged
48 |             else "p"  # pg_class.relpersistence: "u" = unlogged, "p" = permanent
49 | )
50 | assert df["relpersistence"][0] == relpersistence
51 |
52 | def get_flow(manual_invalidate, partial_invalidate):
53 | with Flow() as f:
54 | with Stage("stage"):
55 | df = dataframe(manual_invalidate)
56 | tbl = sql_table(manual_invalidate)
57 | # just to prevent 100% cache validity
58 | _ = sql_table(partial_invalidate)
59 | with Stage("check"):
60 | rp_df = get_relpersistence(df)
61 | rp_tbl = get_relpersistence(tbl)
62 | assert_relpersistence(rp_df)
63 | assert_relpersistence(rp_tbl)
64 | return f
65 |
66 | manual_invalidate = str(uuid.uuid4())
67 | partial_invalidate = str(uuid.uuid4())
68 |
69 | logger = structlog.get_logger("test_postgres_unlogged")
70 | logger.info("1st run")
71 | f = get_flow(manual_invalidate, partial_invalidate)
72 | f.run()
73 |
74 | logger.info("2nd run with 100% cache valid stage")
75 | f.run()
76 |
77 | logger.info("3rd run with partial cache invalid stage")
78 | partial_invalidate = str(uuid.uuid4())
79 | f = get_flow(manual_invalidate, partial_invalidate)
80 | f.run()
81 |
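82 | # Background (hedged): the "postgres_unlogged" instance is assumed to set
83 | # materialization_details so that tables are created UNLOGGED, roughly:
84 | #
85 | #     materialization_details:
86 | #       __any__:
87 | #         unlogged: true
88 | #
89 | # assert_relpersistence then reads the effective setting back from pg_class.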
--------------------------------------------------------------------------------
/tests/test_table_hooks/lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pydiverse/pydiverse.pipedag/df295a3056aec42facf50e3b1ad40416da4b740d/tests/test_table_hooks/lock
--------------------------------------------------------------------------------
/tests/test_table_hooks/test_dtype_polars.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING
4 |
5 | import pytest
6 |
7 | from pydiverse.pipedag.backend.table.util import DType
8 |
9 | pl = pytest.importorskip("polars")
10 |
11 | if TYPE_CHECKING:
12 | import polars as pl
13 |
14 |
15 | def test_dtype_from_polars():
16 | def assert_conversion(type_, expected):
17 | assert DType.from_polars(type_) == expected
18 |
19 | assert_conversion(pl.Int64, DType.INT64)
20 | assert_conversion(pl.Int32, DType.INT32)
21 | assert_conversion(pl.Int16, DType.INT16)
22 | assert_conversion(pl.Int8, DType.INT8)
23 |
24 | assert_conversion(pl.UInt64, DType.UINT64)
25 | assert_conversion(pl.UInt32, DType.UINT32)
26 | assert_conversion(pl.UInt16, DType.UINT16)
27 | assert_conversion(pl.UInt8, DType.UINT8)
28 |
29 | assert_conversion(pl.Float64, DType.FLOAT64)
30 | assert_conversion(pl.Float32, DType.FLOAT32)
31 |
32 | assert_conversion(pl.Utf8, DType.STRING)
33 | assert_conversion(pl.Boolean, DType.BOOLEAN)
34 |
35 | assert_conversion(pl.Date, DType.DATE)
36 | assert_conversion(pl.Time, DType.TIME)
37 | assert_conversion(pl.Datetime, DType.DATETIME)
38 | assert_conversion(pl.Datetime("ms"), DType.DATETIME)
39 | assert_conversion(pl.Datetime("us"), DType.DATETIME)
40 | assert_conversion(pl.Datetime("ns"), DType.DATETIME)
41 |
42 |
43 | def test_dtype_to_polars():
44 | def assert_conversion(type_: DType, expected):
45 | assert type_.to_polars() == expected
46 |
47 | assert_conversion(DType.INT64, pl.Int64)
48 | assert_conversion(DType.INT32, pl.Int32)
49 | assert_conversion(DType.INT16, pl.Int16)
50 | assert_conversion(DType.INT8, pl.Int8)
51 |
52 | assert_conversion(DType.UINT64, pl.UInt64)
53 | assert_conversion(DType.UINT32, pl.UInt32)
54 | assert_conversion(DType.UINT16, pl.UInt16)
55 | assert_conversion(DType.UINT8, pl.UInt8)
56 |
57 | assert_conversion(DType.FLOAT64, pl.Float64)
58 | assert_conversion(DType.FLOAT32, pl.Float32)
59 |
60 | assert_conversion(DType.STRING, pl.Utf8)
61 | assert_conversion(DType.BOOLEAN, pl.Boolean)
62 |
63 | assert_conversion(DType.DATE, pl.Date)
64 | assert_conversion(DType.TIME, pl.Time)
65 | assert_conversion(DType.DATETIME, pl.Datetime("us"))
66 |
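67 | # Every Datetime time unit collapses to DType.DATETIME on the way in, and
68 | # the reverse mapping canonicalizes to microsecond precision, Datetime("us").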
--------------------------------------------------------------------------------
/tests/test_table_hooks/test_dtype_pyarrow.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pyarrow as pa
4 |
5 | from pydiverse.pipedag.backend.table.util import DType
6 |
7 |
8 | def test_dtype_from_pyarrow():
9 | def assert_conversion(type_, expected):
10 | assert DType.from_arrow(type_) == expected
11 |
12 | assert_conversion(pa.int64(), DType.INT64)
13 | assert_conversion(pa.int32(), DType.INT32)
14 | assert_conversion(pa.int16(), DType.INT16)
15 | assert_conversion(pa.int8(), DType.INT8)
16 |
17 | assert_conversion(pa.uint64(), DType.UINT64)
18 | assert_conversion(pa.uint32(), DType.UINT32)
19 | assert_conversion(pa.uint16(), DType.UINT16)
20 | assert_conversion(pa.uint8(), DType.UINT8)
21 |
22 | assert_conversion(pa.float64(), DType.FLOAT64)
23 | assert_conversion(pa.float32(), DType.FLOAT32)
24 | assert_conversion(pa.float16(), DType.FLOAT32)
25 |
26 | assert_conversion(pa.string(), DType.STRING)
27 | assert_conversion(pa.bool_(), DType.BOOLEAN)
28 |
29 | assert_conversion(pa.date32(), DType.DATE)
30 | assert_conversion(pa.date64(), DType.DATE)
31 |
32 | assert_conversion(pa.time32("s"), DType.TIME)
33 | assert_conversion(pa.time32("ms"), DType.TIME)
34 | assert_conversion(pa.time64("us"), DType.TIME)
35 | assert_conversion(pa.time64("ns"), DType.TIME)
36 |
37 | assert_conversion(pa.timestamp("s"), DType.DATETIME)
38 | assert_conversion(pa.timestamp("ms"), DType.DATETIME)
39 | assert_conversion(pa.timestamp("us"), DType.DATETIME)
40 | assert_conversion(pa.timestamp("ns"), DType.DATETIME)
41 |
42 |
43 | def test_dtype_to_pyarrow():
44 | def assert_conversion(type_: DType, expected):
45 | assert type_.to_arrow() == expected
46 |
47 | assert_conversion(DType.INT64, pa.int64())
48 | assert_conversion(DType.INT32, pa.int32())
49 | assert_conversion(DType.INT16, pa.int16())
50 | assert_conversion(DType.INT8, pa.int8())
51 |
52 | assert_conversion(DType.UINT64, pa.uint64())
53 | assert_conversion(DType.UINT32, pa.uint32())
54 | assert_conversion(DType.UINT16, pa.uint16())
55 | assert_conversion(DType.UINT8, pa.uint8())
56 |
57 | assert_conversion(DType.FLOAT64, pa.float64())
58 | assert_conversion(DType.FLOAT32, pa.float32())
59 |
60 | assert_conversion(DType.STRING, pa.string())
61 | assert_conversion(DType.BOOLEAN, pa.bool_())
62 |
63 | assert_conversion(DType.DATE, pa.date32())
64 | assert_conversion(DType.TIME, pa.time64("us"))
65 | assert_conversion(DType.DATETIME, pa.timestamp("us"))
66 |
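67 | # Asymmetric cases worth noting: float16 widens to FLOAT32, date32 and
68 | # date64 both map to DATE, and times/timestamps of any unit convert back to
69 | # the microsecond-precision arrow types.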
--------------------------------------------------------------------------------
/tests/test_table_hooks/test_dtype_sqlalchemy.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import sqlalchemy as sa
4 |
5 | from pydiverse.pipedag.backend.table.util import DType
6 |
7 |
8 | def test_dtype_from_sqlalchemy():
9 | def assert_conversion(type_, expected):
10 | assert DType.from_sql(type_) == expected
11 |
12 | assert_conversion(sa.BigInteger(), DType.INT64)
13 | assert_conversion(sa.Integer(), DType.INT32)
14 | assert_conversion(sa.SmallInteger(), DType.INT16)
15 |
16 | assert_conversion(sa.Numeric(), DType.FLOAT64)
17 | assert_conversion(sa.Numeric(13, 2), DType.FLOAT64)
18 | assert_conversion(sa.Numeric(1, 0), DType.FLOAT64)
19 | assert_conversion(sa.DECIMAL(13, 2), DType.FLOAT64)
20 | assert_conversion(sa.DECIMAL(1, 0), DType.FLOAT64)
21 | assert_conversion(sa.Float(), DType.FLOAT64)
22 | assert_conversion(sa.Float(24), DType.FLOAT32)
23 | assert_conversion(sa.Float(53), DType.FLOAT64)
24 |
25 | assert_conversion(sa.String(), DType.STRING)
26 | assert_conversion(sa.Boolean(), DType.BOOLEAN)
27 |
28 | assert_conversion(sa.Date(), DType.DATE)
29 | assert_conversion(sa.Time(), DType.TIME)
30 | assert_conversion(sa.DateTime(), DType.DATETIME)
31 |
32 |
33 | def test_dtype_to_sqlalchemy():
34 | def assert_conversion(type_: DType, expected):
35 | assert isinstance(type_.to_sql(), expected)
36 |
37 | assert_conversion(DType.INT64, sa.BigInteger)
38 | assert_conversion(DType.INT32, sa.Integer)
39 | assert_conversion(DType.INT16, sa.SmallInteger)
40 | assert_conversion(DType.INT8, sa.SmallInteger)
41 |
42 | assert_conversion(DType.UINT64, sa.BigInteger)
43 | assert_conversion(DType.UINT32, sa.BigInteger)
44 | assert_conversion(DType.UINT16, sa.Integer)
45 | assert_conversion(DType.UINT8, sa.SmallInteger)
46 |
47 | assert_conversion(DType.FLOAT64, sa.Float)
48 | assert_conversion(DType.FLOAT32, sa.Float)
49 |
50 | assert_conversion(DType.STRING, sa.String)
51 | assert_conversion(DType.BOOLEAN, sa.Boolean)
52 |
53 | assert_conversion(DType.DATE, sa.Date)
54 | assert_conversion(DType.TIME, sa.Time)
55 | assert_conversion(DType.DATETIME, sa.DateTime)
56 |
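57 | # The unsigned mappings widen to the next larger signed SQL type (UINT32 ->
58 | # BigInteger, UINT16 -> Integer) since standard SQL has no unsigned
59 | # integers; UINT64 -> BigInteger is the one potentially lossy conversion.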
--------------------------------------------------------------------------------
/tests/test_table_hooks/test_ibis.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import pytest
5 |
6 | from pydiverse.pipedag import *
7 |
8 | # Parameterize all tests in this file with several instance_id configurations
9 | from tests.fixtures.instances import DATABASE_INSTANCES, skip_instances, with_instances
10 | from tests.util.tasks_library import assert_table_equal
11 |
12 | pytestmark = [pytest.mark.ibis, with_instances(DATABASE_INSTANCES)]
13 |
14 |
15 | try:
16 | import ibis
17 | except ImportError:
18 | ibis = None
19 |
20 |
21 | # connectorx (and therefore ibis) has trouble with db2+ibm_db:// URLs and with mssql
22 | @skip_instances("ibm_db2", "mssql")
23 | def test_table_store():
24 | IbisTable = ibis.api.Table
25 |
26 | @materialize()
27 | def in_table():
28 | return Table(
29 | pd.DataFrame(
30 | {
31 | "col": [0, 1, 2, 3],
32 | }
33 | )
34 | )
35 |
36 | @materialize()
37 | def expected_out_table():
38 | return Table(
39 | pd.DataFrame(
40 | {
41 | "col": [0, 1, 2, 3],
42 | "x": [1, 1, 1, 1],
43 | "y": [2, 2, 2, 2],
44 | }
45 | )
46 | )
47 |
48 | @materialize(input_type=IbisTable)
49 | def noop(x):
50 | return Table(x)
51 |
52 | @materialize(lazy=True, input_type=IbisTable)
53 | def noop_lazy(x):
54 | return Table(x)
55 |
56 | @materialize(input_type=IbisTable)
57 | def add_column(x: IbisTable):
58 | return Table(x.mutate(x=ibis.literal(1)))
59 |
60 | @materialize(lazy=True, input_type=IbisTable)
61 | def add_column_lazy(x: IbisTable):
62 | return Table(x.mutate(y=ibis.literal(2)))
63 |
64 | with Flow() as f:
65 | with Stage("ibis"):
66 | table = in_table()
67 | table = noop(table)
68 | table = noop_lazy(table)
69 | table = add_column(table)
70 | table = add_column_lazy(table)
71 |
72 | expected = expected_out_table()
73 | _ = assert_table_equal(table, expected, check_dtype=False)
74 |
75 | assert f.run().successful
76 |
--------------------------------------------------------------------------------
/tests/test_table_hooks/test_pdtransform.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import pytest
5 |
6 | from pydiverse.pipedag import *
7 |
8 | # Parameterize all tests in this file with several instance_id configurations
9 | from tests.fixtures.instances import DATABASE_INSTANCES, with_instances
10 | from tests.util.tasks_library import assert_table_equal
11 |
12 | pytestmark = [pytest.mark.pdtransform, with_instances(DATABASE_INSTANCES)]
13 |
14 | try:
15 | import pydiverse.transform as pdt
16 |
17 | _ = pdt
18 |
19 | try:
20 | from pydiverse.transform.core.verbs import mutate
21 | from pydiverse.transform.eager import PandasTableImpl
22 | from pydiverse.transform.lazy import SQLTableImpl
23 |
24 |         # mark the import as used so automated import-cleanup tools don't remove it
25 | _ = PandasTableImpl
26 |
27 | test_list = [SQLTableImpl, PandasTableImpl]
28 | except ImportError:
29 | try:
30 | from pydiverse.transform.extended import Pandas, Polars, SqlAlchemy, mutate
31 |
32 | test_list = [SqlAlchemy, Polars, Pandas]
33 | except ImportError:
34 | raise NotImplementedError(
35 | "pydiverse.transform 0.2.0 - 0.2.2 isn't supported"
36 | ) from None
37 | except ImportError:
38 | test_list = []
39 |
40 |
41 | @pytest.mark.parametrize(
42 | "impl_type",
43 | test_list,
44 | )
45 | def test_table_store(impl_type: type):
46 | def cache_fn(*args, **kwargs):
47 | return impl_type.__name__
48 |
49 | @materialize()
50 | def in_table():
51 | return Table(
52 | pd.DataFrame(
53 | {
54 | "col": [0, 1, 2, 3],
55 | }
56 | )
57 | )
58 |
59 | @materialize()
60 | def expected_out_table():
61 | return Table(
62 | pd.DataFrame(
63 | {
64 | "col": [0, 1, 2, 3],
65 | "x": [1, 1, 1, 1],
66 | "y": [2, 2, 2, 2],
67 | }
68 | )
69 | )
70 |
71 | @materialize(input_type=impl_type, cache=cache_fn)
72 | def noop(x):
73 | return Table(x)
74 |
75 | @materialize(lazy=True, input_type=impl_type, cache=cache_fn)
76 | def noop_lazy(x):
77 | return Table(x)
78 |
79 | @materialize(input_type=impl_type, cache=cache_fn)
80 | def add_column(x):
81 | return Table(x >> mutate(x=1))
82 |
83 | @materialize(lazy=True, input_type=impl_type, cache=cache_fn)
84 | def add_column_lazy(x):
85 | return Table(x >> mutate(y=2))
86 |
87 | with Flow() as f:
88 | with Stage("pdtransform"):
89 | table = in_table()
90 | table = noop(table)
91 | table = noop_lazy(table)
92 | table = add_column(table)
93 | table = add_column_lazy(table)
94 |
95 | expected = expected_out_table()
96 | _ = assert_table_equal(table, expected, check_dtype=False)
97 |
98 | assert f.run().successful
99 |
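100 | # The nested try/except above picks matching symbols across
101 | # pydiverse.transform releases: older versions expose
102 | # SQLTableImpl/PandasTableImpl, newer ones expose Pandas/Polars/SqlAlchemy,
103 | # and the 0.2.0 - 0.2.2 range, which has neither layout, is rejected.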
--------------------------------------------------------------------------------
/tests/test_table_hooks/test_tidypolars.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from pydiverse.pipedag import *
6 |
7 | # Parameterize all tests in this file with several instance_id configurations
8 | from tests.fixtures.instances import DATABASE_INSTANCES, skip_instances, with_instances
9 | from tests.util.tasks_library import assert_table_equal
10 |
11 | pytestmark = [
12 | pytest.mark.polars,
13 | with_instances(DATABASE_INSTANCES),
14 | skip_instances("duckdb"),
15 | ]
16 |
17 |
18 | try:
19 | import tidypolars as tp
20 | except ImportError:
21 | tp = None
22 |
23 |
24 | @pytest.mark.skipif(tp is None, reason="Test requires tidypolars to be installed")
25 | def test_table_store():
26 | @materialize()
27 | def in_table():
28 | return Table(
29 | tp.Tibble(
30 | {
31 | "col": [0, 1, 2, 3],
32 | }
33 | )
34 | )
35 |
36 | @materialize()
37 | def expected_out_table():
38 | return Table(
39 | tp.Tibble(
40 | {
41 | "col": [0, 1, 2, 3],
42 | "x": [1, 1, 1, 1],
43 | "y": [2, 2, 2, 2],
44 | }
45 | )
46 | )
47 |
48 | @materialize(input_type=tp.Tibble)
49 | def noop(x):
50 | return Table(x)
51 |
52 | @materialize(lazy=True, input_type=tp.Tibble)
53 | def noop_lazy(x):
54 | return Table(x)
55 |
56 | @materialize(input_type=tp.Tibble)
57 | def add_column(x):
58 | return Table(x.mutate(x=1))
59 |
60 | @materialize(lazy=True, input_type=tp.Tibble)
61 | def add_column_lazy(x):
62 | return Table(x.mutate(y=2))
63 |
64 | with Flow() as f:
65 | with Stage("tidypolars"):
66 | table = in_table()
67 | table = noop(table)
68 | table = noop_lazy(table)
69 | table = add_column(table)
70 | table = add_column_lazy(table)
71 |
72 | expected = expected_out_table()
73 | _ = assert_table_equal(table, expected, check_dtype=False)
74 |
75 | assert f.run().successful
76 |
--------------------------------------------------------------------------------
/tests/test_unicode.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | import sqlalchemy as sa
5 |
6 | from pydiverse.pipedag import Flow, Stage, materialize
7 | from pydiverse.pipedag.context import StageLockContext
8 |
9 | # Parameterize all tests in this file with several instance_id configurations
10 | from tests.fixtures.instances import (
11 | ALL_INSTANCES,
12 | ORCHESTRATION_INSTANCES,
13 | skip_instances,
14 | with_instances,
15 | )
16 | from tests.util import tasks_library as m
17 | from tests.util.tasks_library import simple_dataframe
18 |
19 | pytestmark = [with_instances(ALL_INSTANCES, ORCHESTRATION_INSTANCES)]
20 |
21 |
22 | def test_unicode(unicode_str="äöüßéç"):
23 | @materialize(lazy=True, input_type=sa.Table)
24 | def unicode(src):
25 | return sa.select(sa.literal(unicode_str).label("a")).select_from(src).limit(1)
26 |
27 | with Flow("flow") as f:
28 | with Stage("stage"):
29 | dummy_source = simple_dataframe()
30 | x = unicode(dummy_source)
31 | x2 = m.noop(x)
32 | x3 = m.noop_lazy(x2)
33 | m.assert_table_equal(x, x2)
34 | m.assert_table_equal(x, x3)
35 |
36 | with StageLockContext():
37 | result = f.run()
38 | assert result.successful
39 | assert result.get(x3, as_type=pd.DataFrame)["a"][0] == unicode_str
40 |
41 |
42 | @skip_instances("mssql", "mssql_pytsql")
43 | def test_unicode_beyond_mssql():
44 | test_unicode("λ")
45 |
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import traceback
4 |
5 | import pytest
6 |
7 | from pydiverse.pipedag.errors import DisposedError
8 | from pydiverse.pipedag.util import Disposable, requires
9 |
10 |
11 | def test_requires():
12 | @requires(None, ImportError("Some Error"))
13 | class BadClass:
14 | a = 1
15 | b = 2
16 |
17 | # Shouldn't be able to create instance
18 | with pytest.raises(ImportError, match="Some Error"):
19 | BadClass()
20 |
21 | # Shouldn't be able to access class attribute
22 | with pytest.raises(ImportError, match="Some Error"):
23 | _ = BadClass.a
24 |
25 | # If all requirements are fulfilled, nothing should change
26 | @requires((pytest,), Exception("This shouldn't happen"))
27 | class GoodClass:
28 | a = 1
29 |
30 | _ = GoodClass()
31 | _ = GoodClass.a
32 |
33 |
34 | def test_disposable():
35 | class Foo(Disposable):
36 | a = 1
37 |
38 | def bar(self):
39 | return 2
40 |
41 | x = Foo()
42 |
43 | assert x.a == 1
44 | assert x.bar() == 2
45 |
46 | x.dispose()
47 |
48 | with pytest.raises(DisposedError):
49 | _ = x.a
50 | with pytest.raises(DisposedError):
51 |         x.bar()
52 | with pytest.raises(DisposedError):
53 | x.dispose()
54 | with pytest.raises(DisposedError):
55 | x.a = 1
56 |
57 |
58 | def test_format_exception():
59 |     # The signature of traceback.format_exception changed between Python 3.9
60 |     # and 3.10, so we use traceback.format_exc() instead.
61 | try:
62 | raise RuntimeError("this error is intended by test")
63 | except RuntimeError:
64 | trace = traceback.format_exc()
65 | assert 'RuntimeError("this error is intended by test")' in trace
66 | assert "test_util.py" in trace
67 |
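68 | # As exercised above, requires(dependency, error) guards a class behind an
69 | # importable dependency: when the dependency is missing, instantiating the
70 | # class or touching its attributes raises the given error; otherwise the
71 | # class behaves as if undecorated.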
--------------------------------------------------------------------------------
/tests/util/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from tests.util.pytest_raises import swallowing_raises
4 | from tests.util.sql import compile_sql, select_as
5 |
--------------------------------------------------------------------------------
/tests/util/dask_patch.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from io import BytesIO
4 |
5 | # Patch pytest EncodedFile (from pytest-capture plugin) to be pickleable
6 | # https://github.com/mariusvniekerk/pytest-dask/blob/master/pytest_dask/serde_patch.py
7 | from _pytest.capture import EncodedFile
8 |
9 |
10 | def apply_getsetstate(cls):
11 | def inner(ref):
12 | cls.__getstate__ = ref.__getstate__
13 | cls.__reduce__ = ref.__reduce__
14 | cls.__reduce_ex__ = ref.__reduce_ex__
15 | return cls
16 |
17 | return inner
18 |
19 |
20 | @apply_getsetstate(EncodedFile)
21 | class _EncodedFile:
22 | def __getstate__(self):
23 | assert isinstance(self, EncodedFile)
24 |         current_position = self.buffer.seek(0, 1)  # seek(0, 1) returns the current position
25 | self.buffer.seek(0)
26 | value = self.buffer.read()
27 | self.buffer.seek(current_position, 0)
28 | return {"value": value, "encoding": self.encoding}
29 |
30 | def __reduce__(self):
31 | state = self.__getstate__()
32 | return self.__class__, (BytesIO(state["value"]), state["encoding"])
33 |
34 | def __reduce_ex__(self, protocol):
35 | _ = protocol
36 | return self.__reduce__()
37 |
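38 | # The decorator copies the pickle hooks defined on _EncodedFile onto
39 | # pytest's EncodedFile, so captured output streams can be serialized and
40 | # shipped to dask workers.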
--------------------------------------------------------------------------------
/tests/util/pytest_raises.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import contextlib
4 |
5 | import pytest
6 |
7 | from pydiverse.pipedag import ConfigContext
8 |
9 |
10 | @contextlib.contextmanager
11 | def swallowing_raises(*args, **kwargs):
12 | with ConfigContext.get().evolve(swallow_exceptions=True):
13 | with pytest.raises(*args, **kwargs) as raises:
14 | yield raises
15 |
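16 | # Usage sketch (hedged): ConfigContext.evolve() yields a modified copy of
17 | # the active config, so swallow_exceptions=True applies only inside the
18 | # block:
19 | #
20 | #     with swallowing_raises(ValueError, match="boom"):
21 | #         flow.run()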
--------------------------------------------------------------------------------
/tests/util/spy.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import copy
4 | import unittest.mock
5 |
6 | from pydiverse.pipedag.core.task import Task, TaskGetItem
7 | from pydiverse.pipedag.materialize.core import MaterializingTask
8 |
9 |
10 | class PipedagMock:
11 | def __init__(self, mock: unittest.mock.Mock):
12 | self.mock = mock
13 | self._last_call_count = mock.call_count
14 |
15 | def reset_call_count(self):
16 | self._last_call_count = self.mock.call_count
17 |
18 | def _calls_since_last_time(self):
19 | delta = self.mock.call_count - self._last_call_count
20 | self.reset_call_count()
21 | return delta
22 |
23 | def _assert_call_count(self, n):
24 | __tracebackhide__ = True
25 | m = self._calls_since_last_time()
26 | if n == m:
27 | return
28 | name = self.mock.mock.__dict__["_mock_name"]
29 | msg = (
30 | f"Expected function '{name}' to have been called {n} times, but it has"
31 | f" been called {m} times ({self.mock.call_count} times in total)."
32 | )
33 | raise AssertionError(msg)
34 |
35 | def assert_not_called(self):
36 | __tracebackhide__ = True
37 | self._assert_call_count(0)
38 |
39 | def assert_called_once(self):
40 | __tracebackhide__ = True
41 | self._assert_call_count(1)
42 |
43 | def assert_called(self, times):
44 | __tracebackhide__ = True
45 | self._assert_call_count(times)
46 |
47 |
48 | def spy_task(mocker, task) -> PipedagMock:
49 | if isinstance(task, TaskGetItem):
50 | task = task.task
51 | if isinstance(task, MaterializingTask):
52 | task.fn = copy.copy(task.fn)
53 | spy = mocker.spy(task.fn, "fn")
54 | elif isinstance(task, Task):
55 | task_fn = task.fn
56 |
57 | def fn(*args, **kwargs):
58 | return task_fn(*args, **kwargs)
59 |
60 | task.fn = fn
61 | spy = mocker.spy(task, "fn")
62 | else:
63 | raise TypeError("Expected object of type Task or TaskGetItem")
64 |
65 | spy.mock.__dict__["_mock_name"] = task.name
66 | return PipedagMock(spy)
67 |
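68 | # spy_task wraps a task's callable in a mocker spy so tests can assert how
69 | # often the task body actually executed, e.g. to distinguish real runs from
70 | # cache-valid runs that skip the function entirely.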
--------------------------------------------------------------------------------
/tests/util/sql.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import copy
4 |
5 | import sqlalchemy as sa
6 |
7 | from pydiverse.pipedag.backend import BaseTableStore
8 | from pydiverse.pipedag.context import ConfigContext
9 |
10 |
11 | def select_as(value, as_):
12 | return sa.select(sa.literal(value).label(as_))
13 |
14 |
15 | def sql_table_expr(cols: dict):
16 | num_values = {len(vals) for vals in cols.values()}
17 |     assert len(num_values) == 1  # every column must provide the same number of rows
18 |
19 | queries = []
20 | num_values = num_values.pop()
21 | for i in range(num_values):
22 | literals = []
23 | for col, vals in cols.items():
24 | literals.append(sa.literal(vals[i]).label(col))
25 |
26 | queries.append(sa.select(*literals))
27 |
28 | return sa.union_all(*queries)
29 |
30 |
31 | def compile_sql(query):
32 | engine = ConfigContext.get().store.table_store.engine
33 | return str(query.compile(engine, compile_kwargs={"literal_binds": True}))
34 |
35 |
36 | def get_config_with_table_store(
37 | base_cfg: ConfigContext, table_store_class: type[BaseTableStore]
38 | ):
39 | instance = base_cfg.instance_name
40 | flow = base_cfg.flow_name
41 | cfg = ConfigContext.new(
42 | copy.deepcopy(base_cfg._config_dict), base_cfg.pipedag_name, flow, instance
43 | )
44 | cfg._config_dict["table_store"]["class"] = table_store_class
45 | # this actually instantiates the table store
46 | table_store = cfg.store.table_store
47 |     assert type(table_store) is table_store_class
48 | return cfg
49 |
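50 | # Example (hedged): sql_table_expr({"x": [1, 2], "y": ["a", "b"]}) builds
51 | # SELECT 1 AS x, 'a' AS y UNION ALL SELECT 2 AS x, 'b' AS y, i.e. a literal
52 | # table that works across all tested backends.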
--------------------------------------------------------------------------------