├── .codeclimate.yml ├── .github ├── CODEOWNERS ├── pull_request_template.md └── workflows │ ├── prepare-release.yml │ ├── publish.yml │ └── python-package.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── dbt_airflow_factory ├── __init__.py ├── airflow_dag_factory.py ├── bash │ ├── __init__.py │ ├── bash_operator.py │ ├── bash_parameters.py │ └── bash_parameters_loader.py ├── builder_factory.py ├── config_utils.py ├── constants.py ├── dbt_parameters.py ├── ecs │ ├── __init__.py │ ├── ecs_operator.py │ ├── ecs_parameters.py │ └── ecs_parameters_loader.py ├── ingestion.py ├── k8s │ ├── __init__.py │ ├── k8s_operator.py │ ├── k8s_parameters.py │ └── k8s_parameters_loader.py ├── notifications │ ├── __init__.py │ ├── handler.py │ ├── ms_teams_webhook_hook.py │ └── ms_teams_webhook_operator.py ├── operator.py ├── tasks.py └── tasks_builder │ ├── __init__.py │ ├── builder.py │ └── parameters.py ├── docs ├── Makefile ├── api.rst ├── changelog.md ├── cli.rst ├── conf.py ├── configuration.rst ├── features.rst ├── images │ ├── dag.png │ ├── downstream.png │ ├── ephemeral.png │ ├── gateway.png │ ├── grouped.png │ ├── ingestions_tasks.png │ ├── msteams_notification.png │ ├── slack_notification.png │ ├── tests.png │ └── upstream.png ├── index.rst ├── installation.md ├── requirements.txt ├── source │ ├── dbt_airflow_factory.ecs.rst │ ├── dbt_airflow_factory.k8s.rst │ └── dbt_airflow_factory.rst └── usage.rst ├── pyproject.toml ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── config │ ├── airbyte_dev │ │ ├── airbyte.yml │ │ ├── airflow_seed_disabled.yml │ │ ├── airflow_seed_enabled.yml │ │ ├── ingestion_disabled.yml │ │ └── ingestion_enabled.yml │ ├── airflow_vars │ │ └── airflow.yml │ ├── base │ │ ├── airflow.yml │ │ ├── dbt.yml │ │ ├── execution_env.yml │ │ └── k8s.yml │ ├── dev │ │ └── dbt.yml │ ├── ephemeral_operator │ │ └── airflow.yml │ ├── gateway │ │ └── airflow.yml │ ├── gateway_source │ │ └── airflow.yml │ ├── no_ephemeral_operator │ │ └── airflow.yml │ ├── no_gateway │ │ └── airflow.yml │ ├── no_task_group │ │ └── airflow.yml │ ├── notifications_slack │ │ └── airflow.yml │ ├── notifications_teams │ │ └── airflow.yml │ ├── qa │ │ ├── datahub.yml │ │ ├── dbt.yml │ │ ├── execution_env.yml │ │ └── k8s.yml │ ├── task_group │ │ └── airflow.yml │ └── vars │ │ └── dbt.yml ├── manifest.json ├── manifest_ephemeral.json ├── manifest_gateway.json ├── manifest_gateway_source.json ├── manifest_task_group_tests.json ├── teams_webhook_expected_paylaod.json ├── test_config_propagation.py ├── test_config_propagation_qa.py ├── test_dag_dependencies.py ├── test_dag_factory.py ├── test_dependencies.py ├── test_edges.py ├── test_ephemeral_operator.py ├── test_notifications.py ├── test_task_group.py ├── test_tasks.py └── utils.py └── tox.ini /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" # required to adjust maintainability checks 2 | 3 | checks: 4 | argument-count: 5 | enabled: true 6 | config: 7 | threshold: 6 8 | complex-logic: 9 | enabled: true 10 | config: 11 | threshold: 4 12 | file-lines: 13 | enabled: true 14 | config: 15 | threshold: 250 16 | method-complexity: 17 | enabled: true 18 | config: 19 | threshold: 5 20 | method-count: 21 | enabled: true 22 | config: 23 | threshold: 20 24 | method-lines: 25 | enabled: true 26 | config: 27 | threshold: 25 28 | nested-control-flow: 29 | enabled: true 30 | config: 31 | threshold: 4 32 | 
return-statements: 33 | enabled: true 34 | config: 35 | threshold: 4 36 | similar-code: 37 | enabled: true 38 | config: 39 | threshold: #language-specific defaults. overrides affect all languages. 40 | identical-code: 41 | enabled: true 42 | config: 43 | threshold: #language-specific defaults. overrides affect all languages. 44 | 45 | plugins: 46 | pylint: 47 | enabled: true 48 | checks: 49 | import-error: 50 | enabled: false 51 | bad-continuation: 52 | enabled: false 53 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Define global code owners 2 | * @p-pekala -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | `` 2 | 3 | Resolves `` 4 | 5 | --- 6 | Keep in mind: 7 | - [ ] Documentation updates 8 | - [ ] [Changelog](CHANGELOG.md) updates -------------------------------------------------------------------------------- /.github/workflows/prepare-release.yml: -------------------------------------------------------------------------------- 1 | name: Prepare release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version_part: 7 | description: The part of the version to update (patch, minor or major) 8 | required: true 9 | default: 'minor' 10 | 11 | jobs: 12 | prepare-release: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.8"] 17 | env: 18 | PYTHON_PACKAGE: dbt_airflow_factory 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Validate inputs 26 | run: | 27 | echo "INPUT_VERSION_PART: ${{ github.event.inputs.version_part }}" 28 | python -c "if '${{ github.event.inputs.version_part }}' not in ['patch', 'minor', 'major']: raise ValueError(\"'${{ github.event.inputs.version_part }}' must be one of ['patch', 'minor', 'major'])\")" 29 | - name: Bump the version number # bump2version is a maintained fork of original bumpversion 30 | id: bump_version 31 | run: | 32 | pip install bump2version 33 | bump2version ${{ github.event.inputs.version_part }} 34 | echo "::set-output name=package_version::$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" 35 | - name: Update the CHANGELOG according to 'Keep a Changelog' guidelines 36 | uses: thomaseizinger/keep-a-changelog-new-release@v1 37 | with: 38 | version: ${{ steps.bump_version.outputs.package_version }} 39 | - name: Create a new release branch 40 | run: | 41 | git config user.name github-actions 42 | git config user.email github-actions@github.com 43 | git checkout -b release-${{ steps.bump_version.outputs.package_version }} 44 | git push -u origin release-${{ steps.bump_version.outputs.package_version }} 45 | - name: Open a PR to merge the release to main 46 | id: open_pr 47 | uses: vsoch/pull-request-action@1.0.12 48 | env: 49 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 50 | PULL_REQUEST_BRANCH: main 51 | PULL_REQUEST_FROM_BRANCH: release-${{ steps.bump_version.outputs.package_version }} 52 | PULL_REQUEST_TITLE: "Release ${{ steps.bump_version.outputs.package_version }}" 53 | PULL_REQUEST_BODY: "Bump version and CHANGELOG for next release." 
54 | PULL_REQUEST_ASSIGNEES: " p-pekala" 55 | - name: Commit the changes 56 | run: | 57 | git commit -am "FIX #${{ steps.open_pr.outputs.pull_request_number }} - Bump version and CHANGELOG for release ${{ steps.bump_version.outputs.package_version }}" 58 | git push 59 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.8"] 14 | env: 15 | PYTHON_PACKAGE: dbt_airflow_factory 16 | steps: 17 | - name: Checkout the repo 18 | uses: actions/checkout@v2 19 | with: 20 | fetch-depth: 0 # necessary to enable merging, all the history is needed 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Build package dist from source # A better way will be : https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ but pep 517 is still marked as experimental 26 | run: | 27 | python setup.py sdist 28 | - name: Merge back to develop # we have to set the config first on a fresh machine 29 | run: | 30 | git config user.name github-actions 31 | git config user.email github-actions@github.com 32 | git checkout -b develop --track origin/develop 33 | git merge main 34 | git push 35 | - name: Set dynamically package version as output variable # see https://github.com/actions/create-release/issues/39 36 | # see https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions#setting-an-environment-variable 37 | id: set_package_version 38 | run: | 39 | echo "::set-output name=PACKAGE_VERSION::$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" 40 | - name: Create temporary file with the body content for the release 41 | run: | 42 | grep -Poz "## \[${{steps.set_package_version.outputs.PACKAGE_VERSION}}] - \d{4}-\d{2}-\d{2}[\S\s]+?(?=## \[\d+\.\d+\.\d+\]|\[.+\]:)" CHANGELOG.md > release_body.md 43 | - name: Create Release # https://github.com/actions/create-release 44 | id: create_release 45 | uses: actions/create-release@v1.1.4 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token 48 | with: 49 | tag_name: ${{ steps.set_package_version.outputs.PACKAGE_VERSION }} 50 | release_name: Release ${{ steps.set_package_version.outputs.PACKAGE_VERSION }} 51 | body_path: ./release_body.md 52 | draft: false 53 | prerelease: false 54 | - name: Rollback Release in case of run failure 55 | if: failure() && steps.create_release.outputs.id != '' 56 | uses: author/action-rollback@stable 57 | with: 58 | # Using a known release ID 59 | release_id: ${{ steps.create_release.outputs.id }} 60 | env: 61 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 62 | - name: Publish distribution to PyPI # official action from python maintainers 63 | uses: pypa/gh-action-pypi-publish@master 64 | with: 65 | user: __token__ 66 | password: ${{ secrets.PYPI_PASSWORD }} 67 | verbose: true # trace if the upload fails 68 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | 
branches: 6 | - main 7 | - develop 8 | pull_request: 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Setup python 17 | uses: actions/setup-python@v2.2.1 18 | with: 19 | python-version: "3.8" 20 | 21 | - name: Setup virtualenv 22 | run: | 23 | python -V 24 | python -m pip install virtualenv pipdeptree 25 | virtualenv venv 26 | source venv/bin/activate 27 | pip install --upgrade pip 28 | 29 | - name: Check pre-commit status 30 | run: | 31 | pip install .[tests] 32 | pip freeze 33 | pipdeptree 34 | pre-commit run --all-files 35 | 36 | - name: Test with tox 37 | run: | 38 | tox -e py38 39 | 40 | - name: Report coverage 41 | uses: paambaati/codeclimate-action@v2.7.5 42 | env: 43 | CC_TEST_REPORTER_ID: 2f513a15560b9848db21cca75606745b5283b68b5935e8c481c25eb0090a2a36 44 | with: 45 | coverageCommand: coverage xml 46 | debug: true 47 | coverageLocations: coverage.xml:coverage.py 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | target/ 3 | build/ 4 | dist/ 5 | dbt_modules/ 6 | logs/ 7 | dbt_airflow_manifest_parser.egg-info 8 | dbt_airflow_factory.egg-info 9 | .idea 10 | out 11 | 12 | .user.yml 13 | 14 | .coverage 15 | coverage.xml 16 | .tox 17 | 18 | *.iml 19 | 20 | docs/_build 21 | __pycache__/ 22 | 23 | .venv/ 24 | .vscode/ 25 | .python-version 26 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | known_first_party = dbt_airflow_factory 4 | default_section = THIRDPARTY 5 | 6 | [settings] 7 | known_third_party = airflow,dbt_graph_builder,jinja2,pytest,pytimeparse,setuptools,yaml -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | - repo: https://github.com/asottile/seed-isort-config 4 | rev: v2.2.0 5 | hooks: 6 | - id: seed-isort-config 7 | 8 | - repo: https://github.com/pycqa/isort 9 | rev: 5.12.0 10 | hooks: 11 | - id: isort 12 | args: ["--profile", "black", "--filter-files"] 13 | 14 | - repo: https://github.com/psf/black 15 | rev: 23.7.0 16 | hooks: 17 | - id: black 18 | 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v4.4.0 21 | hooks: 22 | - id: trailing-whitespace 23 | args: [ --markdown-linebreak-ext=md ] 24 | - id: check-merge-conflict 25 | - id: debug-statements 26 | 27 | - repo: https://github.com/pycqa/flake8 28 | rev: 6.1.0 29 | hooks: 30 | - id: flake8 31 | additional_dependencies: [ 32 | 'flake8-blind-except', 33 | 'flake8-comprehensions', 34 | 'flake8-pep3101', 35 | ] 36 | 37 | - repo: https://github.com/pre-commit/mirrors-mypy 38 | rev: v1.5.1 39 | hooks: 40 | - id: mypy 41 | additional_dependencies: 42 | - 'types-PyYAML' 43 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.9" 13 | 14 | # 
Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [Unreleased] 4 | 5 | ## [0.35.0] - 2023-09-08 6 | 7 | ## [0.34.0] - 2023-08-10 8 | 9 | - Add `MS Teams` notifications handler 10 | 11 | ## [0.33.0] - 2023-08-04 12 | 13 | - Add `kwargs` to `BashExecutionParameters` [#90](https://github.com/getindata/dbt-airflow-factory/issues/90) 14 | - Correcting required packages [#97](https://github.com/getindata/dbt-airflow-factory/issues/97) 15 | 16 | ## [0.32.0] - 2023-07-04 17 | 18 | ## [0.31.1] - 2023-05-12 19 | 20 | ### Fixed 21 | 22 | - Replace `config_file` default value from `"~/.kube/config"` to `None` in `KubernetesPodOperator` [#90](https://github.com/getindata/dbt-airflow-factory/issues/90) 23 | 24 | ## [0.31.0] - 2023-03-27 25 | 26 | ### Fixed 27 | 28 | - Use `node_selector` and `container_resources` parameters in `KubernetesPodOperator` if Airflow is 2.3+. 29 | 30 | ## [0.30.0] - 2023-02-08 31 | 32 | - Add in_cluster, cluster_context params 33 | - Repair secrets to be not required 34 | - Update docs 35 | - Add BashOperator 36 | - Exposes param to control the pod startup timeout 37 | 38 | ## [0.29.0] - 2022-09-02 39 | 40 | ## [0.28.0] - 2022-07-19 41 | 42 | ## [0.27.0] - 2022-07-01 43 | 44 | ## [0.26.0] - 2022-05-13 45 | 46 | - Documentation improvements 47 | 48 | ## [0.25.0] - 2022-04-27 49 | 50 | ## [0.24.0] - 2022-04-22 51 | 52 | - Dependencies between project in Airflow 53 | 54 | ## [0.23.0] - 2022-03-22 55 | 56 | ## [0.22.0] - 2022-03-21 57 | 58 | - Failure notifications via slack 59 | 60 | ### Added 61 | 62 | - Ephemeral nodes can be hidden from DAG by setting `show_ephemeral_models: False` in project's `airflow.yml`. 63 | 64 | ## [0.21.0] - 2022-02-11 65 | 66 | This version brings compatibility with `dbt 1.0`. 67 | 68 | ## [0.20.1] - 2022-02-08 69 | 70 | ## [0.20.0] - 2022-02-08 71 | 72 | ### Added 73 | 74 | - Run tests with more than one dependency in a different node. 75 | 76 | ## [0.19.0] - 2022-02-02 77 | 78 | ## [0.18.1] - 2022-01-18 79 | 80 | ### Fixed 81 | 82 | - Jinja's `FileSystemLoader` gets `str` instead of `pathlib.Path` to fix types incompatibility for `Jinja < 2.11.0`. 83 | - Use `get_start_task()` and `get_end_task()` in `AirflowDagFactory.create_tasks(config)` to prevent ephemeral ending tasks from throwing. 84 | 85 | ## [0.18.0] - 2022-01-14 86 | 87 | ### Added 88 | 89 | - ReadTheDocs documentation in `docs` directory. 90 | - `{{ var.value.VARIABLE_NAME }}` gets replaced with Airflow variables when parsing `airflow.yml` file. 91 | 92 | ### Changed 93 | 94 | - Rename project from `dbt-airflow-manifest-parser` to `dbt-airflow-factory`. 95 | 96 | ### Fixed 97 | 98 | - `KubernetesExecutionParameters.env_vars` works in Airflow 1 too. Airflow 1 is expecting a real dictionary of 99 | environment variables instead of a list of `k8s.V1EnvVar` objects. 100 | - Fix `DummyOperator` import in `operator.py` to work in Airflow 1. 101 | 102 | ## [0.17.0] - 2022-01-11 103 | 104 | ### Changed 105 | 106 | - Ephemeral models are not run anymore, presented as an `EphemeralOperator` deriving from the `DummyOperator`. 
107 | 108 | ## [0.16.0] - 2022-01-05 109 | 110 | ### Added 111 | 112 | - Add support for `vars` in `dbt.yml`. 113 | 114 | ## [0.15.0] - 2021-12-13 115 | 116 | ### Changed 117 | 118 | - Drop `_` prefix from Task names when using TaskGroup. 119 | 120 | ## [0.14.0] - 2021-12-06 121 | 122 | ### Changed 123 | 124 | - Add `**kwargs` argument to `DbtExecutionEnvironmentParameters` and `KubernetesExecutionParameters` constructors, 125 | making them ignore additional arguments, if provided. 126 | - Add support for Kubernetes environment variables. 127 | 128 | ## [0.13.0] - 2021-11-17 129 | 130 | - Allow usage of TaskGroup when `use_task_group` flag is set to `True` 131 | 132 | ## [0.12.0] - 2021-11-17 133 | 134 | ## [0.11.0] - 2021-11-10 135 | 136 | ## [0.10.0] - 2021-11-09 137 | 138 | ## [0.9.0] - 2021-11-05 139 | 140 | ## [0.8.0] - 2021-11-03 141 | 142 | ## [0.7.0] - 2021-11-02 143 | 144 | ## [0.6.0] - 2021-11-02 145 | 146 | ## [0.5.0] - 2021-10-29 147 | 148 | - Automatic parsing config files 149 | 150 | ## [0.4.0] - 2021-10-27 151 | 152 | ## [0.3.0] - 2021-10-27 153 | 154 | - Support for Airflow 2.x 155 | 156 | ## [0.2.0] - 2021-10-25 157 | 158 | - Initial implementation of `dbt_airflow_manifest_parser` library. 159 | 160 | [Unreleased]: https://github.com/getindata/dbt-airflow-factory/compare/0.35.0...HEAD 161 | 162 | [0.35.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.34.0...0.35.0 163 | 164 | [0.34.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.33.0...0.34.0 165 | 166 | [0.33.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.32.0...0.33.0 167 | 168 | [0.32.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.31.1...0.32.0 169 | 170 | [0.31.1]: https://github.com/getindata/dbt-airflow-factory/compare/0.31.0...0.31.1 171 | 172 | [0.31.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.30.0...0.31.0 173 | 174 | [0.30.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.29.0...0.30.0 175 | 176 | [0.29.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.28.0...0.29.0 177 | 178 | [0.28.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.27.0...0.28.0 179 | 180 | [0.27.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.26.0...0.27.0 181 | 182 | [0.26.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.25.0...0.26.0 183 | 184 | [0.25.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.24.0...0.25.0 185 | 186 | [0.24.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.23.0...0.24.0 187 | 188 | [0.23.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.22.0...0.23.0 189 | 190 | [0.22.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.21.0...0.22.0 191 | 192 | [0.21.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.20.1...0.21.0 193 | 194 | [0.20.1]: https://github.com/getindata/dbt-airflow-factory/compare/0.20.0...0.20.1 195 | 196 | [0.20.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.19.0...0.20.0 197 | 198 | [0.19.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.18.1...0.19.0 199 | 200 | [0.18.1]: https://github.com/getindata/dbt-airflow-factory/compare/0.18.0...0.18.1 201 | 202 | [0.18.0]: https://github.com/getindata/dbt-airflow-factory/compare/0.17.0...0.18.0 203 | 204 | [0.17.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.16.0...0.17.0 205 | 206 | [0.16.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.15.0...0.16.0 207 | 208 | [0.15.0]: 
https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.14.0...0.15.0 209 | 210 | [0.14.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.13.0...0.14.0 211 | 212 | [0.13.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.12.0...0.13.0 213 | 214 | [0.12.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.11.0...0.12.0 215 | 216 | [0.11.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.10.0...0.11.0 217 | 218 | [0.10.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.9.0...0.10.0 219 | 220 | [0.9.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.8.0...0.9.0 221 | 222 | [0.8.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.7.0...0.8.0 223 | 224 | [0.7.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.6.0...0.7.0 225 | 226 | [0.6.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.5.0...0.6.0 227 | 228 | [0.5.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.4.0...0.5.0 229 | 230 | [0.4.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.3.0...0.4.0 231 | 232 | [0.3.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/0.2.0...0.3.0 233 | 234 | [0.2.0]: https://github.com/getindata/dbt-airflow-manifest-parser/compare/6395f7ea175caa3bd1aca361e9d2f7fb7f7a7820...0.2.0 235 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## PR Guidelines 2 | 1. Fork branch from `develop`. 3 | 1. Make sure to provide unit tests for new functionality. 4 | 1. Install dev requirements: `pip install -r requirements.txt` and set up a hook: `pre-commit install` 5 | 1. Update documentation accordingly. 6 | 1. Update [changelog](CHANGELOG.md) according to ["Keep a changelog"](https://keepachangelog.com/en/1.0.0/) guidelines. 7 | 1. Squash changes into a single commit as much as possible and use a descriptive PR name. 8 | 1. Open a PR against `develop`. 9 | 10 | *We reserve the right to take over and modify or abandon PRs that do not match the workflow or are abandoned.* 11 | 12 | ## Release workflow 13 | 14 | 1. Create the release candidate: 15 | - Go to the [Prepare release](https://github.com/getindata/dbt-airflow-factory/actions?query=workflow%3A%22Prepare+release%22) action. 16 | - Click "Run workflow" 17 | - Enter the part of the version to bump (one of `<major>.<minor>.<patch>`). Minor (x.**x**.x) is the default. 18 | 2. If the workflow has run successfully: 19 | - Go to the newly opened PR named `Release candidate <version>` 20 | - Check that the changelog and version have been properly updated. If not, pull the branch and apply manual changes if necessary. 21 | - Merge the PR to main 22 | 3. Check out the [Publish](https://github.com/getindata/dbt-airflow-factory/actions?query=workflow%3APublish) workflow to see if: 23 | - The package has been uploaded to PyPI successfully 24 | - The changes have been merged back to develop -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions.
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DBT Airflow Factory 2 | 3 | [![Python Version](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue)](https://github.com/getindata/dbt-airflow-factory) 4 | [![PyPI Version](https://badge.fury.io/py/dbt-airflow-factory.svg)](https://pypi.org/project/dbt-airflow-factory/) 5 | [![Downloads](https://pepy.tech/badge/dbt-airflow-factory)](https://pepy.tech/project/dbt-airflow-factory) 6 | [![Maintainability](https://api.codeclimate.com/v1/badges/47fd3570c858b6c166ad/maintainability)](https://codeclimate.com/github/getindata/dbt-airflow-factory/maintainability) 7 | [![Test Coverage](https://api.codeclimate.com/v1/badges/47fd3570c858b6c166ad/test_coverage)](https://codeclimate.com/github/getindata/dbt-airflow-factory/test_coverage) 8 | [![Documentation Status](https://readthedocs.org/projects/dbt-airflow-factory/badge/?version=latest)](https://dbt-airflow-factory.readthedocs.io/en/latest/?badge=latest) 9 | 10 | Library to convert DBT manifest metadata to Airflow tasks 11 | 12 | ## Documentation 13 | 14 | Read the full documentation at [https://dbt-airflow-factory.readthedocs.io/](https://dbt-airflow-factory.readthedocs.io/en/latest/index.html) 15 | 16 | ## Installation 17 | 18 | Use the package manager [pip][pip] to install the library: 19 | 20 | ```bash 21 | pip install dbt-airflow-factory 22 | ``` 23 | 24 | ## Usage 25 | 26 | The library is expected to be used inside an Airflow environment with a Kubernetes image referencing **dbt**. 27 | 28 | **dbt-airflow-factory**'s main task is to parse `manifest.json` and create Airflow DAG out of it. It also reads config 29 | files from `config` directory and therefore is highly customizable (e.g., user can set path to `manifest.json`). 30 | 31 | To start, create a directory with a following structure, where `manifest.json` is a file generated by **dbt**: 32 | ``` 33 | . 
34 | ├── config 35 | │ ├── base 36 | │ │ ├── airflow.yml 37 | │ │ ├── dbt.yml 38 | │ │ └── k8s.yml 39 | │ └── dev 40 | │ └── dbt.yml 41 | ├── dag.py 42 | └── manifest.json 43 | ``` 44 | 45 | Then, put the following code into `dag.py`: 46 | ```python 47 | from dbt_airflow_factory.airflow_dag_factory import AirflowDagFactory 48 | from os import path 49 | 50 | dag = AirflowDagFactory(path.dirname(path.abspath(__file__)), "dev").create() 51 | ``` 52 | 53 | When uploaded to the Airflow DAGs directory, it will get picked up by Airflow, which will parse `manifest.json` and prepare a DAG to run. 54 | 55 | ### Configuration files 56 | 57 | It is best to look up the example configuration files in the [tests directory][tests] to get a glimpse of correct configs. 58 | 59 | You can use [Airflow template variables][airflow-vars] in your `dbt.yml` and `k8s.yml` files, as long as they are inside 60 | quotation marks: 61 | ```yaml 62 | target: "{{ var.value.env }}" 63 | some_other_field: "{{ ds_nodash }}" 64 | ``` 65 | 66 | Analogously, you can use `"{{ var.value.VARIABLE_NAME }}"` in `airflow.yml`, but only the Airflow variable getter is supported there. 67 | Any other Airflow template variables will not work in `airflow.yml`. 68 | 69 | ### Creation of the directory with data-pipelines-cli 70 | 71 | **DBT Airflow Factory** works best in tandem with the [data-pipelines-cli][dp-cli] tool. **dp** not only prepares the directory 72 | for the library to digest, but also automates Docker image building and pushes the generated directory to the cloud storage 73 | of your choice. 74 | 75 | [airflow-vars]: https://airflow.apache.org/docs/apache-airflow/stable/templates-ref.html#variables 76 | [dp-cli]: https://pypi.org/project/data-pipelines-cli/ 77 | [pip]: https://pip.pypa.io/en/stable/ 78 | [tests]: https://github.com/getindata/dbt-airflow-factory/tree/develop/tests/config 79 | -------------------------------------------------------------------------------- /dbt_airflow_factory/__init__.py: -------------------------------------------------------------------------------- 1 | version = "0.35.0" 2 | -------------------------------------------------------------------------------- /dbt_airflow_factory/airflow_dag_factory.py: -------------------------------------------------------------------------------- 1 | """Factory creating Airflow DAG.""" 2 | 3 | import os 4 | 5 | from airflow import DAG 6 | from airflow.models import BaseOperator 7 | 8 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 9 | from dbt_airflow_factory.ingestion import IngestionEngine, IngestionFactory 10 | 11 | if IS_FIRST_AIRFLOW_VERSION: 12 | from airflow.operators.dummy_operator import DummyOperator 13 | else: 14 | from airflow.operators.dummy import DummyOperator 15 | 16 | from pytimeparse import parse 17 | 18 | from dbt_airflow_factory.builder_factory import DbtAirflowTasksBuilderFactory 19 | from dbt_airflow_factory.config_utils import read_config, read_env_config 20 | from dbt_airflow_factory.notifications.handler import NotificationHandlersFactory 21 | from dbt_airflow_factory.tasks_builder.builder import DbtAirflowTasksBuilder 22 | 23 | 24 | class AirflowDagFactory: 25 | """ 26 | Factory creating Airflow DAG. 27 | 28 | :param dag_path: path to ``manifest.json`` file. 29 | :type dag_path: str 30 | :param env: name of the environment. 31 | :type env: str 32 | :param dbt_config_file_name: name of the DBT config file. 33 | If not specified, default value is ``dbt.yml``.
34 | :type dbt_config_file_name: str 35 | :param execution_env_config_file_name: name of the execution env config file. 36 | If not specified, default value is ``execution_env.yml``. 37 | :type execution_env_config_file_name: str 38 | :param airflow_config_file_name: name of the Airflow config file. 39 | If not specified, default value is ``airflow.yml``. 40 | :type airflow_config_file_name: str 41 | """ 42 | 43 | _builder: DbtAirflowTasksBuilder 44 | dag_path: str 45 | """path to ``manifest.json`` file.""" 46 | env: str 47 | """name of the environment.""" 48 | airflow_config_file_name: str 49 | """name of the Airflow config file (default: ``airflow.yml``).""" 50 | 51 | def __init__( 52 | self, 53 | dag_path: str, 54 | env: str, 55 | dbt_config_file_name: str = "dbt.yml", 56 | execution_env_config_file_name: str = "execution_env.yml", 57 | airflow_config_file_name: str = "airflow.yml", 58 | airbyte_config_file_name: str = "airbyte.yml", 59 | ingestion_config_file_name: str = "ingestion.yml", 60 | ): 61 | self._notifications_handlers_builder = NotificationHandlersFactory() 62 | self.airflow_config = self._read_config(dag_path, env, airflow_config_file_name) 63 | self._builder = DbtAirflowTasksBuilderFactory( 64 | dag_path, 65 | env, 66 | self.airflow_config, 67 | dbt_config_file_name, 68 | execution_env_config_file_name, 69 | ).create() 70 | self.dag_path = dag_path 71 | airbyte_config = read_env_config( 72 | dag_path=dag_path, env=env, file_name=airbyte_config_file_name 73 | ) 74 | self.ingestion_config = read_env_config( 75 | dag_path=dag_path, env=env, file_name=ingestion_config_file_name 76 | ) 77 | self.ingestion_tasks_builder_factory = IngestionFactory( 78 | ingestion_config=airbyte_config, 79 | name=IngestionEngine.value_of( 80 | self.ingestion_config.get("engine", IngestionEngine.AIRBYTE.value) 81 | ), 82 | ) 83 | 84 | def create(self) -> DAG: 85 | """ 86 | Parse ``manifest.json`` and create tasks based on the data contained there. 87 | 88 | :return: Generated DAG. 89 | :rtype: airflow.models.dag.DAG 90 | """ 91 | with DAG( 92 | default_args=self.airflow_config["default_args"], **self.airflow_config["dag"] 93 | ) as dag: 94 | self.create_tasks() 95 | return dag 96 | 97 | def create_tasks(self) -> None: 98 | """ 99 | Parse ``manifest.json`` and create tasks based on the data contained there. 
100 | """ 101 | 102 | ingestion_enabled = self.ingestion_config.get("enable", False) 103 | 104 | start = self._create_starting_task() 105 | if ingestion_enabled and self.ingestion_tasks_builder_factory: 106 | builder = self.ingestion_tasks_builder_factory.create() 107 | ingestion_tasks = builder.build() 108 | ingestion_tasks >> start 109 | end = DummyOperator(task_id="end") 110 | tasks = self._builder.parse_manifest_into_tasks(self._manifest_file_path()) 111 | for starting_task in tasks.get_starting_tasks(): 112 | start >> starting_task.get_start_task() 113 | for ending_task in tasks.get_ending_tasks(): 114 | ending_task.get_end_task() >> end 115 | 116 | def _create_starting_task(self) -> BaseOperator: 117 | if self.airflow_config.get("seed_task", True): 118 | return self._builder.create_seed_task() 119 | else: 120 | return DummyOperator(task_id="start") 121 | 122 | def _manifest_file_path(self) -> str: 123 | file_dir = self.airflow_config.get("manifest_dir_path", self.dag_path) 124 | return os.path.join( 125 | file_dir, self.airflow_config.get("manifest_file_name", "manifest.json") 126 | ) 127 | 128 | def _read_config(self, dag_path: str, env: str, airflow_config_file_name: str) -> dict: 129 | """ 130 | Read ``airflow.yml`` from ``config`` directory into a dictionary. 131 | 132 | :return: Dictionary representing ``airflow.yml``. 133 | :rtype: dict 134 | :raises KeyError: No ``default_args`` key in ``airflow.yml``. 135 | """ 136 | config = read_config(dag_path, env, airflow_config_file_name, replace_jinja=True) 137 | if "retry_delay" in config["default_args"]: 138 | config["default_args"]["retry_delay"] = parse(config["default_args"]["retry_delay"]) 139 | if "failure_handlers" in config: 140 | config["default_args"][ 141 | "on_failure_callback" 142 | ] = self._notifications_handlers_builder.create_failure_handler( 143 | config["failure_handlers"] 144 | ) 145 | return config 146 | -------------------------------------------------------------------------------- /dbt_airflow_factory/bash/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/dbt_airflow_factory/bash/__init__.py -------------------------------------------------------------------------------- /dbt_airflow_factory/bash/bash_operator.py: -------------------------------------------------------------------------------- 1 | """Factories creating Airflow Operators running DBT tasks.""" 2 | 3 | from typing import List, Optional 4 | 5 | from airflow.models.baseoperator import BaseOperator 6 | from airflow.operators.bash_operator import BashOperator 7 | 8 | from dbt_airflow_factory.bash.bash_parameters import BashExecutionParameters 9 | from dbt_airflow_factory.dbt_parameters import DbtExecutionEnvironmentParameters 10 | from dbt_airflow_factory.operator import DbtRunOperatorBuilder 11 | 12 | 13 | class BashOperatorBuilder(DbtRunOperatorBuilder): 14 | """ 15 | Builder of Bash Operator running a single DBT task. 16 | 17 | :param dbt_execution_env_parameters: POD representing DBT operator config file. 18 | :type dbt_execution_env_parameters: DbtExecutionEnvironmentParameters 19 | :param bash_execution_parameters: 20 | POD representing bash execution parameters. 
21 | :type bash_execution_parameters: BashExecutionParameters 22 | """ 23 | 24 | dbt_execution_env_parameters: DbtExecutionEnvironmentParameters 25 | """POD representing DBT operator config file.""" 26 | bash_execution_parameters: BashExecutionParameters 27 | """POD representing bash execution parameters.""" 28 | 29 | def __init__( 30 | self, 31 | dbt_execution_env_parameters: DbtExecutionEnvironmentParameters, 32 | bash_execution_parameters: BashExecutionParameters, 33 | ): 34 | self.dbt_execution_env_parameters = dbt_execution_env_parameters 35 | self.bash_execution_parameters = bash_execution_parameters 36 | 37 | def create( 38 | self, 39 | name: str, 40 | command: str, 41 | model: Optional[str] = None, 42 | additional_dbt_args: Optional[List[str]] = None, 43 | ) -> BaseOperator: 44 | return self._create(self._prepare_arguments(command, model, additional_dbt_args), name) 45 | 46 | def _prepare_arguments( 47 | self, 48 | command: str, 49 | model: Optional[str], 50 | additional_dbt_args: Optional[List[str]], 51 | ) -> List[str]: 52 | args = [ 53 | f"{self.bash_execution_parameters.execution_script}", 54 | f"{command}", 55 | f"--target {self.dbt_execution_env_parameters.target}", 56 | f'--vars "{self.dbt_execution_env_parameters.vars}"', 57 | f"--project-dir {self.dbt_execution_env_parameters.project_dir_path}", 58 | f"--profiles-dir {self.dbt_execution_env_parameters.profile_dir_path}", 59 | ] 60 | if model: 61 | args += [f"--select {model}"] 62 | if additional_dbt_args: 63 | args += additional_dbt_args 64 | return [" ".join(args)] 65 | 66 | def _create(self, args: List[str], name: str) -> BashOperator: 67 | return BashOperator(bash_command=" ".join(args), task_id=name) 68 | -------------------------------------------------------------------------------- /dbt_airflow_factory/bash/bash_parameters.py: -------------------------------------------------------------------------------- 1 | """POD representing Bash operator config file.""" 2 | 3 | from typing import Any 4 | 5 | 6 | class BashExecutionParameters: 7 | """POD representing Bash operator config file. 8 | :param execution_script: Base command or script used to invoke **dbt** (default: ``dbt --no-write-json``).
9 | :type execution_script: str 10 | """ 11 | 12 | def __init__(self, execution_script: str = "dbt --no-write-json", **kwargs: Any) -> None: 13 | self.execution_script = execution_script 14 | -------------------------------------------------------------------------------- /dbt_airflow_factory/bash/bash_parameters_loader.py: -------------------------------------------------------------------------------- 1 | from dbt_airflow_factory.bash.bash_parameters import BashExecutionParameters 2 | from dbt_airflow_factory.config_utils import read_config 3 | 4 | 5 | class BashExecutionParametersLoader: 6 | @staticmethod 7 | def create_config( 8 | dag_path: str, env: str, execution_env_config_file_name: str 9 | ) -> BashExecutionParameters: 10 | config = read_config(dag_path, env, "bash.yml") 11 | config.update(read_config(dag_path, env, execution_env_config_file_name)) 12 | return BashExecutionParameters(**config) 13 | -------------------------------------------------------------------------------- /dbt_airflow_factory/builder_factory.py: -------------------------------------------------------------------------------- 1 | """Factory creating Airflow tasks.""" 2 | 3 | from dbt_graph_builder.builder import create_gateway_config 4 | 5 | from dbt_airflow_factory.bash.bash_operator import BashOperatorBuilder 6 | from dbt_airflow_factory.bash.bash_parameters_loader import ( 7 | BashExecutionParametersLoader, 8 | ) 9 | from dbt_airflow_factory.config_utils import read_config 10 | from dbt_airflow_factory.dbt_parameters import DbtExecutionEnvironmentParameters 11 | from dbt_airflow_factory.ecs.ecs_operator import EcsPodOperatorBuilder 12 | from dbt_airflow_factory.ecs.ecs_parameters_loader import EcsExecutionParametersLoader 13 | from dbt_airflow_factory.k8s.k8s_operator import KubernetesPodOperatorBuilder 14 | from dbt_airflow_factory.k8s.k8s_parameters_loader import ( 15 | KubernetesExecutionParametersLoader, 16 | ) 17 | from dbt_airflow_factory.operator import DbtRunOperatorBuilder 18 | from dbt_airflow_factory.tasks_builder.builder import DbtAirflowTasksBuilder 19 | from dbt_airflow_factory.tasks_builder.parameters import TasksBuildingParameters 20 | 21 | 22 | class DbtAirflowTasksBuilderFactory: 23 | """ 24 | Factory creating Airflow tasks. 25 | 26 | :param dag_path: path to ``manifest.json`` file. 27 | :type dag_path: str 28 | :param env: name of the environment. 29 | :type env: str 30 | :param dbt_config_file_name: name of the DBT config file. 31 | If not specified, default value is ``dbt.yml``. 32 | :type dbt_config_file_name: str 33 | :param execution_env_config_file_name: name of the execution environment config file. 34 | If not specified, default value is ``execution_env.yml``. 
35 | :type execution_env_config_file_name: str 36 | """ 37 | 38 | base_config_name: str 39 | """Name of the ``base`` environment (default: ``base``).""" 40 | dag_path: str 41 | """path to ``manifest.json`` file.""" 42 | env: str 43 | """name of the environment.""" 44 | dbt_config_file_name: str 45 | """name of the DBT config file (default: ``dbt.yml``).""" 46 | execution_env_config_file_name: str 47 | """name of the execution env config file (default: ``execution_env.yml``).""" 48 | 49 | def __init__( 50 | self, 51 | dag_path: str, 52 | env: str, 53 | airflow_config: dict, 54 | dbt_config_file_name: str = "dbt.yml", 55 | execution_env_config_file_name: str = "execution_env.yml", 56 | ): 57 | self.base_config_name = "base" 58 | self.dag_path = dag_path 59 | self.env = env 60 | self.airflow_config = airflow_config 61 | self.dbt_config_file_name = dbt_config_file_name 62 | self.execution_env_config_file_name = execution_env_config_file_name 63 | 64 | def create(self) -> DbtAirflowTasksBuilder: 65 | """ 66 | Create :class:`.DbtAirflowTasksBuilder` to use. 67 | 68 | :return: Instance of :class:`.DbtAirflowTasksBuilder`. 69 | :rtype: DbtAirflowTasksBuilder 70 | """ 71 | dbt_params = self._create_dbt_config() 72 | execution_env_type = self._read_execution_env_type() 73 | tasks_airflow_config = self._create_tasks_airflow_config() 74 | 75 | return DbtAirflowTasksBuilder( 76 | tasks_airflow_config, 77 | self._create_operator_builder(execution_env_type, dbt_params), 78 | gateway_config=create_gateway_config(self.airflow_config), 79 | ) 80 | 81 | def _create_tasks_airflow_config(self) -> TasksBuildingParameters: 82 | return TasksBuildingParameters( 83 | self.airflow_config.get("use_task_group", False), 84 | self.airflow_config.get("show_ephemeral_models", True), 85 | self.airflow_config.get("enable_project_dependencies", False), 86 | self.airflow_config.get("check_all_deps_for_multiple_deps_tests", True), 87 | ) 88 | 89 | def _create_operator_builder( 90 | self, type: str, dbt_params: DbtExecutionEnvironmentParameters 91 | ) -> DbtRunOperatorBuilder: 92 | if type == "k8s": 93 | return KubernetesPodOperatorBuilder( 94 | dbt_params, 95 | KubernetesExecutionParametersLoader.create_config( 96 | self.dag_path, self.env, self.execution_env_config_file_name 97 | ), 98 | ) 99 | elif type == "ecs": 100 | return EcsPodOperatorBuilder( 101 | dbt_params, 102 | EcsExecutionParametersLoader.create_config( 103 | self.dag_path, self.env, self.execution_env_config_file_name 104 | ), 105 | ) 106 | elif type == "bash": 107 | return BashOperatorBuilder( 108 | dbt_params, 109 | BashExecutionParametersLoader.create_config( 110 | self.dag_path, self.env, self.execution_env_config_file_name 111 | ), 112 | ) 113 | else: 114 | raise TypeError(f"Unsupported env type {type}") 115 | 116 | def _create_dbt_config(self) -> DbtExecutionEnvironmentParameters: 117 | return DbtExecutionEnvironmentParameters( 118 | **read_config(self.dag_path, self.env, self.dbt_config_file_name) 119 | ) 120 | 121 | def _read_execution_env_type(self) -> str: 122 | return read_config(self.dag_path, self.env, self.execution_env_config_file_name)["type"] 123 | -------------------------------------------------------------------------------- /dbt_airflow_factory/config_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for configuration files reading.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import os 7 | import pathlib 8 | from typing import Any, Union 9 | 10 | import 
yaml 11 | from airflow.models import Variable 12 | from jinja2 import FileSystemLoader 13 | from jinja2.nativetypes import NativeEnvironment 14 | 15 | 16 | def read_config( 17 | dag_path: Union[str, os.PathLike[str]], 18 | env: str, 19 | file_name: str, 20 | replace_jinja: bool = False, 21 | ) -> dict: 22 | """ 23 | Reads dictionaries out of *file_name* in both `base` and *env* directories, 24 | and compiles them into one. Values from *env* directory get precedence over 25 | `base` ones 26 | 27 | :param dag_path: Path to the directory containing ``config`` directory. 28 | :type dag_path: Union[str, os.PathLike[str]] 29 | :param env: Name of the environment. 30 | :type env: str 31 | :param file_name: Name of the config file. 32 | :type file_name: str 33 | :param replace_jinja: Whether replace Airflow vars using Jinja templating. 34 | :type replace_jinja: bool 35 | :return: Dictionary representing the config file. 36 | :rtype: dict 37 | """ 38 | config = read_env_config(dag_path, "base", file_name, replace_jinja) 39 | config.update(read_env_config(dag_path, env, file_name, replace_jinja)) 40 | return config 41 | 42 | 43 | def read_env_config( 44 | dag_path: Union[str, os.PathLike[str]], 45 | env: str, 46 | file_name: str, 47 | replace_jinja: bool = False, 48 | ) -> dict: 49 | """ 50 | Read config file, depending on the ``env``. 51 | 52 | :param dag_path: Path to the directory containing ``config`` directory. 53 | :type dag_path: Union[str, os.PathLike[str]] 54 | :param env: Name of the environment. 55 | :type env: str 56 | :param file_name: Name of the config file. 57 | :type file_name: str 58 | :param replace_jinja: Whether replace Airflow vars using Jinja templating. 59 | :type replace_jinja: bool 60 | :return: Dictionary representing the config file. 61 | :rtype: dict 62 | """ 63 | config_file_path = os.path.join(dag_path, "config", env, file_name) 64 | if os.path.exists(config_file_path): 65 | logging.info("Reading config from " + config_file_path) 66 | return read_yaml_file(config_file_path, replace_jinja) 67 | logging.warning("Missing config file: " + config_file_path) 68 | return {} 69 | 70 | 71 | def read_yaml_file(file_path: Union[str, os.PathLike[str]], replace_jinja: bool) -> dict: 72 | """ 73 | Load `yaml` file to dictionary. 74 | 75 | :param file_path: Path to the file. 76 | :type file_path: Union[str, os.PathLike[str]] 77 | :param replace_jinja: Whether replace Airflow vars using Jinja templating. 78 | :type replace_jinja: bool 79 | :return: Loaded dictionary. 
80 | :rtype: dict 81 | """ 82 | if replace_jinja: 83 | return yaml.safe_load(_jinja_replace_airflow_vars(file_path)) 84 | 85 | with open(file_path, "r") as f: 86 | return yaml.safe_load(f) 87 | 88 | 89 | def _jinja_replace_airflow_vars(file_path: Union[str, os.PathLike[str]]) -> str: 90 | # Copied from airflow.models.taskinstance 91 | class VariableAccessor: 92 | def __init__(self) -> None: 93 | self.var = None 94 | 95 | def __getattr__(self, item: str) -> Any: 96 | self.var = Variable.get(item) 97 | return self.var 98 | 99 | def __repr__(self) -> str: 100 | return str(self.var) 101 | 102 | @staticmethod 103 | def get(item: str) -> Any: 104 | return Variable.get(item) 105 | 106 | file_path = pathlib.Path(file_path) 107 | jinja_loader = FileSystemLoader(str(file_path.parent)) 108 | jinja_env = NativeEnvironment(loader=jinja_loader) 109 | 110 | return jinja_env.get_template(file_path.name).render(var={"value": VariableAccessor()}) 111 | -------------------------------------------------------------------------------- /dbt_airflow_factory/constants.py: -------------------------------------------------------------------------------- 1 | import airflow 2 | 3 | IS_FIRST_AIRFLOW_VERSION = airflow.__version__.startswith("1.") 4 | IS_AIRFLOW_NEWER_THAN_2_4 = not IS_FIRST_AIRFLOW_VERSION and ( 5 | not airflow.__version__.startswith("2.") or int(airflow.__version__.split(".")[1]) > 4 6 | ) 7 | -------------------------------------------------------------------------------- /dbt_airflow_factory/dbt_parameters.py: -------------------------------------------------------------------------------- 1 | """POD representing DBT operator config file.""" 2 | 3 | import sys 4 | from typing import Any, Dict, Optional 5 | 6 | import yaml 7 | 8 | 9 | class DbtExecutionEnvironmentParameters: 10 | """POD representing DBT operator config file. 11 | 12 | :param target: Name of the target environment (passed to **dbt** as ``--target``). 13 | :type target: str 14 | :param project_dir_path: Path to project directory. 15 | :type project_dir_path: str 16 | :param profile_dir_path: Path to the directory containing ``profiles.yml``. 17 | :type project_dir_path: str 18 | :param vars: Dictionary of variables to pass to the **dbt**. 19 | :type vars: Optional[Dict[str, str]] 20 | """ 21 | 22 | def __init__( 23 | self, 24 | target: str, 25 | project_dir_path: str = "/dbt", 26 | profile_dir_path: str = "/root/.dbt", 27 | vars: Optional[Dict[str, str]] = None, 28 | **kwargs: Any, 29 | ) -> None: 30 | self.target = target 31 | self.project_dir_path = project_dir_path 32 | self.profile_dir_path = profile_dir_path 33 | self._vars = vars or {} 34 | 35 | @property 36 | def vars(self) -> str: 37 | """ 38 | String representation of dictionary of **dbt** variables. 39 | 40 | DBT expects ``--vars`` passing string in YAML format. This property 41 | returns such a string. 42 | 43 | :return: String representation of dictionary of **dbt** variables. 
44 | :rtype: str 45 | """ 46 | return yaml.dump(self._vars, default_flow_style=True, width=sys.maxsize).rstrip() 47 | -------------------------------------------------------------------------------- /dbt_airflow_factory/ecs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/dbt_airflow_factory/ecs/__init__.py -------------------------------------------------------------------------------- /dbt_airflow_factory/ecs/ecs_operator.py: -------------------------------------------------------------------------------- 1 | """Factories creating Airflow Operators running DBT tasks.""" 2 | 3 | from typing import List, Optional 4 | 5 | from airflow.models.baseoperator import BaseOperator 6 | 7 | from dbt_airflow_factory.dbt_parameters import DbtExecutionEnvironmentParameters 8 | from dbt_airflow_factory.ecs.ecs_parameters import EcsExecutionParameters 9 | from dbt_airflow_factory.operator import DbtRunOperatorBuilder 10 | 11 | 12 | class EcsPodOperatorBuilder(DbtRunOperatorBuilder): 13 | dbt_execution_env_parameters: DbtExecutionEnvironmentParameters 14 | """POD representing DBT operator config file.""" 15 | ecs_execution_parameters: EcsExecutionParameters 16 | """POD representing ecs operator config file.""" 17 | 18 | def __init__( 19 | self, 20 | dbt_execution_env_parameters: DbtExecutionEnvironmentParameters, 21 | ecs_execution_parameters: EcsExecutionParameters, 22 | ): 23 | self.dbt_execution_env_parameters = dbt_execution_env_parameters 24 | self.ecs_execution_parameters = ecs_execution_parameters 25 | 26 | def create( 27 | self, 28 | name: str, 29 | command: str, 30 | model: Optional[str] = None, 31 | additional_dbt_args: Optional[List[str]] = None, 32 | ) -> BaseOperator: 33 | raise NotImplementedError # TODO 34 | -------------------------------------------------------------------------------- /dbt_airflow_factory/ecs/ecs_parameters.py: -------------------------------------------------------------------------------- 1 | """POD representing Kubernetes operator config file.""" 2 | 3 | from typing import Any 4 | 5 | 6 | class EcsExecutionParameters: 7 | """ 8 | :param image: tag of Docker image you wish to launch. 
9 | :type image: str 10 | """ 11 | 12 | def __init__( 13 | self, 14 | image: str, 15 | **_kwargs: Any, 16 | ) -> None: 17 | self.image = image 18 | -------------------------------------------------------------------------------- /dbt_airflow_factory/ecs/ecs_parameters_loader.py: -------------------------------------------------------------------------------- 1 | """POD representing Kubernetes operator config file.""" 2 | 3 | from dbt_airflow_factory.ecs.ecs_parameters import EcsExecutionParameters 4 | 5 | 6 | class EcsExecutionParametersLoader: 7 | @staticmethod 8 | def create_config( 9 | dag_path: str, env: str, execution_env_config_file_name: str 10 | ) -> EcsExecutionParameters: 11 | raise NotImplementedError 12 | -------------------------------------------------------------------------------- /dbt_airflow_factory/ingestion.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List 3 | 4 | from airflow.models import BaseOperator 5 | from airflow.providers.airbyte.operators.airbyte import AirbyteTriggerSyncOperator 6 | 7 | 8 | class IngestionEngine(Enum): 9 | AIRBYTE = "airbyte" 10 | 11 | @classmethod 12 | def value_of(cls, value: str) -> "IngestionEngine": 13 | return IngestionEngine(value) 14 | 15 | 16 | class IngestionTasksBuilder: 17 | def build(self) -> List[BaseOperator]: 18 | raise NotImplementedError("Should implement build method") 19 | 20 | 21 | class AirbyteIngestionTasksBuilder(IngestionTasksBuilder): 22 | def __init__(self, config: dict): 23 | self.ingestion_config = config 24 | 25 | def build(self) -> List[BaseOperator]: 26 | airflow_tasks = [] 27 | tasks = self.ingestion_config["tasks"] 28 | for task in tasks: 29 | airflow_tasks.append( 30 | AirbyteTriggerSyncOperator( 31 | task_id=task["task_id"], 32 | airbyte_conn_id=self.ingestion_config["airbyte_connection_id"], 33 | connection_id=task["connection_id"], 34 | asynchronous=task["asyncrounous"], 35 | api_version=task["api_version"], 36 | wait_seconds=task["wait_seconds"], 37 | timeout=task["timeout"], 38 | ) 39 | ) 40 | 41 | return airflow_tasks 42 | 43 | 44 | class IngestionFactory: 45 | def __init__(self, ingestion_config: dict, name: IngestionEngine): 46 | self.ingestion_config = ingestion_config 47 | self.name = name 48 | 49 | def create( 50 | self, 51 | ) -> IngestionTasksBuilder: 52 | if self.name == IngestionEngine.AIRBYTE: 53 | return AirbyteIngestionTasksBuilder(self.ingestion_config) 54 | raise NotImplementedError(f"{self.name} is not supported !") 55 | -------------------------------------------------------------------------------- /dbt_airflow_factory/k8s/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/dbt_airflow_factory/k8s/__init__.py -------------------------------------------------------------------------------- /dbt_airflow_factory/k8s/k8s_operator.py: -------------------------------------------------------------------------------- 1 | """Factories creating Airflow Operators running DBT tasks.""" 2 | 3 | import inspect 4 | from typing import List, Optional 5 | 6 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 7 | from dbt_airflow_factory.dbt_parameters import DbtExecutionEnvironmentParameters 8 | from dbt_airflow_factory.k8s.k8s_parameters import KubernetesExecutionParameters 9 | from dbt_airflow_factory.operator import DbtRunOperatorBuilder 10 | 11 | if 
IS_FIRST_AIRFLOW_VERSION: 12 | from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator 13 | else: 14 | from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import ( 15 | KubernetesPodOperator, 16 | ) 17 | 18 | from airflow.models.baseoperator import BaseOperator 19 | 20 | 21 | class KubernetesPodOperatorBuilder(DbtRunOperatorBuilder): 22 | """ 23 | Builder of Kubernetes Pod Operator running a single DBT task. 24 | 25 | :param dbt_execution_env_parameters: POD representing DBT operator config file. 26 | :type dbt_execution_env_parameters: DbtExecutionEnvironmentParameters 27 | :param kubernetes_execution_parameters: 28 | POD representing Kubernetes operator config file. 29 | :type kubernetes_execution_parameters: KubernetesExecutionParameters 30 | """ 31 | 32 | dbt_execution_env_parameters: DbtExecutionEnvironmentParameters 33 | """POD representing DBT operator config file.""" 34 | kubernetes_execution_parameters: KubernetesExecutionParameters 35 | """POD representing Kubernetes operator config file.""" 36 | 37 | def __init__( 38 | self, 39 | dbt_execution_env_parameters: DbtExecutionEnvironmentParameters, 40 | kubernetes_execution_parameters: KubernetesExecutionParameters, 41 | ): 42 | self.dbt_execution_env_parameters = dbt_execution_env_parameters 43 | self.kubernetes_execution_parameters = kubernetes_execution_parameters 44 | 45 | def create( 46 | self, 47 | name: str, 48 | command: str, 49 | model: Optional[str] = None, 50 | additional_dbt_args: Optional[List[str]] = None, 51 | ) -> BaseOperator: 52 | return self._create(self._prepare_arguments(command, model, additional_dbt_args), name) 53 | 54 | def _prepare_arguments( 55 | self, 56 | command: str, 57 | model: Optional[str], 58 | additional_dbt_args: Optional[List[str]], 59 | ) -> List[str]: 60 | args = [ 61 | f"{self.kubernetes_execution_parameters.execution_script}", 62 | f"{command}", 63 | f"--target {self.dbt_execution_env_parameters.target}", 64 | f'--vars "{self.dbt_execution_env_parameters.vars}"', 65 | f"--project-dir {self.dbt_execution_env_parameters.project_dir_path}", 66 | f"--profiles-dir {self.dbt_execution_env_parameters.profile_dir_path}", 67 | ] 68 | if model: 69 | args += [f"--select {model}"] 70 | if additional_dbt_args: 71 | args += additional_dbt_args 72 | return [" ".join(args)] 73 | 74 | def _create(self, args: Optional[List[str]], name: str) -> KubernetesPodOperator: 75 | airflow_compatibility_dict = { 76 | "node_selectors" 77 | if IS_FIRST_AIRFLOW_VERSION 78 | else "node_selector": self.kubernetes_execution_parameters.node_selectors, 79 | # Since Airflow 2.3, https://github.com/apache/airflow/blob/12c3c39d1a816c99c626fe4c650e88cf7b1cc1bc/airflow/providers/cncf/kubernetes/CHANGELOG.rst#500 # noqa: E501 80 | "container_resources" 81 | if inspect.signature(KubernetesPodOperator).parameters.get("container_resources") 82 | is not None 83 | else "resources": self.kubernetes_execution_parameters.resources, 84 | } 85 | 86 | return KubernetesPodOperator( 87 | namespace=self.kubernetes_execution_parameters.namespace, 88 | image=self.kubernetes_execution_parameters.image, 89 | image_pull_policy=self.kubernetes_execution_parameters.image_pull_policy, 90 | cmds=["bash", "-c"], 91 | tolerations=self.kubernetes_execution_parameters.tolerations, 92 | annotations=self.kubernetes_execution_parameters.annotations, 93 | arguments=args, 94 | labels=self.kubernetes_execution_parameters.labels, 95 | name=name, 96 | task_id=name, 97 | env_vars=self.kubernetes_execution_parameters.env_vars, 98 | 
secrets=self.kubernetes_execution_parameters.secrets, 99 | is_delete_operator_pod=self.kubernetes_execution_parameters.is_delete_operator_pod, # noqa: E501 100 | hostnetwork=False, 101 | config_file=self.kubernetes_execution_parameters.config_file, 102 | in_cluster=self.kubernetes_execution_parameters.in_cluster, 103 | cluster_context=self.kubernetes_execution_parameters.cluster_context, 104 | startup_timeout_seconds=self.kubernetes_execution_parameters.startup_timeout_seconds, 105 | **airflow_compatibility_dict, 106 | ) 107 | -------------------------------------------------------------------------------- /dbt_airflow_factory/k8s/k8s_parameters.py: -------------------------------------------------------------------------------- 1 | """POD representing Kubernetes operator config file.""" 2 | 3 | from typing import Any, Dict, List, Optional 4 | 5 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 6 | 7 | if IS_FIRST_AIRFLOW_VERSION: 8 | from airflow.contrib.kubernetes.secret import Secret 9 | else: 10 | from airflow.kubernetes.secret import Secret 11 | 12 | 13 | class KubernetesExecutionParameters: 14 | """POD representing Kubernetes operator config file. 15 | 16 | :param image: tag of Docker image you wish to launch. 17 | :type image: str 18 | :param namespace: the namespace to run within Kubernetes. 19 | :type namespace: str 20 | :param image_pull_policy: Specify a policy to cache or always pull an image. 21 | :type image_pull_policy: str 22 | :param node_selectors: A dict containing a group of scheduling rules. 23 | :type node_selectors: dict 24 | :param tolerations: A list of Kubernetes tolerations. 25 | :type tolerations: list 26 | :param labels: labels to apply to the Pod. (templated) 27 | :type labels: dict 28 | :param limit: A dict containing resources limits. 29 | :type limit: dict 30 | :param requests: A dict containing resources requests. 31 | :type requests: dict 32 | :param annotations: non-identifying metadata you can attach to the Pod. 33 | :type annotations: dict 34 | :param envs: Environment variables initialized in the container. (templated) 35 | :type envs: Optional[Dict[str, str]] 36 | :param secrets: Kubernetes secrets to inject in the container. 37 | They can be exposed as environment vars or files in a volume. 38 | :type secrets: List[Secret] 39 | :param is_delete_operator_pod: What to do when the pod reaches its final 40 | state, or the execution is interrupted. If True: delete the pod. 41 | :type is_delete_operator_pod: bool 42 | :param config_file: The path to the Kubernetes config file. (templated) 43 | If not specified, default value is ``~/.kube/config`` 44 | :type config_file: Optional[str] 45 | :param execution_script: Script that will be executed inside pod. 46 | :type execution_script: str 47 | :param in_cluster: Run kubernetes client with in_cluster configuration. 48 | :type in_cluster: bool 49 | :param cluster_context: Context that points to kubernetes cluster. 50 | Ignored when in_cluster is True. If None, current-context is used. 
51 | :type cluster_context: str 52 | """ 53 | 54 | def __init__( 55 | self, 56 | image: str, 57 | namespace: str = "default", 58 | image_pull_policy: Optional[str] = None, 59 | node_selectors: Optional[dict] = None, 60 | tolerations: Optional[list] = None, 61 | labels: Optional[dict] = None, 62 | limit: Optional[dict] = None, 63 | requests: Optional[dict] = None, 64 | annotations: Optional[dict] = None, 65 | envs: Optional[Dict[str, str]] = None, 66 | secrets: Optional[List[Secret]] = None, 67 | is_delete_operator_pod: bool = True, 68 | config_file: Optional[str] = None, 69 | execution_script: str = "dbt --no-write-json", 70 | in_cluster: Optional[bool] = None, 71 | cluster_context: Optional[str] = None, 72 | startup_timeout_seconds: int = 120, 73 | **kwargs: Any, 74 | ) -> None: 75 | self.namespace = namespace 76 | self.image = image 77 | self.image_pull_policy = image_pull_policy 78 | self.node_selectors = node_selectors 79 | self.tolerations = tolerations 80 | self.labels = labels 81 | self._limit = limit 82 | self._requests = requests 83 | self.annotations = annotations 84 | self._env_vars = envs 85 | self._secrets = secrets 86 | self.is_delete_operator_pod = is_delete_operator_pod 87 | self.config_file = config_file 88 | self.execution_script = execution_script 89 | self.in_cluster = in_cluster 90 | self.cluster_context = cluster_context 91 | self.startup_timeout_seconds = startup_timeout_seconds 92 | 93 | @property 94 | def resources(self): # type: ignore 95 | """ 96 | Return dict containing resources requests and limits. 97 | 98 | In the Airflow 1, it was expected to be a real dictionary with 99 | ``request_memory``, ``request_cpu``, ``limit_memory``, and ``limit_cpu`` 100 | as keys. So for Airflow 1, the function returns such a dictionary. 101 | 102 | Beginning with Airflow 2, :class:`KubernetesPodOperator` expects 103 | ``V1ResourceRequirements`` class instead. Hence, for Airflow 2, the 104 | function returns instance of this class. 105 | 106 | :return: Dictionary containing resources requests and limits. 107 | """ 108 | if IS_FIRST_AIRFLOW_VERSION: 109 | return { 110 | "limit_memory": self._limit["memory"] if self._limit else None, 111 | "limit_cpu": self._limit["cpu"] if self._limit else None, 112 | "request_memory": self._requests["memory"] if self._requests else None, 113 | "request_cpu": self._requests["cpu"] if self._requests else None, 114 | } 115 | else: 116 | from kubernetes.client import models as k8s 117 | 118 | return k8s.V1ResourceRequirements(limits=self._limit, requests=self._requests) 119 | 120 | @property 121 | def env_vars(self): # type: ignore 122 | """ 123 | Return dict containing environment variables to set in Kubernetes. 124 | 125 | For Airflow 1, the function returns a dictionary. 126 | 127 | Beginning with Airflow 2, :class:`KubernetesPodOperator` expects 128 | a list of ``V1EnvVar`` instances instead. Hence, for Airflow 2, the 129 | function returns a ``List[k8s.V1EnvVar]``. 130 | 131 | :return: Dictionary or list containing environment variables. 132 | """ 133 | if self._env_vars is None: 134 | return None 135 | 136 | if IS_FIRST_AIRFLOW_VERSION: 137 | return self._env_vars 138 | else: 139 | from kubernetes.client import models as k8s 140 | 141 | return [k8s.V1EnvVar(k, v) for k, v in self._env_vars.items()] 142 | 143 | @property 144 | def secrets(self) -> Optional[List[Secret]]: 145 | """ 146 | Return list containing secrets to be set in Kubernetes. 
147 | :return List containing kubernetes Secrets 148 | """ 149 | if self._secrets is None: 150 | return None 151 | 152 | return [Secret(**secret) for secret in self._secrets] 153 | -------------------------------------------------------------------------------- /dbt_airflow_factory/k8s/k8s_parameters_loader.py: -------------------------------------------------------------------------------- 1 | """POD representing Kubernetes operator config file.""" 2 | 3 | from dbt_airflow_factory.config_utils import read_config 4 | from dbt_airflow_factory.k8s.k8s_parameters import KubernetesExecutionParameters 5 | 6 | 7 | class KubernetesExecutionParametersLoader: 8 | @staticmethod 9 | def create_config( 10 | dag_path: str, env: str, execution_env_config_file_name: str 11 | ) -> KubernetesExecutionParameters: 12 | config = read_config(dag_path, env, "k8s.yml") 13 | config = KubernetesExecutionParametersLoader._update_config_if_datahub_exits( 14 | config, read_config(dag_path, env, "datahub.yml") 15 | ) 16 | config.update(read_config(dag_path, env, execution_env_config_file_name)) 17 | config["image"] = KubernetesExecutionParametersLoader._prepare_image(config["image"]) 18 | config.update(config.pop("resources")) 19 | return KubernetesExecutionParameters(**config) 20 | 21 | @staticmethod 22 | def _prepare_image(config: dict) -> str: 23 | return config["repository"] + ":" + str(config["tag"]) 24 | 25 | @staticmethod 26 | def _update_config_if_datahub_exits(config: dict, datahub_config: dict) -> dict: 27 | if datahub_config: 28 | config["envs"].update({"DATAHUB_GMS_URL": datahub_config["sink"]["config"]["server"]}) 29 | return config 30 | -------------------------------------------------------------------------------- /dbt_airflow_factory/notifications/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/dbt_airflow_factory/notifications/__init__.py -------------------------------------------------------------------------------- /dbt_airflow_factory/notifications/handler.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import Any 3 | from urllib.parse import quote_plus 4 | 5 | from dbt_airflow_factory.constants import ( 6 | IS_AIRFLOW_NEWER_THAN_2_4, 7 | IS_FIRST_AIRFLOW_VERSION, 8 | ) 9 | 10 | if IS_FIRST_AIRFLOW_VERSION: 11 | from airflow.contrib.operators.slack_webhook_operator import SlackWebhookOperator 12 | else: 13 | from airflow.providers.slack.operators.slack_webhook import SlackWebhookOperator 14 | 15 | if IS_AIRFLOW_NEWER_THAN_2_4: 16 | from airflow.hooks.base import BaseHook 17 | else: 18 | from airflow.hooks.base_hook import BaseHook 19 | 20 | from dbt_airflow_factory.notifications.ms_teams_webhook_operator import ( 21 | MSTeamsWebhookOperator, 22 | ) 23 | 24 | 25 | class NotificationHandlersFactory: 26 | def create_failure_handler(self, handlers_config: dict) -> Callable: 27 | def failure_handler(context: Any) -> None: 28 | for handler_definition in handlers_config: 29 | if handler_definition["type"] == "slack": 30 | connection = BaseHook.get_connection(handler_definition["connection_id"]) 31 | return SlackWebhookOperator( 32 | task_id="slack_failure_notification", 33 | message=handler_definition["message_template"].format( 34 | task=context.get("task_instance").task_id, 35 | dag=context.get("task_instance").dag_id, 36 | 
execution_time=context.get("execution_date"), 37 | url=context.get("task_instance").log_url, 38 | ), 39 | http_conn_id=handler_definition["connection_id"], 40 | webhook_token=connection.password, 41 | username=connection.login, 42 | ).execute(context=context) 43 | elif handler_definition["type"] == "teams": 44 | webserver_url = handler_definition["webserver_url"] 45 | webserver_url = ( 46 | webserver_url[:-1] if webserver_url.endswith("/") else webserver_url 47 | ) 48 | dag_id = context.get("task_instance").dag_id 49 | task_id = context.get("task_instance").task_id 50 | context["task_instance"].xcom_push(key=dag_id, value=True) 51 | query = quote_plus( 52 | f"log?dag_id={dag_id}&task_id={task_id}&execution_date={context['ts']}", 53 | safe="=&?", 54 | ) 55 | logs_url = f"{webserver_url}/{query}" 56 | 57 | teams_notification = MSTeamsWebhookOperator( 58 | task_id="teams_failure_notification", 59 | message=handler_definition["message_template"].format( 60 | task=task_id, 61 | dag=dag_id, 62 | execution_time=context.get("execution_date"), 63 | url=logs_url, 64 | ), 65 | button_text="View log", 66 | button_url=logs_url, 67 | theme_color="FF0000", 68 | http_conn_id=handler_definition["connection_id"], 69 | ) 70 | return teams_notification.execute(context) 71 | 72 | return failure_handler 73 | -------------------------------------------------------------------------------- /dbt_airflow_factory/notifications/ms_teams_webhook_hook.py: -------------------------------------------------------------------------------- 1 | """ 2 | MS Teams webhook implementation. 3 | """ 4 | 5 | from typing import Any, Optional 6 | 7 | from airflow.exceptions import AirflowException 8 | 9 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 10 | 11 | if IS_FIRST_AIRFLOW_VERSION: 12 | from airflow.hooks.http_hook import HttpHook 13 | else: 14 | from airflow.providers.http.hooks.http import HttpHook 15 | 16 | 17 | # Credits: https://code.mendhak.com/Airflow-MS-Teams-Operator/ 18 | class MSTeamsWebhookHook(HttpHook): 19 | """ 20 | This hook allows you to post messages to MS Teams using the Incoming Webhook connector. 21 | 22 | Takes both MS Teams webhook token directly and connection that has MS Teams webhook token. 23 | If both supplied, the webhook token will be appended to the host in the connection. 
24 | 25 | :param http_conn_id: connection that has MS Teams webhook URL 26 | :type http_conn_id: str 27 | :param webhook_token: MS Teams webhook token 28 | :type webhook_token: str 29 | :param message: The message you want to send on MS Teams 30 | :type message: str 31 | :param subtitle: The subtitle of the message to send 32 | :type subtitle: str 33 | :param button_text: The text of the action button 34 | :type button_text: str 35 | :param button_url: The URL for the action button click 36 | :type button_url : str 37 | :param theme_color: Hex code of the card theme, without the # 38 | :type message: str 39 | :param proxy: Proxy to use when making the webhook request 40 | :type proxy: str 41 | 42 | """ 43 | 44 | def __init__( 45 | self, 46 | http_conn_id: Optional[str] = None, 47 | webhook_token: Optional[str] = None, 48 | message: str = "", 49 | subtitle: str = "", 50 | button_text: str = "", 51 | button_url: str = "", 52 | theme_color: str = "00FF00", 53 | proxy: Optional[str] = None, 54 | **kwargs: Any 55 | ) -> None: 56 | super(MSTeamsWebhookHook, self).__init__(**kwargs) 57 | self.http_conn_id = http_conn_id 58 | self.webhook_token = self.get_token(webhook_token, http_conn_id) 59 | self.message = message 60 | self.subtitle = subtitle 61 | self.button_text = button_text 62 | self.button_url = button_url 63 | self.theme_color = theme_color 64 | self.proxy = proxy 65 | 66 | def get_proxy(self, http_conn_id: Any) -> str: 67 | """ 68 | Return proxy address from connection object 69 | :param http_conn_id: The conn_id provided 70 | :return: proxy address (str) to use 71 | """ 72 | conn = self.get_connection(http_conn_id) 73 | extra = conn.extra_dejson 74 | return extra.get("proxy", "") 75 | 76 | def get_token(self, token: Optional[str], http_conn_id: Optional[str]) -> str: 77 | """ 78 | Given either a manually set token or a conn_id, return the webhook_token to use 79 | :param token: The manually provided token 80 | :param http_conn_id: The conn_id provided 81 | :return: webhook_token (str) to use 82 | """ 83 | if token: 84 | return token 85 | 86 | if http_conn_id: 87 | conn = self.get_connection(http_conn_id) 88 | extra = conn.extra_dejson 89 | return extra.get("webhook_token", "") 90 | 91 | raise AirflowException("Cannot get URL: No valid MS Teams webhook URL nor conn_id supplied") 92 | 93 | def build_message(self) -> str: 94 | """ 95 | Builds payload for MS Teams webhook. 
96 | """ 97 | card_json = """ 98 | {{ 99 | "@type": "MessageCard", 100 | "@context": "http://schema.org/extensions", 101 | "themeColor": "{3}", 102 | "summary": "{0}", 103 | "sections": [{{ 104 | "activityTitle": "{1}", 105 | "activitySubtitle": "{2}", 106 | "markdown": true, 107 | "potentialAction": [ 108 | {{ 109 | "@type": "OpenUri", 110 | "name": "{4}", 111 | "targets": [ 112 | {{ "os": "default", "uri": "{5}" }} 113 | ] 114 | }} 115 | ] 116 | }}] 117 | }} 118 | """ 119 | return card_json.format( 120 | self.message, 121 | self.message, 122 | self.subtitle, 123 | self.theme_color, 124 | self.button_text, 125 | self.button_url, 126 | ) 127 | 128 | def execute(self) -> None: 129 | """ 130 | Remote Popen (actually execute the webhook call) 131 | 132 | :param cmd: command to remotely execute 133 | :param kwargs: extra arguments to Popen (see subprocess.Popen) 134 | """ 135 | proxies = {} 136 | proxy_url = self.get_proxy(self.http_conn_id) 137 | 138 | if len(proxy_url) > 5: 139 | proxies = {"https": proxy_url} 140 | 141 | self.run( 142 | endpoint=self.webhook_token, 143 | data=self.build_message(), 144 | headers={"Content-type": "application/json"}, 145 | extra_options={"proxies": proxies}, 146 | ) 147 | -------------------------------------------------------------------------------- /dbt_airflow_factory/notifications/ms_teams_webhook_operator.py: -------------------------------------------------------------------------------- 1 | """ 2 | MS Teams webhook operator. 3 | """ 4 | 5 | import logging 6 | from typing import Any, Optional 7 | 8 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 9 | 10 | if IS_FIRST_AIRFLOW_VERSION: 11 | from airflow.providers.http.operators.http import SimpleHttpOperator 12 | else: 13 | from airflow.operators.http_operator import SimpleHttpOperator 14 | 15 | from airflow.utils.decorators import apply_defaults 16 | 17 | from dbt_airflow_factory.notifications.ms_teams_webhook_hook import MSTeamsWebhookHook 18 | 19 | 20 | # Credits: https://code.mendhak.com/Airflow-MS-Teams-Operator/ 21 | class MSTeamsWebhookOperator(SimpleHttpOperator): 22 | """ 23 | This operator allows you to post messages to MS Teams using the Incoming Webhooks connector. 24 | Takes both MS Teams webhook token directly and connection that has MS Teams webhook token. 25 | If both supplied, the webhook token will be appended to the host in the connection. 
26 | 27 | :param http_conn_id: connection that has MS Teams webhook URL 28 | :type http_conn_id: str 29 | :param webhook_token: MS Teams webhook token 30 | :type webhook_token: str 31 | :param message: The message you want to send on MS Teams 32 | :type message: str 33 | :param subtitle: The subtitle of the message to send 34 | :type subtitle: str 35 | :param button_text: The text of the action button 36 | :type button_text: str 37 | :param button_url: The URL for the action button click 38 | :type button_url : str 39 | :param theme_color: Hex code of the card theme, without the # 40 | :type message: str 41 | :param proxy: Proxy to use when making the webhook request 42 | :type proxy: str 43 | """ 44 | 45 | template_fields = ( 46 | "message", 47 | "subtitle", 48 | ) 49 | 50 | @apply_defaults 51 | def __init__( 52 | self, 53 | http_conn_id: Optional[str] = None, 54 | webhook_token: Optional[str] = None, 55 | message: str = "", 56 | subtitle: str = "", 57 | button_text: str = "", 58 | button_url: str = "", 59 | theme_color: str = "00FF00", 60 | proxy: Optional[str] = None, 61 | **kwargs: Any, 62 | ) -> None: 63 | super(MSTeamsWebhookOperator, self).__init__(**kwargs) 64 | self.http_conn_id = http_conn_id 65 | self.webhook_token = webhook_token 66 | self.message = message 67 | self.subtitle = subtitle 68 | self.button_text = button_text 69 | self.button_url = button_url 70 | self.theme_color = theme_color 71 | self.proxy = proxy 72 | 73 | def execute(self, context: Any) -> None: 74 | """ 75 | Call the webhook with the required parameters 76 | """ 77 | MSTeamsWebhookHook( 78 | self.http_conn_id, 79 | self.webhook_token, 80 | self.message, 81 | self.subtitle, 82 | self.button_text, 83 | self.button_url, 84 | self.theme_color, 85 | self.proxy, 86 | ).execute() 87 | logging.info("Webhook request sent to MS Teams") 88 | -------------------------------------------------------------------------------- /dbt_airflow_factory/operator.py: -------------------------------------------------------------------------------- 1 | """Factories creating Airflow Operators running DBT tasks.""" 2 | 3 | import abc 4 | from typing import List, Optional 5 | 6 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 7 | 8 | if IS_FIRST_AIRFLOW_VERSION: 9 | from airflow.operators.dummy_operator import DummyOperator 10 | else: 11 | from airflow.operators.dummy import DummyOperator 12 | 13 | from airflow.models.baseoperator import BaseOperator 14 | 15 | 16 | class DbtRunOperatorBuilder(metaclass=abc.ABCMeta): 17 | """ 18 | Base class of a factory creating Airflow 19 | :class:`airflow.models.baseoperator.BaseOperator` running a single DBT task. 20 | """ 21 | 22 | @abc.abstractmethod 23 | def create( 24 | self, 25 | name: str, 26 | command: str, 27 | model: Optional[str] = None, 28 | additional_dbt_args: Optional[List[str]] = None, 29 | ) -> BaseOperator: 30 | """ 31 | Create Airflow Operator running a single DBT task. 32 | 33 | :param name: task name. 34 | :type name: str 35 | :param command: DBT command to run. 36 | :type command: str 37 | :param model: models to include. 38 | :type model: Optional[str] 39 | :param additional_dbt_args: Additional arguments to pass to dbt. 40 | :type additional_dbt_args: Optional[List[str]] 41 | :return: Airflow Operator running a single DBT task. 42 | :rtype: BaseOperator 43 | """ 44 | raise NotImplementedError 45 | 46 | 47 | class EphemeralOperator(DummyOperator): 48 | """ 49 | :class:`DummyOperator` representing ephemeral DBT model. 
50 | """ 51 | 52 | ui_color = "#F3E4F7" 53 | -------------------------------------------------------------------------------- /dbt_airflow_factory/tasks.py: -------------------------------------------------------------------------------- 1 | """Classes representing tasks corresponding to a single DBT model.""" 2 | 3 | from typing import Dict, Iterable, List, Optional 4 | 5 | from airflow.models.baseoperator import BaseOperator 6 | 7 | 8 | class ModelExecutionTask: 9 | """ 10 | Wrapper around tasks corresponding to a single DBT model. 11 | 12 | :param execution_airflow_task: Operator running DBT's ``run`` task. 13 | :type execution_airflow_task: BaseOperator 14 | :param test_airflow_task: Operator running DBT's ``test`` task (optional). 15 | :type test_airflow_task: BaseOperator 16 | :param task_group: TaskGroup consisting of ``run`` and ``test`` tasks 17 | (if Airflow version is at least 2). 18 | """ 19 | 20 | def __init__( # type: ignore 21 | self, 22 | execution_airflow_task: BaseOperator, 23 | test_airflow_task: Optional[BaseOperator] = None, 24 | task_group=None, 25 | ) -> None: 26 | self.execution_airflow_task = execution_airflow_task 27 | self.test_airflow_task = test_airflow_task 28 | self.task_group = task_group 29 | 30 | def __repr__(self) -> str: 31 | return ( 32 | repr(self.task_group) 33 | if self.task_group 34 | else repr( 35 | [self.execution_airflow_task] 36 | + ([self.test_airflow_task] if self.test_airflow_task else []) 37 | ) 38 | ) 39 | 40 | def get_start_task(self): # type: ignore 41 | """ 42 | Return model's first task. 43 | 44 | It is either a whole TaskGroup or ``run`` task. 45 | """ 46 | return self.task_group or self.execution_airflow_task 47 | 48 | def get_end_task(self): # type: ignore 49 | """ 50 | Return model's last task. 51 | 52 | It is either a whole TaskGroup, ``test`` task, or ``run`` task, depending 53 | on version of Airflow and existence of ``test`` task. 54 | """ 55 | return self.task_group or self.test_airflow_task or self.execution_airflow_task 56 | 57 | 58 | class ModelExecutionTasks: 59 | """ 60 | Dictionary of all Operators corresponding to DBT tasks. 61 | 62 | :param tasks: Dictionary of model tasks. 63 | :type tasks: Dict[str, ModelExecutionTask] 64 | :param starting_task_names: List of names of initial tasks (DAG sources). 65 | :type starting_task_names: List[str] 66 | :param ending_task_names: List of names of ending tasks (DAG sinks). 67 | :type ending_task_names: List[str] 68 | """ 69 | 70 | def __init__( 71 | self, 72 | tasks: Dict[str, ModelExecutionTask], 73 | starting_task_names: List[str], 74 | ending_task_names: List[str], 75 | ) -> None: 76 | self._tasks = tasks 77 | self._starting_task_names = starting_task_names 78 | self._ending_task_names = ending_task_names 79 | 80 | def __repr__(self) -> str: 81 | return f"ModelExecutionTasks(\n {self._tasks} \n)" 82 | 83 | def get_task(self, node_name: str) -> ModelExecutionTask: 84 | """ 85 | Return :class:`ModelExecutionTask` for given model's **node_name**. 86 | 87 | :param node_name: Name of the task. 88 | :type node_name: str 89 | :return: Wrapper around tasks corresponding to a given model. 90 | :rtype: ModelExecutionTask 91 | """ 92 | return self._tasks[node_name] 93 | 94 | def length(self) -> int: 95 | """Count TaskGroups corresponding to a single DBT model.""" 96 | return len(self._tasks) 97 | 98 | def get_starting_tasks(self) -> List[ModelExecutionTask]: 99 | """ 100 | Get a list of all DAG sources. 101 | 102 | :return: List of all DAG sources. 
103 | :rtype: List[ModelExecutionTask] 104 | """ 105 | return self._extract_by_keys(self._starting_task_names) 106 | 107 | def get_ending_tasks(self) -> List[ModelExecutionTask]: 108 | """ 109 | Get a list of all DAG sinks. 110 | 111 | :return: List of all DAG sinks. 112 | :rtype: List[ModelExecutionTask] 113 | """ 114 | return self._extract_by_keys(self._ending_task_names) 115 | 116 | def _extract_by_keys(self, keys: Iterable[str]) -> List[ModelExecutionTask]: 117 | tasks = [] 118 | for key in keys: 119 | tasks.append(self._tasks[key]) 120 | return tasks 121 | -------------------------------------------------------------------------------- /dbt_airflow_factory/tasks_builder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/dbt_airflow_factory/tasks_builder/__init__.py -------------------------------------------------------------------------------- /dbt_airflow_factory/tasks_builder/builder.py: -------------------------------------------------------------------------------- 1 | """Class parsing ``manifest.json`` into Airflow tasks.""" 2 | import json 3 | import logging 4 | from typing import Any, ContextManager, Dict, Tuple 5 | 6 | from airflow.models.baseoperator import BaseOperator 7 | from airflow.operators.dummy import DummyOperator 8 | from airflow.sensors.external_task_sensor import ExternalTaskSensor 9 | 10 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 11 | from dbt_airflow_factory.tasks_builder.parameters import TasksBuildingParameters 12 | 13 | if not IS_FIRST_AIRFLOW_VERSION: 14 | from airflow.utils.task_group import TaskGroup 15 | 16 | from dbt_graph_builder.builder import GraphConfiguration, create_tasks_graph 17 | from dbt_graph_builder.gateway import GatewayConfiguration 18 | from dbt_graph_builder.graph import DbtManifestGraph 19 | from dbt_graph_builder.node_type import NodeType 20 | 21 | from dbt_airflow_factory.operator import DbtRunOperatorBuilder, EphemeralOperator 22 | from dbt_airflow_factory.tasks import ModelExecutionTask, ModelExecutionTasks 23 | 24 | 25 | class DbtAirflowTasksBuilder: 26 | """ 27 | Parses ``manifest.json`` into Airflow tasks. 28 | 29 | :param airflow_config: DBT node operator. 30 | :type airflow_config: TasksBuildingParameters 31 | :param operator_builder: DBT node operator. 32 | :type operator_builder: DbtRunOperatorBuilder 33 | :param gateway_config: DBT node operator. 34 | :type gateway_config: GatewayConfiguration 35 | """ 36 | 37 | def __init__( 38 | self, 39 | airflow_config: TasksBuildingParameters, 40 | operator_builder: DbtRunOperatorBuilder, 41 | gateway_config: GatewayConfiguration, 42 | ): 43 | self.operator_builder = operator_builder 44 | self.airflow_config = airflow_config 45 | self.gateway_config = gateway_config 46 | 47 | def parse_manifest_into_tasks(self, manifest_path: str) -> ModelExecutionTasks: 48 | """ 49 | Parse ``manifest.json`` into tasks. 50 | 51 | :param manifest_path: Path to ``manifest.json``. 52 | :type manifest_path: str 53 | :return: Dictionary of tasks created from ``manifest.json`` parsing. 54 | :rtype: ModelExecutionTasks 55 | """ 56 | return self._make_dbt_tasks(manifest_path) 57 | 58 | def create_seed_task(self) -> BaseOperator: 59 | """ 60 | Create ``dbt_seed`` task. 61 | 62 | :return: Operator for ``dbt_seed`` task. 
63 | :rtype: BaseOperator 64 | """ 65 | return self.operator_builder.create("dbt_seed", "seed") 66 | 67 | @staticmethod 68 | def _load_dbt_manifest(manifest_path: str) -> dict: 69 | with open(manifest_path, "r") as f: 70 | manifest_content = json.load(f) 71 | logging.debug("Manifest content: " + str(manifest_content)) 72 | return manifest_content 73 | 74 | def _make_dbt_test_task(self, model_name: str, is_in_task_group: bool) -> BaseOperator: 75 | command = "test" 76 | return self.operator_builder.create( 77 | self._build_task_name(model_name, command, is_in_task_group), 78 | command, 79 | model_name, 80 | additional_dbt_args=["--indirect-selection=cautious"], 81 | ) 82 | 83 | def _make_dbt_multiple_deps_test_task( 84 | self, test_names: str, dependency_tuple_str: str 85 | ) -> BaseOperator: 86 | command = "test" 87 | return self.operator_builder.create(dependency_tuple_str, command, test_names) 88 | 89 | def _make_dbt_run_task(self, model_name: str, is_in_task_group: bool) -> BaseOperator: 90 | command = "run" 91 | return self.operator_builder.create( 92 | self._build_task_name(model_name, command, is_in_task_group), 93 | command, 94 | model_name, 95 | ) 96 | 97 | @staticmethod 98 | def _build_task_name(model_name: str, command: str, is_in_task_group: bool) -> str: 99 | return command if is_in_task_group else f"{model_name}_{command}" 100 | 101 | @staticmethod 102 | def _create_task_group_for_model( 103 | model_name: str, use_task_group: bool 104 | ) -> Tuple[Any, ContextManager]: 105 | import contextlib 106 | 107 | task_group = ( 108 | None 109 | if (IS_FIRST_AIRFLOW_VERSION or not use_task_group) 110 | else TaskGroup(group_id=model_name) 111 | ) 112 | task_group_ctx = task_group or contextlib.nullcontext() 113 | return task_group, task_group_ctx 114 | 115 | def _create_task_for_model( 116 | self, 117 | model_name: str, 118 | use_task_group: bool, 119 | ) -> ModelExecutionTask: 120 | (task_group, task_group_ctx) = self._create_task_group_for_model(model_name, use_task_group) 121 | is_in_task_group = task_group is not None 122 | with task_group_ctx: 123 | run_task = self._make_dbt_run_task(model_name, is_in_task_group) 124 | test_task = self._make_dbt_test_task(model_name, is_in_task_group) 125 | # noinspection PyStatementEffect 126 | run_task >> test_task 127 | return ModelExecutionTask(run_task, test_task, task_group) 128 | 129 | def _create_task_from_graph_node( 130 | self, node_name: str, node: Dict[str, Any] 131 | ) -> ModelExecutionTask: 132 | if node["node_type"] == NodeType.MULTIPLE_DEPS_TEST: 133 | return ModelExecutionTask( 134 | self._make_dbt_multiple_deps_test_task(node["select"], node_name), None 135 | ) 136 | if node["node_type"] == NodeType.SOURCE_SENSOR: 137 | return self._create_dag_sensor(node) 138 | if node["node_type"] == NodeType.MOCK_GATEWAY: 139 | return self._create_dummy_task(node) 140 | if node["node_type"] == NodeType.EPHEMERAL: 141 | return ModelExecutionTask( 142 | EphemeralOperator(task_id=f"{node['select']}__ephemeral"), None 143 | ) 144 | return self._create_task_for_model( 145 | node["select"], 146 | self.airflow_config.use_task_group, 147 | ) 148 | 149 | def _create_tasks_from_graph(self, dbt_airflow_graph: DbtManifestGraph) -> ModelExecutionTasks: 150 | result_tasks = { 151 | node_name: self._create_task_from_graph_node(node_name, node) 152 | for node_name, node in dbt_airflow_graph.get_graph_nodes() 153 | } 154 | for node, neighbour in dbt_airflow_graph.get_graph_edges(): 155 | # noinspection PyStatementEffect 156 | (result_tasks[node].get_end_task() 
>> result_tasks[neighbour].get_start_task()) 157 | return ModelExecutionTasks( 158 | result_tasks, 159 | dbt_airflow_graph.get_graph_sources(), 160 | dbt_airflow_graph.get_graph_sinks(), 161 | ) 162 | 163 | def _make_dbt_tasks(self, manifest_path: str) -> ModelExecutionTasks: 164 | manifest = self._load_dbt_manifest(manifest_path) 165 | dbt_airflow_graph: DbtManifestGraph = create_tasks_graph( 166 | manifest, 167 | GraphConfiguration( 168 | gateway_config=self.gateway_config, 169 | enable_dags_dependencies=self.airflow_config.enable_dags_dependencies, 170 | show_ephemeral_models=self.airflow_config.show_ephemeral_models, 171 | check_all_deps_for_multiple_deps_tests=self.airflow_config.check_all_deps_for_multiple_deps_tests, 172 | ), 173 | ) 174 | tasks_with_context = self._create_tasks_from_graph(dbt_airflow_graph) 175 | logging.info(f"Created {str(tasks_with_context.length())} tasks groups") 176 | return tasks_with_context 177 | 178 | def _create_dag_sensor(self, node: Dict[str, Any]) -> ModelExecutionTask: 179 | # todo move parameters to configuration 180 | return ModelExecutionTask( 181 | ExternalTaskSensor( 182 | task_id="sensor_" + node["select"], 183 | external_dag_id=node["dag"], 184 | external_task_id=node["select"] 185 | + (".test" if self.airflow_config.use_task_group else "_test"), 186 | timeout=24 * 60 * 60, 187 | allowed_states=["success"], 188 | failed_states=["failed", "skipped"], 189 | mode="reschedule", 190 | ) 191 | ) 192 | 193 | @staticmethod 194 | def _create_dummy_task(node: Dict[str, Any]) -> ModelExecutionTask: 195 | return ModelExecutionTask(DummyOperator(task_id=node["select"])) 196 | -------------------------------------------------------------------------------- /dbt_airflow_factory/tasks_builder/parameters.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass(frozen=True) 5 | class TasksBuildingParameters: 6 | use_task_group: bool = True 7 | show_ephemeral_models: bool = True 8 | enable_dags_dependencies: bool = False 9 | check_all_deps_for_multiple_deps_tests: bool = False 10 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -Ea 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | apidoc: 16 | @sphinx-apidoc ../dbt_airflow_factory -o source -fMT -d 1 17 | 18 | .PHONY: help apidoc Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | If you are looking for information on a specific function, class, or 5 | method, this part of the documentation is for you. 6 | 7 | .. 
toctree:: 8 | :maxdepth: 1 9 | 10 | source/dbt_airflow_factory 11 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | ``` -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Data Pipelines CLI 2 | ----- 3 | 4 | **DBT Airflow Factory** works best in tandem with `data-pipelines-cli `_ 5 | tool. **dp** not only prepares directory for the library to digest, but also automates Docker image building and pushes 6 | generated directory to the cloud storage of your choice. 7 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import os 10 | import sys 11 | 12 | sys.path.insert(0, os.path.abspath("..")) 13 | 14 | 15 | # -- Project information ----------------------------------------------------- 16 | 17 | project = "dbt-airflow-factory" 18 | copyright = "2021, GetInData" 19 | author = "GetInData" 20 | 21 | 22 | # -- General configuration --------------------------------------------------- 23 | 24 | # Add any Sphinx extension module names here, as strings. They can be 25 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 26 | # ones. 27 | extensions = [ 28 | "sphinx.ext.autodoc", 29 | "sphinx.ext.viewcode", 30 | "sphinx_click.ext", 31 | "myst_parser", 32 | ] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ["_templates"] 36 | 37 | # List of patterns, relative to source directory, that match files and 38 | # directories to ignore when looking for source files. 39 | # This pattern also affects html_static_path and html_extra_path. 40 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 41 | 42 | add_module_names = False 43 | 44 | 45 | # -- Options for HTML output ------------------------------------------------- 46 | 47 | # The theme to use for HTML and HTML Help pages. See the documentation for 48 | # a list of builtin themes. 49 | # 50 | html_theme = "sphinx_rtd_theme" 51 | 52 | # Add any paths that contain custom static files (such as style sheets) here, 53 | # relative to this directory. They are copied after the builtin static files, 54 | # so a file named "default.css" will overwrite the builtin "default.css". 55 | html_static_path = ["_static"] 56 | -------------------------------------------------------------------------------- /docs/configuration.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ----- 3 | 4 | Description 5 | +++++++++++++++++++ 6 | 7 | airflow.yml file 8 | ~~~~~~~~~~~~~~~~~~~~~~~ 9 | .. 
list-table::
10 |    :widths: 25 20 2 13 40
11 |    :header-rows: 1
12 | 
13 |    * - Parameter
14 |      - Data type
15 |      - Required
16 |      - Default
17 |      - Description
18 |    * - default_args
19 |      - dictionary
20 |      - x
21 |      - 
22 |      - Values that are passed to the DAG as ``default_args`` (check the Airflow documentation for more details)
23 |    * - dag
24 |      - dictionary
25 |      - x
26 |      - 
27 |      - Values used for DAG creation. Currently ``dag_id``, ``description``, ``schedule_interval`` and ``catchup`` are supported. Check the Airflow documentation for more details about each of them.
28 |    * - seed_task
29 |      - boolean
30 |      - 
31 |      - False
32 |      - Enables the first task of the DAG, responsible for executing the *dbt seed* command to load seed data.
33 |    * - manifest_file_name
34 |      - string
35 |      - 
36 |      - manifest.json
37 |      - Name of the file with the DBT manifest.
38 |    * - use_task_group
39 |      - boolean
40 |      - 
41 |      - False
42 |      - Enables grouping of ``dbt run`` and ``dbt test`` into Airflow's Task Groups. It is only available in Airflow 2+ (check the Airflow documentation for more details).
43 |    * - show_ephemeral_models
44 |      - boolean
45 |      - 
46 |      - True
47 |      - Enables/disables separate tasks for DBT's ephemeral models. These tasks finish in seconds, as they have nothing to do.
48 |    * - failure_handlers
49 |      - list
50 |      - 
51 |      - empty list
52 |      - Each item of the list contains the configuration of a notification handler used when a task or the DAG fails. Each item is a dictionary with the following fields:
53 |        ``type`` (type of handler, e.g. *slack* or *teams*), ``webserver_url`` (Airflow Webserver URL), ``connection_id`` (id of the Airflow connection) and ``message_template`` that will be sent.
54 |        More on how to configure the webhooks can be found here: `Slack `_ or `MS Teams `_
55 |    * - enable_project_dependencies
56 |      - boolean
57 |      - 
58 |      - False
59 |      - When True, it creates sensors for all sources that have a DAG name in their metadata. The sensors wait for the selected DAGs to finish.
60 |    * - save_points
61 |      - list of strings
62 |      - 
63 |      - empty list
64 |      - List of schemas between which the gateway should be created.
65 | 
66 | dbt.yml file
67 | ~~~~~~~~~~~~~~~~~~~~~~~
68 | .. list-table::
69 |    :widths: 25 20 2 13 40
70 |    :header-rows: 1
71 | 
72 |    * - Parameter
73 |      - Data type
74 |      - Required
75 |      - Default
76 |      - Description
77 |    * - target
78 |      - string
79 |      - x
80 |      - 
81 |      - Name of the **dbt** target environment (passed to **dbt** as ``--target``).
82 |    * - project_dir_path
83 |      - string
84 |      - 
85 |      - /dbt
86 |      - Path to the **dbt** project directory.
87 |    * - profile_dir_path
88 |      - string
89 |      - 
90 |      - /root/.dbt
91 |      - Path to the directory containing ``profiles.yml``.
92 |    * - vars
93 |      - dictionary
94 |      - 
95 |      - 
96 |      - Dictionary of variables passed to DBT tasks.
97 | 
98 | execution_env.yml file
99 | ~~~~~~~~~~~~~~~~~~~~~~~
100 | 
101 | .. list-table::
102 |    :widths: 25 20 2 53
103 |    :header-rows: 1
104 | 
105 |    * - Parameter
106 |      - Data type
107 |      - Required
108 |      - Description
109 |    * - image.repository
110 |      - string
111 |      - x
112 |      - Docker image repository URL
113 |    * - image.tag
114 |      - string
115 |      - x
116 |      - Docker image tag
117 |    * - type
118 |      - string
119 |      - x
120 |      - Selects the type of execution environment. Currently ``k8s`` and ``bash`` are supported (``ecs`` is declared but not yet implemented).
121 | 
122 | k8s.yml file
123 | ~~~~~~~~~~~~~~~~~~~~~~~
124 | 
125 | .. 
list-table:: 126 | :widths: 25 20 2 53 127 | :header-rows: 1 128 | 129 | * - Parameter 130 | - Data type 131 | - Required 132 | - Description 133 | * - image_pull_policy 134 | - string 135 | - x 136 | - See kubernetes documentation for details: https://kubernetes.io/docs/concepts/containers/images/#image-pull-policy 137 | * - namespace 138 | - string 139 | - x 140 | - Name of the namespace to run processing 141 | * - envs 142 | - dictionary 143 | - 144 | - Environment variables that will be passed to container 145 | * - secrets 146 | - list of dictionaries 147 | - 148 | - List that contains secrets mounted to each container. It is required to set ``secret`` as name, ``deploy_type`` (env or volume) and ``deploy_target`` which is path for volume type and name for envs. 149 | * - labels 150 | - dictionary 151 | - 152 | - Dictionary that contains labels set on created pods 153 | * - annotations 154 | - dictionary 155 | - 156 | - Annotations applied to created pods 157 | * - is_delete_operator_pod 158 | - boolean 159 | - 160 | - If set to True finished containers will be deleted 161 | * - config_file 162 | - string 163 | - 164 | - Path to the k8s configuration available in Airflow 165 | * - resources.node_selectors 166 | - dictionary 167 | - 168 | - See more details in Kubernetes documentation: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector 169 | * - resources.tolerations 170 | - list of dictionaries 171 | - 172 | - See more details in Kubernetes documentation: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ 173 | * - resources.limit 174 | - dictionary 175 | - 176 | - See more details in Kubernetes documentation: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 177 | * - resources.requests 178 | - dictionary 179 | - 180 | - See more details in Kubernetes documentation: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 181 | * - execution_script 182 | - str 183 | - 184 | - Script that will be executed inside pod 185 | * - in_cluster 186 | - bool 187 | - 188 | - Run kubernetes client with in_cluster configuration 189 | * - cluster_context 190 | - str 191 | - 192 | - Context that points to kubernetes cluster, ignored when ``in_cluster`` is ``True``. If ``None``, current-context is used. 193 | * - startup_timeout_seconds 194 | - int 195 | - 196 | - Timeout in seconds to startup the pod. 197 | 198 | airbyte.yml file 199 | ~~~~~~~~~~~~~~~~~~~~~~~ 200 | .. list-table:: 201 | :widths: 25 20 2 13 40 202 | :header-rows: 1 203 | 204 | * - Parameter 205 | - Data type 206 | - Required 207 | - Default 208 | - Description 209 | * - airbyte_connection_id 210 | - string 211 | - x 212 | - 213 | - Connection id for Airbyte in Airflow instance. Remember to add this to Airflow's dependencies 214 | ``apache-airflow-providers-airbyte`` to be able to add such connection. 215 | * - tasks 216 | - list of objects 217 | - x 218 | - 219 | - Each task consist of fields 220 | 221 | **task_id**: string - name of the task which will be shown on airflow 222 | 223 | **connection_id**: string - id of Airbyte connection 224 | 225 | **asyncrounous**: boolean - Flag to get job_id after submitting the job to the Airbyte API. 226 | 227 | **api_version**: string - Airbyte API version 228 | 229 | **wait_seconds**: integer - Number of seconds between checks. 
198 | airbyte.yml file 199 | ~~~~~~~~~~~~~~~~~~~~~~~ 200 | .. list-table:: 201 | :widths: 25 20 2 13 40 202 | :header-rows: 1 203 | 204 | * - Parameter 205 | - Data type 206 | - Required 207 | - Default 208 | - Description 
209 | * - airbyte_connection_id 210 | - string 211 | - x 212 | - 213 | - Id of the Airbyte connection in the Airflow instance. Remember to add ``apache-airflow-providers-airbyte`` to Airflow's 214 | dependencies to be able to create such a connection. 
215 | * - tasks 216 | - list of objects 217 | - x 218 | - 219 | - Each task consists of the following fields: 220 | 
221 | **task_id**: string - name of the task as it will be shown in Airflow 222 | 223 | **connection_id**: string - id of the Airbyte connection 224 | 
225 | **asyncrounous**: boolean - flag to get the job_id after submitting the job to the Airbyte API 226 | 227 | **api_version**: string - Airbyte API version 228 | 
229 | **wait_seconds**: integer - number of seconds between checks; only used when ``asyncrounous`` is ``False`` 230 | 231 | **timeout**: float - the amount of time, in seconds, to wait for the request to complete 232 | 233 | 
234 | ingestion.yml file 235 | ~~~~~~~~~~~~~~~~~~~~~~~ 236 | .. list-table:: 237 | :widths: 25 20 2 13 40 238 | :header-rows: 1 239 | 240 | * - Parameter 241 | - Data type 242 | - Required 243 | - Default 244 | - Description 
245 | * - enable 246 | - boolean 247 | - x 248 | - 249 | - Boolean value specifying whether the ingestion tasks should be added to the Airflow DAG. 
250 | * - engine 251 | - string 252 | - x 253 | - 254 | - Enumeration-based option; currently the only supported value is ``airbyte``. 255 |
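For instance, enabling Airbyte ingestion takes a pair of files like the following sketch (based on the test configuration in ``tests/config/airbyte_dev``; the connection id is a placeholder):

.. code-block:: yaml

   # ingestion.yml
   enable: True
   engine: airbyte

   # airbyte.yml
   airbyte_connection_id: "airbyte_connection_id"
   tasks:
     - task_id: "postgres_ingestion"
       connection_id: "748ae2b6-b96d-4269-9550-d6ed57046182"
       asyncrounous: True
       api_version: "v1"
       wait_seconds: 3
       timeout: 110.0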
256 | Example files 257 | +++++++++++++++++++ 258 | 259 | It is best to look at the example configuration files in the 260 | `tests directory `_ to get a glimpse 261 | of correct configs. 262 | 
-------------------------------------------------------------------------------- /docs/features.rst: -------------------------------------------------------------------------------- 
1 | Features 2 | -------- 3 | 
4 | Building Airflow DAG 5 | ++++++++++++++++++++++++ 6 | **dbt-airflow-factory**'s main responsibility is parsing the DBT manifest file and creating a graph of Airflow tasks. 7 | For each model, the library creates a pair of tasks: run and test. The first one is responsible for the execution 8 | of transformations and the second one for testing (the ``dbt run`` and ``dbt test`` commands). 9 | 
10 | .. image:: images/dag.png 11 | :width: 400 12 | 
13 | Kubernetes as execution environment 14 | ++++++++++++++++++++++++++++++++++++++++ 15 | At the moment, the library supports Kubernetes as an execution environment. 16 | It is possible to use the same Kubernetes cluster that Airflow is deployed on, as well as an external one 17 | (e.g. in the case of a Celery deployment). 18 | 
19 | Bash Operator 20 | +++++++++++++++++++ 21 | For local testing, the library supports the BashOperator. 22 | 
23 | Tasks grouping 24 | +++++++++++++++++++ 25 | Using the tool with Airflow 2.0 makes it possible to group run and test tasks together. 26 | It is done with Airflow's TaskGroup functionality. 27 | 
28 | .. image:: images/grouped.png 29 | :width: 600 30 | 
31 | Hiding ephemeral models 32 | ++++++++++++++++++++++++ 33 | DBT introduces ephemeral models to reuse code, but they are not executed on their own. That's why 34 | it usually makes no sense to present them in Airflow. The library allows excluding them from rendering. 35 | 
36 | .. image:: images/ephemeral.png 37 | :width: 600 38 | 
39 | The above picture presents ephemeral tasks colored pink. Enabling hiding causes all of the pink rectangles to disappear. 40 | 
41 | Tests depending on multiple models 42 | ++++++++++++++++++++++++++++++++++++++++ 43 | Some tests written in the tests directory are executed on multiple models and don't 44 | have a clear connection to any single one of them. The library detects this automatically and creates separate Airflow 45 | tasks for these tests. 46 | 
47 | .. image:: images/tests.png 48 | :width: 600 49 | 
50 | Dynamically resolvable configuration 51 | ++++++++++++++++++++++++++++++++++++++++ 52 | Most of the configuration used by the library is resolved during execution. You can use `Airflow template variables `_ 53 | in your ``dbt.yml`` and ``k8s.yml`` files, as long as they are inside quotation marks: 54 | 
55 | .. code-block:: yaml 56 | 57 | target: "{{ var.value.env }}" 58 | some_other_field: "{{ ds_nodash }}" 59 | 
60 | Analogously, you can use ``"{{ var.value.VARIABLE_NAME }}"`` in ``airflow.yml``, but only the Airflow variable getter. 61 | Any other Airflow template variables will not work in ``airflow.yml``. 62 | 
63 | Notifications 64 | +++++++++++++++++++ 65 | It is possible to configure notifications in case of task failure. Currently, the only available channels are Slack and MS Teams.
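A handler is configured via the ``failure_handlers`` list in ``airflow.yml``; a minimal Slack handler, mirroring the test configuration shipped with the project (the connection id is an example), looks like this:

.. code-block:: yaml

   failure_handlers:
     - type: slack
       connection_id: slack_failure
       message_template: |
         :red_circle: Task Failed.
         *Task*: {task}
         *Dag*: {dag}
         *Execution Time*: {execution_time}
         *Log Url*: {url}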
66 | 67 | .. image:: images/slack_notification.png 68 | :width: 800 69 | 
70 | .. image:: images/msteams_notification.png 71 | :width: 800 72 | 
73 | Source dependencies 74 | +++++++++++++++++++ 75 | When the DBT project is divided into smaller pieces, or when it uses data produced by other teams, transformations usually need to wait for 76 | the data to be ready before they are executed. It is possible to use sources with metadata 77 | to create Airflow sensors for other DAGs. The sensors wait for these DAGs to finish before the dependent tasks 78 | are executed. 79 | 
80 | The picture below presents an upstream DAG created by Team A. Team A exposes data prepared by the ``report`` task. 81 | 
82 | .. image:: images/upstream.png 83 | :width: 600 84 | 
85 | The following picture presents a DAG that belongs to Team B, which uses data created by Team A and extends it with its internal products. 86 | 
87 | .. image:: images/downstream.png 88 | :width: 600 89 | 
90 | Adding ingestion layer 91 | ++++++++++++++++++++++++ 92 | **dbt-airflow-factory** makes it possible to run ingestion tasks before the dbt process, which helps with keeping data 93 | reliable and trustworthy. To turn this option on, you need to add two configuration files 94 | to the project: 95 | 
96 | - ingestion.yml 97 | .. code-block:: yaml 98 | 99 | enable: True 100 | engine: airbyte 101 | 
102 | - airbyte.yml 103 | .. code-block:: yaml 104 | 105 | airbyte_connection_id: "airbyte_connection_id" 106 | tasks: 107 | - task_id: "postgres_ingestion" 108 | connection_id: "748ae2b6-b96d-4269-9550-d6ed57046182" 109 | asyncrounous: True 110 | api_version: "v1" 111 | wait_seconds: 3 112 | timeout: 110.0 113 | 114 | 
115 | When the options are properly filled in, the ingestion tasks appear before the dbt data transformation step. 116 | 
117 | .. image:: images/ingestions_tasks.png 118 | :width: 600 119 | 
120 | Task Gateway 121 | +++++++++++++++++++ 122 | To prevent data inconsistency, dbt-airflow-factory supports creating a gateway between Airflow dbt tasks representing specific 123 | schemas. It might be useful when dividing ETL pipelines into 3 main components (or more): 124 | 
125 | - raw 126 | - staging 127 | - presentation 128 | 
129 | Here, the presentation layer represents the data used by external clients like BI tools, analysts or other processes. 130 | Creating a gateway between the staging and presentation layers helps with keeping the data in the presentation 131 | layer tested and reliable, so that BI tools or other analytics can safely connect to it. The example below shows such a gateway in a DAG. 132 | 133 | 
134 | .. image:: images/gateway.png 135 | :width: 600 136 | :align: center 137 | 
138 | To create such a gateway, all you need is to add the following config to the ``airflow.yml`` file: 139 | 140 | ..
code-block:: yaml 141 | 142 | save_points: 143 | - stg 144 | - presentation 145 | -------------------------------------------------------------------------------- /docs/images/dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/dag.png -------------------------------------------------------------------------------- /docs/images/downstream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/downstream.png -------------------------------------------------------------------------------- /docs/images/ephemeral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/ephemeral.png -------------------------------------------------------------------------------- /docs/images/gateway.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/gateway.png -------------------------------------------------------------------------------- /docs/images/grouped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/grouped.png -------------------------------------------------------------------------------- /docs/images/ingestions_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/ingestions_tasks.png -------------------------------------------------------------------------------- /docs/images/msteams_notification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/msteams_notification.png -------------------------------------------------------------------------------- /docs/images/slack_notification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/slack_notification.png -------------------------------------------------------------------------------- /docs/images/tests.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/tests.png -------------------------------------------------------------------------------- /docs/images/upstream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/docs/images/upstream.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ``DBT Airflow Factory`` 2 | ======================= 3 
| 4 | .. image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue.svg 5 | :target: https://github.com/getindata/dbt-airflow-factory 6 | :alt: Python Version 7 | 
8 | .. image:: https://badge.fury.io/py/dbt-airflow-factory.svg 9 | :target: https://pypi.org/project/dbt-airflow-factory/ 10 | :alt: PyPI Version 11 | 
12 | .. image:: https://pepy.tech/badge/dbt-airflow-factory 13 | :target: https://pepy.tech/project/dbt-airflow-factory 14 | :alt: Downloads 15 | 
16 | .. image:: https://api.codeclimate.com/v1/badges/47fd3570c858b6c166ad/maintainability 17 | :target: https://codeclimate.com/github/getindata/dbt-airflow-factory/maintainability 18 | :alt: Maintainability 19 | 
20 | .. image:: https://api.codeclimate.com/v1/badges/47fd3570c858b6c166ad/test_coverage 21 | :target: https://codeclimate.com/github/getindata/dbt-airflow-factory/test_coverage 22 | :alt: Test Coverage 23 | 
24 | Introduction 25 | ------------ 26 | The factory is a library for parsing DBT manifest files and building an Airflow DAG. 27 | 
28 | The library is expected to be used inside an Airflow environment with a Kubernetes image referencing **dbt**. 29 | 
30 | **dbt-airflow-factory**'s main task is to parse ``manifest.json`` and create an Airflow DAG out of it. It also reads config 31 | `YAML` files from the ``config`` directory and therefore is highly customizable (e.g., the user can set the path to ``manifest.json``). 32 | DAG building is an on-the-fly process without materialization. The process may also use Airflow Variables as a way of configuration. 33 | 
34 | Community 35 | ------------ 36 | Although the tool was created by GetInData and is used in their projects, it is open-sourced and everyone is welcome to use it and contribute to make it better and even more useful. 37 | 
38 | .. toctree:: 39 | :maxdepth: 1 40 | :caption: Contents: 41 | 42 | installation 43 | usage 44 | configuration 45 | cli 46 | features 47 | api 48 | changelog 49 | 
-------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 
1 | # Installation 2 | 3 | Use the package manager [pip](https://pip.pypa.io/en/stable/) to install the library: 4 | 5 | ```shell 6 | pip install dbt-airflow-factory 7 | ``` 8 | 
9 | ## Compatibility 10 | 11 | The library is compatible with Airflow >= 1.10. It can be used in any managed environment where extra Python packages can be added. It has been tested so far with `Cloud Composer` and `Amazon Managed Workflows for Apache Airflow`. 12 | 13 | 14 | 
-------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 
1 | sphinx-rtd-theme==1.0.0 2 | sphinx-click>=3.0,<3.1 3 | myst-parser>=0.16, <0.17 4 | docutils<0.17 5 | apache-airflow[kubernetes]>=2.2.0,<3.0.0 6 | 
-------------------------------------------------------------------------------- /docs/source/dbt_airflow_factory.ecs.rst: -------------------------------------------------------------------------------- 
1 | dbt\_airflow\_factory.ecs package 2 | ================================= 3 | 4 | .. automodule:: dbt_airflow_factory.ecs 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | dbt\_airflow\_factory.ecs.ecs\_operator module 13 | ---------------------------------------------- 14 | 15 | ..
automodule:: dbt_airflow_factory.ecs.ecs_operator 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | dbt\_airflow\_factory.ecs.ecs\_parameters module 21 | ------------------------------------------------ 22 | 23 | .. automodule:: dbt_airflow_factory.ecs.ecs_parameters 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | dbt\_airflow\_factory.ecs.ecs\_parameters\_loader module 29 | -------------------------------------------------------- 30 | 31 | .. automodule:: dbt_airflow_factory.ecs.ecs_parameters_loader 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | -------------------------------------------------------------------------------- /docs/source/dbt_airflow_factory.k8s.rst: -------------------------------------------------------------------------------- 1 | dbt\_airflow\_factory.k8s package 2 | ================================= 3 | 4 | .. automodule:: dbt_airflow_factory.k8s 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | dbt\_airflow\_factory.k8s.k8s\_operator module 13 | ---------------------------------------------- 14 | 15 | .. automodule:: dbt_airflow_factory.k8s.k8s_operator 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | dbt\_airflow\_factory.k8s.k8s\_parameters module 21 | ------------------------------------------------ 22 | 23 | .. automodule:: dbt_airflow_factory.k8s.k8s_parameters 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | dbt\_airflow\_factory.k8s.k8s\_parameters\_loader module 29 | -------------------------------------------------------- 30 | 31 | .. automodule:: dbt_airflow_factory.k8s.k8s_parameters_loader 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | -------------------------------------------------------------------------------- /docs/source/dbt_airflow_factory.rst: -------------------------------------------------------------------------------- 1 | dbt\_airflow\_factory package 2 | ============================= 3 | 4 | .. automodule:: dbt_airflow_factory 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | 15 | dbt_airflow_factory.ecs 16 | dbt_airflow_factory.k8s 17 | dbt_airflow_factory.notifications 18 | dbt_airflow_factory.tasks_builder 19 | 20 | Submodules 21 | ---------- 22 | 23 | dbt\_airflow\_factory.airflow\_dag\_factory module 24 | -------------------------------------------------- 25 | 26 | .. automodule:: dbt_airflow_factory.airflow_dag_factory 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dbt\_airflow\_factory.builder\_factory module 32 | --------------------------------------------- 33 | 34 | .. automodule:: dbt_airflow_factory.builder_factory 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | dbt\_airflow\_factory.config\_utils module 40 | ------------------------------------------ 41 | 42 | .. automodule:: dbt_airflow_factory.config_utils 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | dbt\_airflow\_factory.dbt\_parameters module 48 | -------------------------------------------- 49 | 50 | .. automodule:: dbt_airflow_factory.dbt_parameters 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | dbt\_airflow\_factory.ingestion module 56 | -------------------------------------- 57 | 58 | .. 
automodule:: dbt_airflow_factory.ingestion 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 
63 | dbt\_airflow\_factory.operator module 64 | ------------------------------------- 65 | 66 | .. automodule:: dbt_airflow_factory.operator 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 
71 | dbt\_airflow\_factory.tasks module 72 | ---------------------------------- 73 | 74 | .. automodule:: dbt_airflow_factory.tasks 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 
-------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 
1 | Usage 2 | ----- 3 | 4 | To start, create a directory with the following structure, where ``manifest.json`` is a file generated by **dbt**: 5 | 6 | .. code-block:: bash 7 | 8 | . 9 | ├── config 10 | │ ├── base 11 | │ │ ├── airflow.yml 12 | │ │ ├── dbt.yml 13 | │ │ └── k8s.yml 14 | │ └── dev 15 | │ └── dbt.yml 16 | ├── dag.py 17 | └── manifest.json 18 | 
19 | Then, put the following code into ``dag.py``: 20 | 21 | .. code-block:: python 22 | 23 | from dbt_airflow_factory.airflow_dag_factory import AirflowDagFactory 24 | from airflow.models import Variable 25 | from os import path 26 | 27 | dag = AirflowDagFactory(path.dirname(path.abspath(__file__)), Variable.get("env")).create() 28 | 
29 | For older versions of Airflow (before 2.0), the dag file needs to be slightly bigger: 30 | 31 | .. code-block:: python 32 | 33 | from airflow import DAG 34 | from pytimeparse import parse 35 | from os import path 36 | from airflow.models import Variable 37 | from dbt_airflow_factory.config_utils import read_config 38 | from dbt_airflow_factory.airflow_dag_factory import AirflowDagFactory 39 | 40 | dag_factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), Variable.get("env")) 41 | config = dag_factory.read_config() 42 | with DAG(default_args=config["default_args"], **config["dag"]) as dag: 43 | dag_factory.create_tasks(config) 44 | 45 | 
46 | When uploaded to the Airflow DAGs directory, it will get picked up by Airflow, parse ``manifest.json`` and prepare a DAG to run. 47 | 
48 | Configuration files 49 | +++++++++++++++++++ 50 | 51 | It is best to look at the example configuration files in the 52 | `tests directory `_ to get a glimpse 53 | of correct configs. 54 | 
55 | You can use `Airflow template variables `_ 56 | in your ``dbt.yml`` and ``k8s.yml`` files, as long as they are inside quotation marks: 57 | 58 | .. code-block:: yaml 59 | 60 | target: "{{ var.value.env }}" 61 | some_other_field: "{{ ds_nodash }}" 62 | 
63 | Analogously, you can use ``"{{ var.value.VARIABLE_NAME }}"`` in ``airflow.yml``, but only the Airflow variable getter. 64 | Any other Airflow template variables will not work in ``airflow.yml``. 65 | 66 | 
67 | Creation of the directory with data-pipelines-cli 68 | +++++++++++++++++++++++++++++++++++++++++++++++++ 69 | 70 | **DBT Airflow Factory** works best in tandem with the `data-pipelines-cli `_ 71 | tool. **dp** not only prepares the directory for the library to digest, but also automates Docker image building and pushes 72 | the generated directory to the cloud storage of your choice. 
73 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.35.0 3 | 4 | [bumpversion:file:setup.py] 5 | 6 | [bumpversion:file:dbt_airflow_factory/__init__.py] 7 | 8 | [flake8] 9 | exclude = .git,__pycache__,build,dist,docs/source/conf.py 10 | max-line-length = 120 11 | extend-ignore = E203 12 | 13 | [mypy] 14 | ignore_missing_imports = True 15 | follow_imports = silent 16 | strict_optional = True 17 | no_implicit_optional = True 18 | warn_redundant_casts = True 19 | warn_unused_ignores = True 20 | check_untyped_defs = True 21 | no_implicit_reexport = True 22 | disallow_untyped_defs = True 23 | 24 | [mypy-tests.*] 25 | ignore_errors = True 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """dbt_airflow_factory module.""" 2 | 3 | from setuptools import find_packages, setup 4 | 5 | with open("README.md") as f: 6 | README = f.read() 7 | 8 | # Runtime Requirements. 9 | INSTALL_REQUIRES = [ 10 | "pytimeparse>=1.1, <2", 11 | "dbt-graph-builder>=0.7.0, <0.8.0", 12 | "apache-airflow[kubernetes,slack]>=2.5, <3", 13 | "apache-airflow-providers-airbyte>=3.1, <4", 14 | ] 15 | 16 | # Dev Requirements 17 | EXTRA_REQUIRE = { 18 | "tests": [ 19 | "pytest>=6.2.2, <7.0.0", 20 | "pytest-cov>=2.8.0, <3.0.0", 21 | "tox==3.21.1", 22 | "pre-commit==2.9.3", 23 | "pandas>=1.2.5, <2.0.0", 24 | ], 25 | "docs": [ 26 | "sphinx==4.3.1", 27 | "sphinx-rtd-theme==1.0.0", 28 | "sphinx-click>=3.0,<3.1", 29 | "myst-parser>=0.16, <0.17", 30 | "docutils<0.17", 31 | ], 32 | } 33 | 34 | setup( 35 | name="dbt-airflow-factory", 36 | version="0.35.0", 37 | description="Library to convert DBT manifest metadata to Airflow tasks", 38 | long_description=README, 39 | long_description_content_type="text/markdown", 40 | license="Apache Software License (Apache 2.0)", 41 | python_requires=">=3", 42 | classifiers=[ 43 | "Development Status :: 3 - Alpha", 44 | "Programming Language :: Python :: 3.8", 45 | "Programming Language :: Python :: 3.9", 46 | "Programming Language :: Python :: 3.10", 47 | "Programming Language :: Python :: 3.11", 48 | ], 49 | keywords="dbt airflow manifest parser python", 50 | author="Piotr Pekala", 51 | author_email="piotr.pekala@getindata.com", 52 | url="https://github.com/getindata/dbt-airflow-factory/", 53 | packages=find_packages(exclude=["ez_setup", "examples", "tests", "docs"]), 54 | include_package_data=True, 55 | zip_safe=False, 56 | install_requires=INSTALL_REQUIRES, 57 | extras_require=EXTRA_REQUIRE, 58 | ) 59 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/dbt-airflow-factory/903139045e373076fc720975a4961012c6d6c597/tests/__init__.py -------------------------------------------------------------------------------- /tests/config/airbyte_dev/airbyte.yml: -------------------------------------------------------------------------------- 1 | airbyte_connection_id: "airbyte_connection_id" 2 | tasks: 3 | - task_id: "postgres_ingestion" 
4 | connection_id: "748ae2b6-b96d-4269-9550-d6ed57046182" 5 | asyncrounous: True 6 | api_version: "v1" 7 | wait_seconds: 3 8 | timeout: 110.0 9 | - task_id: "mysql_ingestion" 10 | connection_id: "fb016b35-535d-4571-8a67-3eab0a93373c" 11 | asyncrounous: False 12 | api_version: "v2" 13 | wait_seconds: 8 14 | timeout: 130.0 15 | - task_id: "sales_force_ingestion" 16 | connection_id: "8247b701-3357-4442-90bc-27a4dcd10b64" 17 | asyncrounous: True 18 | api_version: "v1" 19 | wait_seconds: 5 20 | timeout: 120.0 -------------------------------------------------------------------------------- /tests/config/airbyte_dev/airflow_seed_disabled.yml: -------------------------------------------------------------------------------- 1 | 2 | default_args: 3 | owner: Piotr Pekala 4 | email: 5 | - "test@getindata.com" 6 | depends_on_past: False 7 | start_date: 2021-10-20T00:00:00 8 | email_on_failure: False 9 | email_on_retry: False 10 | retries: 0 11 | retry_delay: 5m 12 | 13 | dag: 14 | dag_id: dbt-platform-poc 15 | description: 'Experimental snadbox data platform DAG' 16 | schedule_interval: '0 12 * * *' 17 | catchup: False 18 | 19 | seed_task: False 20 | manifest_file_name: ../tests/manifest.json 21 | use_task_group: True 22 | dags_path: "gs://example-bucket/dags/experimental-sandbox" 23 | show_ephemeral_models: True -------------------------------------------------------------------------------- /tests/config/airbyte_dev/airflow_seed_enabled.yml: -------------------------------------------------------------------------------- 1 | 2 | default_args: 3 | owner: Piotr Pekala 4 | email: 5 | - "test@getindata.com" 6 | depends_on_past: False 7 | start_date: 2021-10-20T00:00:00 8 | email_on_failure: False 9 | email_on_retry: False 10 | retries: 0 11 | retry_delay: 5m 12 | 13 | dag: 14 | dag_id: dbt-platform-poc 15 | description: 'Experimental snadbox data platform DAG' 16 | schedule_interval: '0 12 * * *' 17 | catchup: False 18 | 19 | seed_task: True 20 | manifest_file_name: ../tests/manifest.json 21 | use_task_group: True 22 | dags_path: "gs://example-bucket/dags/experimental-sandbox" 23 | show_ephemeral_models: True -------------------------------------------------------------------------------- /tests/config/airbyte_dev/ingestion_disabled.yml: -------------------------------------------------------------------------------- 1 | enable: False 2 | engine: airbyte -------------------------------------------------------------------------------- /tests/config/airbyte_dev/ingestion_enabled.yml: -------------------------------------------------------------------------------- 1 | enable: True 2 | engine: airbyte -------------------------------------------------------------------------------- /tests/config/airflow_vars/airflow.yml: -------------------------------------------------------------------------------- 1 | dags_path: "{{ var.value.dags_path }}" 2 | 3 | default_args: 4 | owner: "{{ var.value.dags_owner }}" 5 | email: 6 | - "{{ var.value.email_owner }}" 7 | depends_on_past: False 8 | start_date: 2021-10-20T00:00:00 9 | email_on_failure: False 10 | email_on_retry: False 11 | retries: 0 12 | retry_delay: 5m 13 | -------------------------------------------------------------------------------- /tests/config/base/airflow.yml: -------------------------------------------------------------------------------- 1 | 2 | default_args: 3 | owner: Piotr Pekala 4 | email: 5 | - "test@getindata.com" 6 | depends_on_past: False 7 | start_date: 2021-10-20T00:00:00 8 | email_on_failure: False 9 | email_on_retry: False 10 | retries: 0 
11 | retry_delay: 5m 12 | 13 | dag: 14 | dag_id: dbt-platform-poc 15 | description: 'Experimental snadbox data platform DAG' 16 | schedule_interval: '0 12 * * *' 17 | catchup: False 18 | 19 | seed_task: True 20 | manifest_file_name: ../tests/manifest.json 21 | use_task_group: True 22 | dags_path: "gs://example-bucket/dags/experimental-sandbox" 23 | show_ephemeral_models: True -------------------------------------------------------------------------------- /tests/config/base/dbt.yml: -------------------------------------------------------------------------------- 1 | target: local 2 | target_type: bigquery 3 | project_dir_path: /dbt 4 | profile_dir_path: /root/.dbt 5 | -------------------------------------------------------------------------------- /tests/config/base/execution_env.yml: -------------------------------------------------------------------------------- 1 | 2 | image: 3 | repository: 123.gcr/dbt-platform-poc 4 | tag: 123 5 | 6 | type: k8s -------------------------------------------------------------------------------- /tests/config/base/k8s.yml: -------------------------------------------------------------------------------- 1 | 2 | image_pull_policy: IfNotPresent 3 | namespace: apache-airflow 4 | 5 | envs: 6 | EXAMPLE_ENV: "example" 7 | SECOND_EXAMPLE_ENV: "second" 8 | 9 | labels: 10 | runner: airflow 11 | 12 | annotations: 13 | iam.amazonaws.com/role: "k8s-airflow" 14 | 15 | is_delete_operator_pod: True 16 | 17 | resources: 18 | node_selectors: 19 | group: data-processing 20 | tolerations: 21 | - key: group 22 | operator: Equal 23 | value: data-processing 24 | effect: NoSchedule 25 | limit: 26 | memory: 2048M 27 | cpu: '2' 28 | requests: 29 | memory: 1024M 30 | cpu: '1' 31 | 32 | config_file: '/usr/local/airflow/dags/kube_config.yaml' 33 | -------------------------------------------------------------------------------- /tests/config/dev/dbt.yml: -------------------------------------------------------------------------------- 1 | target: dev -------------------------------------------------------------------------------- /tests/config/ephemeral_operator/airflow.yml: -------------------------------------------------------------------------------- 1 | manifest_file_name: ../tests/manifest_ephemeral.json 2 | -------------------------------------------------------------------------------- /tests/config/gateway/airflow.yml: -------------------------------------------------------------------------------- 1 | 2 | default_args: 3 | owner: Piotr Pekala 4 | email: 5 | - "test@getindata.com" 6 | depends_on_past: False 7 | start_date: 2021-10-20T00:00:00 8 | email_on_failure: False 9 | email_on_retry: False 10 | retries: 0 11 | retry_delay: 5m 12 | 13 | 14 | dag: 15 | dag_id: dbt-platform-poc 16 | description: 'Experimental snadbox data platform DAG' 17 | schedule_interval: '0 12 * * *' 18 | catchup: False 19 | 20 | seed_task: True 21 | manifest_file_name: ../tests/manifest_gateway.json 22 | use_task_group: True 23 | dags_path: "gs://example-bucket/dags/experimental-sandbox" 24 | show_ephemeral_models: True 25 | save_points: 26 | - "datalab_stg" 27 | - "datalab" -------------------------------------------------------------------------------- /tests/config/gateway_source/airflow.yml: -------------------------------------------------------------------------------- 1 | 2 | default_args: 3 | owner: Piotr Pekala 4 | email: 5 | - "test@getindata.com" 6 | depends_on_past: False 7 | start_date: 2021-10-20T00:00:00 8 | email_on_failure: False 9 | email_on_retry: False 10 | retries: 0 11 | 
retry_delay: 5m 12 | 13 | 14 | dag: 15 | dag_id: dbt-platform-poc 16 | description: 'Experimental snadbox data platform DAG' 17 | schedule_interval: '0 12 * * *' 18 | catchup: False 19 | 20 | seed_task: True 21 | manifest_file_name: ../tests/manifest_gateway_source.json 22 | use_task_group: True 23 | dags_path: "gs://example-bucket/dags/experimental-sandbox" 24 | show_ephemeral_models: True 25 | save_points: 26 | - "stage_schema_stage" 27 | - "stage_schema_data_mart" -------------------------------------------------------------------------------- /tests/config/no_ephemeral_operator/airflow.yml: -------------------------------------------------------------------------------- 1 | manifest_file_name: ../tests/manifest_ephemeral.json 2 | show_ephemeral_models: False -------------------------------------------------------------------------------- /tests/config/no_gateway/airflow.yml: -------------------------------------------------------------------------------- 1 | 2 | default_args: 3 | owner: Piotr Pekala 4 | email: 5 | - "test@getindata.com" 6 | depends_on_past: False 7 | start_date: 2021-10-20T00:00:00 8 | email_on_failure: False 9 | email_on_retry: False 10 | retries: 0 11 | retry_delay: 5m 12 | 13 | 14 | dag: 15 | dag_id: dbt-platform-poc 16 | description: 'Experimental snadbox data platform DAG' 17 | schedule_interval: '0 12 * * *' 18 | catchup: False 19 | 20 | seed_task: True 21 | manifest_file_name: ../tests/manifest.json 22 | use_task_group: True 23 | dags_path: "gs://example-bucket/dags/experimental-sandbox" 24 | show_ephemeral_models: True -------------------------------------------------------------------------------- /tests/config/no_task_group/airflow.yml: -------------------------------------------------------------------------------- 1 | use_task_group: False 2 | manifest_file_name: ../tests/manifest_task_group_tests.json 3 | -------------------------------------------------------------------------------- /tests/config/notifications_slack/airflow.yml: -------------------------------------------------------------------------------- 1 | 2 | failure_handlers: 3 | - type: slack 4 | connection_id: slack_failure 5 | message_template: | 6 | :red_circle: Task Failed. 7 | *Task*: {task} 8 | *Dag*: {dag} 9 | *Execution Time*: {execution_time} 10 | *Log Url*: {url} 11 | -------------------------------------------------------------------------------- /tests/config/notifications_teams/airflow.yml: -------------------------------------------------------------------------------- 1 | 2 | failure_handlers: 3 | - type: teams 4 | webserver_url: https://your.airflow-webserver.url 5 | connection_id: teams_failure 6 | message_template: | 7 | 🔴 **Task Failed**

8 | **Task**: {task}
9 | **Dag**: {dag}
10 | **Execution Time**: {execution_time}
11 | **Log Url**: {url} -------------------------------------------------------------------------------- /tests/config/qa/datahub.yml: -------------------------------------------------------------------------------- 1 | sink: 2 | config: 3 | server: "http://test_url:8080" 4 | -------------------------------------------------------------------------------- /tests/config/qa/dbt.yml: -------------------------------------------------------------------------------- 1 | target: qa -------------------------------------------------------------------------------- /tests/config/qa/execution_env.yml: -------------------------------------------------------------------------------- 1 | 2 | image: 3 | repository: 123.gcr/dbt-platform-poc 4 | tag: 123 5 | 6 | type: k8s 7 | execution_script: "./executor_with_test_reports_ingestions.sh" -------------------------------------------------------------------------------- /tests/config/qa/k8s.yml: -------------------------------------------------------------------------------- 1 | secrets: 2 | - secret: snowflake-access-user-key 3 | deploy_type: env 4 | deploy_target: test 5 | - secret: snowflake-access-user-key 6 | deploy_type: volume 7 | deploy_target: /var 8 | 9 | in_cluster: False 10 | cluster_context: test 11 | startup_timeout_seconds: 600 12 | -------------------------------------------------------------------------------- /tests/config/task_group/airflow.yml: -------------------------------------------------------------------------------- 1 | use_task_group: True 2 | manifest_file_name: ../tests/manifest_task_group_tests.json 3 | -------------------------------------------------------------------------------- /tests/config/vars/dbt.yml: -------------------------------------------------------------------------------- 1 | target: vars 2 | vars: 3 | variable_1: 123 4 | variable_2: "var2" 5 | -------------------------------------------------------------------------------- /tests/manifest_ephemeral.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "model.dbt_test.model1": { 4 | "name": "model1", 5 | "depends_on": { 6 | "nodes": [ 7 | "source.dbt_test.source1" 8 | ] 9 | }, 10 | "config": { 11 | "materialized": "table" 12 | } 13 | }, 14 | "model.dbt_test.model2": { 15 | "name": "model2", 16 | "depends_on": { 17 | "nodes": [ 18 | "model.dbt_test.model1" 19 | ] 20 | }, 21 | "config": { 22 | "materialized": "ephemeral" 23 | } 24 | }, 25 | "model.dbt_test.model3": { 26 | "name": "model3", 27 | "depends_on": { 28 | "nodes": [ 29 | "model.dbt_test.model2", 30 | "model.dbt_test.model5" 31 | ] 32 | }, 33 | "config": { 34 | "materialized": "ephemeral" 35 | } 36 | }, 37 | "model.dbt_test.model4": { 38 | "name": "model4", 39 | "depends_on": { 40 | "nodes": [ 41 | "model.dbt_test.model10" 42 | ] 43 | }, 44 | "config": { 45 | "materialized": "view" 46 | } 47 | }, 48 | "model.dbt_test.model5": { 49 | "name": "model5", 50 | "depends_on": { 51 | "nodes": [ 52 | "source.dbt_test.source2" 53 | ] 54 | }, 55 | "config": { 56 | "materialized": "ephemeral" 57 | } 58 | }, 59 | "model.dbt_test.model6": { 60 | "name": "model6", 61 | "depends_on": { 62 | "nodes": [ 63 | "source.dbt_test.source3" 64 | ] 65 | }, 66 | "config": { 67 | "materialized": "table" 68 | } 69 | }, 70 | "model.dbt_test.model7": { 71 | "name": "model7", 72 | "depends_on": { 73 | "nodes": [ 74 | "model.dbt_test.model6" 75 | ] 76 | }, 77 | "config": { 78 | "materialized": "ephemeral" 79 | } 80 | }, 81 | "model.dbt_test.model8": { 82 | "name": "model8", 83 | 
"depends_on": { 84 | "nodes": [ 85 | "model.dbt_test.model6" 86 | ] 87 | }, 88 | "config": { 89 | "materialized": "ephemeral" 90 | } 91 | }, 92 | "model.dbt_test.model9": { 93 | "name": "model9", 94 | "depends_on": { 95 | "nodes": [ 96 | "model.dbt_test.model7", 97 | "model.dbt_test.model8" 98 | ] 99 | }, 100 | "config": { 101 | "materialized": "ephemeral" 102 | } 103 | }, 104 | "model.dbt_test.model10": { 105 | "name": "model10", 106 | "depends_on": { 107 | "nodes": [ 108 | "model.dbt_test.model3", 109 | "model.dbt_test.model9" 110 | ] 111 | }, 112 | "config": { 113 | "materialized": "ephemeral" 114 | } 115 | }, 116 | "model.dbt_test.model11": { 117 | "name": "model11", 118 | "depends_on": { 119 | "nodes": [ 120 | "model.dbt_test.model10" 121 | ] 122 | }, 123 | "config": { 124 | "materialized": "ephemeral" 125 | } 126 | } 127 | }, 128 | "sources": { 129 | "source.dbt_test.source1": {}, 130 | "source.dbt_test.source2": {}, 131 | "source.dbt_test.source3": {} 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /tests/manifest_task_group_tests.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "model.dbt_test.model1": { 4 | "name": "model1", 5 | "depends_on": { 6 | "nodes": [] 7 | }, 8 | "config": { 9 | "materialized": "view" 10 | } 11 | }, 12 | "model.dbt_test.model2": { 13 | "name": "model2", 14 | "depends_on": { 15 | "nodes": [ 16 | "model.dbt_test.model1" 17 | ] 18 | }, 19 | "config": { 20 | "materialized": "view" 21 | } 22 | }, 23 | "model.dbt_test.model3": { 24 | "name": "model3", 25 | "depends_on": { 26 | "nodes": [ 27 | "model.dbt_test.model1" 28 | ] 29 | }, 30 | "config": { 31 | "materialized": "view" 32 | } 33 | }, 34 | "model.dbt_test.model4": { 35 | "name": "model4", 36 | "depends_on": { 37 | "nodes": [ 38 | "model.dbt_test.model2", 39 | "model.dbt_test.model3" 40 | ] 41 | }, 42 | "config": { 43 | "materialized": "view" 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /tests/teams_webhook_expected_paylaod.json: -------------------------------------------------------------------------------- 1 | { 2 | "@type": "MessageCard", 3 | "@context": "http://schema.org/extensions", 4 | "themeColor": "FF0000", 5 | "summary": "🔴**TaskFailed**

**Task**:task_id
**Dag**:dag_id
**ExecutionTime**:somedate
**LogUrl**:https://your.airflow-webserver.url/log?dag_id=dag_id&task_id=task_id&execution_date=ts", 6 | "sections": [ 7 | { 8 | "activityTitle": "🔴**TaskFailed**

**Task**:task_id
**Dag**:dag_id
**ExecutionTime**:somedate
**LogUrl**:https://your.airflow-webserver.url/log?dag_id=dag_id&task_id=task_id&execution_date=ts", 9 | "activitySubtitle": "", 10 | "markdown": true, 11 | "potentialAction": [ 12 | { 13 | "@type": "OpenUri", 14 | "name": "Viewlog", 15 | "targets": [ 16 | { 17 | "os": "default", 18 | "uri": "https://your.airflow-webserver.url/log?dag_id=dag_id&task_id=task_id&execution_date=ts" 19 | } 20 | ] 21 | } 22 | ] 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /tests/test_config_propagation.py: -------------------------------------------------------------------------------- 1 | from dbt_airflow_factory.constants import ( 2 | IS_AIRFLOW_NEWER_THAN_2_4, 3 | IS_FIRST_AIRFLOW_VERSION, 4 | ) 5 | 6 | from .utils import builder_factory, manifest_file_with_models, test_dag 7 | 8 | 9 | def test_configuration(): 10 | # given 11 | manifest_path = manifest_file_with_models({"model.dbt_test.dim_users": []}) 12 | 13 | # when 14 | with test_dag(): 15 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 16 | 17 | # then 18 | run_task = tasks.get_task("model.dbt_test.dim_users").execution_airflow_task 19 | assert run_task.namespace == "apache-airflow" 20 | assert run_task.image == "123.gcr/dbt-platform-poc:123" 21 | if IS_FIRST_AIRFLOW_VERSION: 22 | assert run_task.node_selectors == {"group": "data-processing"} 23 | assert run_task.tolerations[0]["key"] == "group" 24 | assert run_task.tolerations[0]["operator"] == "Equal" 25 | assert run_task.tolerations[0]["value"] == "data-processing" 26 | assert run_task.tolerations[0]["effect"] == "NoSchedule" 27 | assert run_task.resources[0].limit_memory == "2048M" 28 | assert run_task.resources[0].limit_cpu == "2" 29 | assert run_task.resources[0].request_memory == "1024M" 30 | assert run_task.resources[0].request_cpu == "1" 31 | else: 32 | assert run_task.node_selector == {"group": "data-processing"} 33 | assert run_task.tolerations[0].key == "group" 34 | assert run_task.tolerations[0].operator == "Equal" 35 | assert run_task.tolerations[0].value == "data-processing" 36 | assert run_task.tolerations[0].effect == "NoSchedule" 37 | if IS_AIRFLOW_NEWER_THAN_2_4: 38 | assert run_task.container_resources.limits == {"memory": "2048M", "cpu": "2"} 39 | assert run_task.container_resources.requests == {"memory": "1024M", "cpu": "1"} 40 | else: 41 | assert run_task.k8s_resources.limits == {"memory": "2048M", "cpu": "2"} 42 | assert run_task.k8s_resources.requests == {"memory": "1024M", "cpu": "1"} 43 | 44 | assert run_task.startup_timeout_seconds == 120 45 | 46 | assert run_task.labels == {"runner": "airflow"} 47 | assert run_task.env_vars[0].to_dict() == { 48 | "name": "EXAMPLE_ENV", 49 | "value": "example", 50 | "value_from": None, 51 | } 52 | assert run_task.env_vars[1].to_dict() == { 53 | "name": "SECOND_EXAMPLE_ENV", 54 | "value": "second", 55 | "value_from": None, 56 | } 57 | assert run_task.in_cluster is None 58 | assert run_task.cluster_context is None 59 | assert run_task.config_file == "/usr/local/airflow/dags/kube_config.yaml" 60 | assert run_task.is_delete_operator_pod 61 | assert "--project-dir /dbt" in run_task.arguments[0] 62 | assert "--profiles-dir /root/.dbt" in run_task.arguments[0] 63 | assert "--target dev" in run_task.arguments[0] 64 | -------------------------------------------------------------------------------- /tests/test_config_propagation_qa.py: -------------------------------------------------------------------------------- 1 | from airflow.kubernetes.secret import Secret 2 | 
3 | from .utils import builder_factory, manifest_file_with_models, test_dag 4 | 5 | 6 | def test_configuration_with_qa_config(): 7 | # given 8 | manifest_path = manifest_file_with_models({"model.dbt_test.dim_users": []}) 9 | 10 | # when 11 | with test_dag(): 12 | tasks = builder_factory(env="qa").create().parse_manifest_into_tasks(manifest_path) 13 | 14 | # then 15 | run_task = tasks.get_task("model.dbt_test.dim_users").execution_airflow_task 16 | assert run_task.env_vars[2].to_dict() == { 17 | "name": "DATAHUB_GMS_URL", 18 | "value": "http://test_url:8080", 19 | "value_from": None, 20 | } 21 | assert "./executor_with_test_reports_ingestions.sh" in run_task.arguments[0] 22 | assert run_task.secrets == [ 23 | Secret("env", "test", "snowflake-access-user-key", None), 24 | Secret("volume", "/var", "snowflake-access-user-key", None), 25 | ] 26 | assert run_task.in_cluster is False 27 | assert run_task.cluster_context == "test" 28 | assert run_task.startup_timeout_seconds == 600 29 | -------------------------------------------------------------------------------- /tests/test_dag_dependencies.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | builder_factory, 3 | manifest_file_with_models, 4 | task_group_prefix_builder, 5 | test_dag, 6 | ) 7 | 8 | extra_metadata_data = { 9 | "child_map": { 10 | "source.upstream_pipeline_sources.upstream_pipeline.some_final_model": [ 11 | "model.dbt_test.dependent_model" 12 | ], 13 | "source.upstream_pipeline_sources.upstream_pipeline.unused": [], 14 | }, 15 | "sources": { 16 | "source.upstream_pipeline_sources.upstream_pipeline.some_final_model": { 17 | "database": "gid-dataops-labs", 18 | "schema": "presentation", 19 | "name": "some_final_model", 20 | "unique_id": "source.upstream_pipeline_sources.upstream_pipeline.some_final_model", 21 | "source_meta": {"dag": "dbt-tpch-test"}, 22 | }, 23 | "source.upstream_pipeline_sources.upstream_pipeline.unused": { 24 | "database": "gid-dataops-labs", 25 | "schema": "presentation", 26 | "name": "unused", 27 | "unique_id": "source.upstream_pipeline_sources.upstream_pipeline.unused", 28 | "source_meta": {"dag": "dbt-tpch-test"}, 29 | }, 30 | "source.upstream_pipeline_sources.upstream_pipeline.no_dag": { 31 | "database": "gid-dataops-labs", 32 | "schema": "presentation", 33 | "name": "no_dag", 34 | "unique_id": "source.upstream_pipeline_sources.upstream_pipeline.no_dag", 35 | "source_meta": {}, 36 | }, 37 | }, 38 | } 39 | 40 | 41 | def test_dag_sensor(): 42 | # given 43 | manifest_path = manifest_file_with_models( 44 | { 45 | "model.dbt_test.dependent_model": [ 46 | "source.upstream_pipeline_sources.upstream_pipeline.some_final_model" 47 | ] 48 | }, 49 | extra_metadata_data, 50 | ) 51 | 52 | # when 53 | with test_dag(): 54 | tasks = ( 55 | builder_factory(enable_project_dependencies=True) 56 | .create() 57 | .parse_manifest_into_tasks(manifest_path) 58 | ) 59 | 60 | # then 61 | sensor_task = tasks.get_task( 62 | "source.upstream_pipeline_sources.upstream_pipeline.some_final_model" 63 | ) 64 | assert tasks.length() == 2 65 | assert sensor_task is not None 66 | assert sensor_task.execution_airflow_task is not None 67 | assert sensor_task.test_airflow_task is None 68 | assert sensor_task.execution_airflow_task.task_id == "sensor_some_final_model" 69 | 70 | 71 | def test_dag_sensor_dependency(): 72 | # given 73 | manifest_path = manifest_file_with_models( 74 | { 75 | "model.dbt_test.dependent_model": [ 76 | 
"source.upstream_pipeline_sources.upstream_pipeline.some_final_model" 77 | ] 78 | }, 79 | extra_metadata_data, 80 | ) 81 | 82 | # when 83 | with test_dag(): 84 | tasks = ( 85 | builder_factory(enable_project_dependencies=True) 86 | .create() 87 | .parse_manifest_into_tasks(manifest_path) 88 | ) 89 | 90 | # then 91 | assert ( 92 | "sensor_some_final_model" 93 | in tasks.get_task("model.dbt_test.dependent_model").execution_airflow_task.upstream_task_ids 94 | ) 95 | assert ( 96 | task_group_prefix_builder("dependent_model", "run") 97 | in tasks.get_task( 98 | "source.upstream_pipeline_sources.upstream_pipeline.some_final_model" 99 | ).execution_airflow_task.downstream_task_ids 100 | ) 101 | 102 | 103 | def test_dag_sensor_no_meta(): 104 | # given 105 | manifest_path = manifest_file_with_models( 106 | { 107 | "model.dbt_test.dependent_model": [ 108 | "source.upstream_pipeline_sources.upstream_pipeline.some_final_model", 109 | "source.upstream_pipeline_sources.upstream_pipeline.no_dag", 110 | ] 111 | }, 112 | extra_metadata_data, 113 | ) 114 | 115 | # when 116 | with test_dag(): 117 | tasks = ( 118 | builder_factory(enable_project_dependencies=True) 119 | .create() 120 | .parse_manifest_into_tasks(manifest_path) 121 | ) 122 | 123 | # then 124 | assert tasks.length() == 2 125 | -------------------------------------------------------------------------------- /tests/test_dag_factory.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from os import path 3 | from typing import Set 4 | 5 | import pytest 6 | 7 | from dbt_airflow_factory.airflow_dag_factory import AirflowDagFactory 8 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 9 | 10 | 11 | def test_dag_factory(): 12 | # given 13 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "dev") 14 | 15 | # when 16 | dag = factory.create() 17 | 18 | # then 19 | assert dag.dag_id == "dbt-platform-poc" 20 | assert dag.description == "Experimental snadbox data platform DAG" 21 | assert dag.schedule_interval == "0 12 * * *" 22 | assert not dag.catchup 23 | assert dag.default_args == { 24 | "owner": "Piotr Pekala", 25 | "email": ["test@getindata.com"], 26 | "depends_on_past": False, 27 | "start_date": datetime(2021, 10, 20, 0, 0, 0, tzinfo=timezone.utc), 28 | "email_on_failure": False, 29 | "email_on_retry": False, 30 | "retries": 0, 31 | "retry_delay": 300, 32 | } 33 | assert len(dag.tasks) == 4 34 | 35 | 36 | def test_task_group_dag_factory(): 37 | if IS_FIRST_AIRFLOW_VERSION: # You cannot use TaskGroup in Airflow 1 anyway 38 | return True 39 | 40 | # given 41 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "task_group") 42 | 43 | # when 44 | dag = factory.create() 45 | 46 | # then 47 | assert len(dag.tasks) == 10 48 | assert len(dag.task_group.children) == 6 49 | 50 | 51 | def test_no_task_group_dag_factory(): 52 | if IS_FIRST_AIRFLOW_VERSION: # You cannot use TaskGroup in Airflow 1 anyway 53 | return True 54 | 55 | # given 56 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "no_task_group") 57 | 58 | # when 59 | dag = factory.create() 60 | 61 | # then 62 | assert len(dag.tasks) == 10 63 | assert len(dag.task_group.children) == 10 64 | 65 | 66 | def test_gateway_dag_factory(): 67 | # given 68 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "gateway") 69 | 70 | # when 71 | dag = factory.create() 72 | 73 | # then save points should be as passed in the config file 74 | assert 
dag.tasks.__len__() == 15 75 | assert factory.airflow_config["save_points"] == ["datalab_stg", "datalab"] 76 | 77 | 78 | def test_should_not_fail_when_savepoint_property_wasnt_passed(): 79 | # given 80 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "no_gateway") 81 | 82 | # when 83 | dag = factory.create() 84 | 85 | # then save_points_property should be empty 86 | assert factory.airflow_config.get("save_points", []).__len__() == 0 87 | 88 | # and number of tasks should match 89 | assert dag.tasks.__len__() == 4 90 | 91 | 92 | def test_should_properly_map_tasks(): 93 | # given 94 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "gateway") 95 | 96 | # when 97 | dag = factory.create() 98 | 99 | # then save_points_property should be empty 100 | save_points = factory.airflow_config.get("save_points") 101 | assert save_points.__len__() == 2 102 | 103 | # and number of tasks should be as expected 104 | assert dag.tasks.__len__() == 15 105 | 106 | # and tasks should be correctly matched to themselves 107 | gateway_task = [ 108 | task for task in dag.tasks if task.task_id == f"{save_points[0]}_{save_points[1]}_gateway" 109 | ][0] 110 | 111 | assert gateway_task.downstream_task_ids == {"user.run", "shop.run", "payment.run"} 112 | 113 | assert gateway_task.upstream_task_ids == {"stg_payment.test", "stg_shop.test", "stg_user.test"} 114 | 115 | 116 | def test_should_properly_map_tasks_with_source(): 117 | # given 118 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "gateway_source") 119 | 120 | # when 121 | dag = factory.create() 122 | 123 | # then save_points_property should be empty 124 | save_points = factory.airflow_config.get("save_points") 125 | assert save_points.__len__() == 2 126 | 127 | # and number of tasks should be as expected 128 | assert dag.tasks.__len__() == 7 129 | 130 | # and tasks should be correctly matched to themselves 131 | gateway_task = [ 132 | task for task in dag.tasks if task.task_id == f"{save_points[0]}_{save_points[1]}_gateway" 133 | ][0] 134 | 135 | assert gateway_task.downstream_task_ids == {"my_second_dbt_model.run"} 136 | 137 | assert gateway_task.upstream_task_ids == {"my_first_dbt_model.test"} 138 | 139 | 140 | @pytest.mark.parametrize( 141 | "test_name,ingestion_enabled,seed_available,expected_start_task_deps", 142 | [ 143 | ( 144 | "should return no ingestion task when ingestion is not enabled - seed enabled", 145 | False, 146 | True, 147 | set(), 148 | ), 149 | ( 150 | "should return no ingestion task when ingestion is not enabled - seed disabled", 151 | False, 152 | False, 153 | set(), 154 | ), 155 | ( 156 | "should return ingestion tasks when ingestion is enabled - seed disabled", 157 | True, 158 | False, 159 | {"postgres_ingestion", "mysql_ingestion", "sales_force_ingestion"}, 160 | ), 161 | ( 162 | "should return ingestion tasks when ingestion is enabled - seed enabled", 163 | True, 164 | True, 165 | {"postgres_ingestion", "mysql_ingestion", "sales_force_ingestion"}, 166 | ), 167 | ], 168 | ) 169 | def test_should_add_airbyte_tasks_when_seed_is_not_available( 170 | test_name: str, 171 | ingestion_enabled: bool, 172 | seed_available: bool, 173 | expected_start_task_deps: Set[str], 174 | ): 175 | # given configuration for airbyte_dev 176 | factory = AirflowDagFactory( 177 | path.dirname(path.abspath(__file__)), 178 | "airbyte_dev", 179 | airflow_config_file_name=f"airflow_seed_{boolean_mapper[seed_available]}.yml", 180 | ingestion_config_file_name=f"ingestion_{boolean_mapper[ingestion_enabled]}.yml", 
181 | ) 182 | 183 | # when creating factory 184 | dag = factory.create() 185 | 186 | # airbyte ingestion tasks should be added to dummy task 187 | start_task_name = [ 188 | task for task in dag.tasks if task.task_id == starting_task_mapper[seed_available] 189 | ][0] 190 | 191 | assert start_task_name.upstream_task_ids == expected_start_task_deps 192 | 193 | 194 | boolean_mapper = {True: "enabled", False: "disabled"} 195 | 196 | starting_task_mapper = {True: "dbt_seed", False: "start"} 197 | -------------------------------------------------------------------------------- /tests/test_dependencies.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .utils import ( 4 | builder_factory, 5 | manifest_file_with_models, 6 | task_group_prefix_builder, 7 | test_dag, 8 | ) 9 | 10 | 11 | def test_run_test_dependency(): 12 | # given 13 | manifest_path = manifest_file_with_models({"model.dbt_test.model1": []}) 14 | 15 | # when 16 | with test_dag(): 17 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 18 | 19 | # then 20 | assert ( 21 | task_group_prefix_builder("model1", "test") 22 | in tasks.get_task("model.dbt_test.model1").execution_airflow_task.downstream_task_ids 23 | ) 24 | assert ( 25 | task_group_prefix_builder("model1", "run") 26 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.upstream_task_ids 27 | ) 28 | 29 | 30 | def test_dependency(): 31 | # given 32 | manifest_path = manifest_file_with_models( 33 | { 34 | "model.dbt_test.model1": [], 35 | "model.dbt_test.model2": ["model.dbt_test.model1"], 36 | } 37 | ) 38 | 39 | # when 40 | with test_dag(): 41 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 42 | 43 | # then 44 | assert tasks.length() == 2 45 | 46 | assert ( 47 | task_group_prefix_builder("model1", "test") 48 | in tasks.get_task("model.dbt_test.model2").execution_airflow_task.upstream_task_ids 49 | ) 50 | assert ( 51 | task_group_prefix_builder("model2", "run") 52 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 53 | ) 54 | 55 | 56 | def test_more_complex_dependencies(): 57 | # given 58 | manifest_path = manifest_file_with_models( 59 | { 60 | "model.dbt_test.model1": [], 61 | "model.dbt_test.model2": ["model.dbt_test.model1"], 62 | "model.dbt_test.model3": ["model.dbt_test.model1", "model.dbt_test.model2"], 63 | "model.dbt_test.model4": ["model.dbt_test.model3"], 64 | } 65 | ) 66 | 67 | # when 68 | with test_dag(): 69 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 70 | 71 | # then 72 | assert tasks.length() == 4 73 | assert ( 74 | task_group_prefix_builder("model1", "test") 75 | in tasks.get_task("model.dbt_test.model2").execution_airflow_task.upstream_task_ids 76 | ) 77 | assert ( 78 | task_group_prefix_builder("model1", "test") 79 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 80 | ) 81 | assert ( 82 | task_group_prefix_builder("model2", "run") 83 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 84 | ) 85 | assert ( 86 | task_group_prefix_builder("model3", "run") 87 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 88 | ) 89 | assert ( 90 | task_group_prefix_builder("model1", "test") 91 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 92 | ) 93 | assert ( 94 | task_group_prefix_builder("model2", "test") 95 | in 
tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 96 | ) 97 | assert ( 98 | task_group_prefix_builder("model4", "run") 99 | in tasks.get_task("model.dbt_test.model3").test_airflow_task.downstream_task_ids 100 | ) 101 | 102 | 103 | def test_test_dependencies(): 104 | # given 105 | manifest_path = manifest_file_with_models( 106 | { 107 | "model.dbt_test.model1": [], 108 | "model.dbt_test.model2": ["model.dbt_test.model1"], 109 | "model.dbt_test.model3": ["model.dbt_test.model2"], 110 | "test.dbt_test.test1": ["model.dbt_test.model1"], 111 | "test.dbt_test.test2": ["model.dbt_test.model1", "model.dbt_test.model2"], 112 | } 113 | ) 114 | 115 | # when 116 | with test_dag(): 117 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 118 | 119 | # then 120 | assert tasks.length() == 4 121 | assert ( 122 | task_group_prefix_builder("model1", "test") 123 | in tasks.get_task("model.dbt_test.model2").execution_airflow_task.upstream_task_ids 124 | ) 125 | assert ( 126 | task_group_prefix_builder("model2", "test") 127 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 128 | ) 129 | assert ( 130 | task_group_prefix_builder("model2", "run") 131 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 132 | ) 133 | assert ( 134 | task_group_prefix_builder("model3", "run") 135 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 136 | ) 137 | 138 | assert ( 139 | "model1_model2_test" 140 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 141 | ) 142 | assert ( 143 | task_group_prefix_builder("model1", "test") 144 | in tasks.get_task("model1_model2_test").execution_airflow_task.upstream_task_ids 145 | ) 146 | assert ( 147 | "model1_model2_test" 148 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 149 | ) 150 | assert ( 151 | task_group_prefix_builder("model2", "test") 152 | in tasks.get_task("model1_model2_test").execution_airflow_task.upstream_task_ids 153 | ) 154 | 155 | 156 | def test_complex_test_dependencies(): 157 | # given 158 | manifest_path = manifest_file_with_models( 159 | { 160 | "model.dbt_test.model1": [], 161 | "model.dbt_test.model2": ["model.dbt_test.model1"], 162 | "model.dbt_test.model3": ["model.dbt_test.model2"], 163 | "model.dbt_test.model4": ["model.dbt_test.model1", "model.dbt_test.model2"], 164 | "model.dbt_test.model5": [], 165 | "model.dbt_test.model6": [], 166 | "model.dbt_test.model7": ["model.dbt_test.model6", "model.dbt_test.model5"], 167 | "test.dbt_test.test1": ["model.dbt_test.model6", "model.dbt_test.model5"], 168 | "test.dbt_test.test2": ["model.dbt_test.model7", "model.dbt_test.model2"], 169 | "test.dbt_test.test3": ["model.dbt_test.model2", "model.dbt_test.model3"], 170 | "test.dbt_test.test4": ["model.dbt_test.model3", "model.dbt_test.model2"], 171 | "test.dbt_test.test5": ["model.dbt_test.model3", "model.dbt_test.model2"], 172 | } 173 | ) 174 | 175 | # when 176 | with test_dag(): 177 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 178 | 179 | # then 180 | assert tasks.length() == 10 181 | assert ( 182 | task_group_prefix_builder("model1", "test") 183 | in tasks.get_task("model.dbt_test.model2").execution_airflow_task.upstream_task_ids 184 | ) 185 | assert ( 186 | task_group_prefix_builder("model2", "run") 187 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 188 | ) 189 | assert ( 190 | 
task_group_prefix_builder("model2", "test") 191 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 192 | ) 193 | assert ( 194 | task_group_prefix_builder("model3", "run") 195 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 196 | ) 197 | assert ( 198 | task_group_prefix_builder("model1", "test") 199 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 200 | ) 201 | assert ( 202 | task_group_prefix_builder("model2", "test") 203 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 204 | ) 205 | assert ( 206 | task_group_prefix_builder("model4", "run") 207 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 208 | ) 209 | assert ( 210 | task_group_prefix_builder("model4", "run") 211 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 212 | ) 213 | assert ( 214 | task_group_prefix_builder("model5", "test") 215 | in tasks.get_task("model.dbt_test.model7").execution_airflow_task.upstream_task_ids 216 | ) 217 | assert ( 218 | task_group_prefix_builder("model7", "run") 219 | in tasks.get_task("model.dbt_test.model5").test_airflow_task.downstream_task_ids 220 | ) 221 | assert ( 222 | task_group_prefix_builder("model6", "test") 223 | in tasks.get_task("model.dbt_test.model7").execution_airflow_task.upstream_task_ids 224 | ) 225 | assert ( 226 | task_group_prefix_builder("model7", "run") 227 | in tasks.get_task("model.dbt_test.model6").test_airflow_task.downstream_task_ids 228 | ) 229 | 230 | def extract_model_arguments(args: str) -> List[str]: 231 | return list(filter(lambda s: not s.startswith("-"), args.split("--select ")[1].split())) 232 | 233 | assert ( 234 | "model2_model3_test" 235 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 236 | ) 237 | assert ( 238 | task_group_prefix_builder("model2", "test") 239 | in tasks.get_task("model2_model3_test").execution_airflow_task.upstream_task_ids 240 | ) 241 | assert ( 242 | "model2_model3_test" 243 | in tasks.get_task("model.dbt_test.model3").test_airflow_task.downstream_task_ids 244 | ) 245 | assert ( 246 | task_group_prefix_builder("model3", "test") 247 | in tasks.get_task("model2_model3_test").execution_airflow_task.upstream_task_ids 248 | ) 249 | assert all( 250 | test_name 251 | in extract_model_arguments( 252 | tasks.get_task("model2_model3_test").execution_airflow_task.arguments[0] 253 | ) 254 | for test_name in ["test3", "test4", "test5"] 255 | ) 256 | assert all( 257 | test_name 258 | not in extract_model_arguments( 259 | tasks.get_task("model2_model3_test").execution_airflow_task.arguments[0] 260 | ) 261 | for test_name in ["test1", "test2"] 262 | ) 263 | assert ( 264 | "model2_model7_test" 265 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 266 | ) 267 | assert ( 268 | task_group_prefix_builder("model2", "test") 269 | in tasks.get_task("model2_model7_test").execution_airflow_task.upstream_task_ids 270 | ) 271 | assert ( 272 | "model2_model7_test" 273 | in tasks.get_task("model.dbt_test.model7").test_airflow_task.downstream_task_ids 274 | ) 275 | assert ( 276 | task_group_prefix_builder("model7", "test") 277 | in tasks.get_task("model2_model7_test").execution_airflow_task.upstream_task_ids 278 | ) 279 | assert "test2" in extract_model_arguments( 280 | tasks.get_task("model2_model7_test").execution_airflow_task.arguments[0] 281 | ) 282 | assert all( 283 | test_name 284 | not in 
extract_model_arguments( 285 | tasks.get_task("model2_model7_test").execution_airflow_task.arguments[0] 286 | ) 287 | for test_name in ["test1", "test3", "test4", "test5"] 288 | ) 289 | assert ( 290 | "model5_model6_test" 291 | in tasks.get_task("model.dbt_test.model5").test_airflow_task.downstream_task_ids 292 | ) 293 | assert ( 294 | task_group_prefix_builder("model5", "test") 295 | in tasks.get_task("model5_model6_test").execution_airflow_task.upstream_task_ids 296 | ) 297 | assert ( 298 | "model5_model6_test" 299 | in tasks.get_task("model.dbt_test.model6").test_airflow_task.downstream_task_ids 300 | ) 301 | assert ( 302 | task_group_prefix_builder("model6", "test") 303 | in tasks.get_task("model5_model6_test").execution_airflow_task.upstream_task_ids 304 | ) 305 | assert "test1" in extract_model_arguments( 306 | tasks.get_task("model5_model6_test").execution_airflow_task.arguments[0] 307 | ) 308 | assert all( 309 | test_name 310 | not in extract_model_arguments( 311 | tasks.get_task("model5_model6_test").execution_airflow_task.arguments[0] 312 | ) 313 | for test_name in ["test2", "test3", "test4", "test5"] 314 | ) 315 | -------------------------------------------------------------------------------- /tests/test_edges.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | builder_factory, 3 | manifest_file_with_models, 4 | task_group_prefix_builder, 5 | test_dag, 6 | ) 7 | 8 | 9 | def test_starting_tasks(): 10 | # given 11 | manifest_path = manifest_file_with_models( 12 | { 13 | "model.dbt_test.model1": [], 14 | "model.dbt_test.model2": [], 15 | "model.dbt_test.model3": ["model.dbt_test.model1", "model.dbt_test.model2"], 16 | "model.dbt_test.model4": ["model.dbt_test.model3"], 17 | "model.dbt_test.model5": [], 18 | } 19 | ) 20 | 21 | # when 22 | with test_dag(): 23 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 24 | 25 | # then 26 | starting_tasks_names = [ 27 | task.execution_airflow_task.task_id for task in tasks.get_starting_tasks() 28 | ] 29 | assert task_group_prefix_builder("model1", "run") in starting_tasks_names 30 | assert task_group_prefix_builder("model2", "run") in starting_tasks_names 31 | assert task_group_prefix_builder("model5", "run") in starting_tasks_names 32 | 33 | 34 | def test_ending_tasks(): 35 | # given 36 | manifest_path = manifest_file_with_models( 37 | { 38 | "model.dbt_test.model1": [], 39 | "model.dbt_test.model2": [], 40 | "model.dbt_test.model3": ["model.dbt_test.model1", "model.dbt_test.model2"], 41 | "model.dbt_test.model4": ["model.dbt_test.model3"], 42 | "model.dbt_test.model5": [], 43 | } 44 | ) 45 | 46 | # when 47 | with test_dag(): 48 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 49 | 50 | # then 51 | ending_tasks_names = [task.test_airflow_task.task_id for task in tasks.get_ending_tasks()] 52 | assert task_group_prefix_builder("model4", "test") in ending_tasks_names 53 | assert task_group_prefix_builder("model5", "test") in ending_tasks_names 54 | -------------------------------------------------------------------------------- /tests/test_ephemeral_operator.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from dbt_airflow_factory.airflow_dag_factory import AirflowDagFactory 4 | from dbt_airflow_factory.operator import EphemeralOperator 5 | from tests.utils import task_group_prefix_builder, test_dag 6 | 7 | 8 | def _get_ephemeral_name(model_name: str) -> str: 9 
| return f"{model_name}__ephemeral" 10 | 11 | 12 | def test_ephemeral_dag_factory(): 13 | # given 14 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "ephemeral_operator") 15 | 16 | # when 17 | dag = factory.create() 18 | 19 | # then 20 | assert len(dag.tasks) == 16 21 | 22 | task_group_names = [ 23 | el 24 | for node_name in ["model1", "model4", "model6"] 25 | for el in [ 26 | task_group_prefix_builder(node_name, "test"), 27 | task_group_prefix_builder(node_name, "run"), 28 | ] 29 | ] 30 | ephemeral_task_names = [ 31 | node_name + "__ephemeral" 32 | for node_name in [ 33 | "model2", 34 | "model3", 35 | "model5", 36 | "model7", 37 | "model8", 38 | "model9", 39 | "model10", 40 | "model11", 41 | ] 42 | ] 43 | assert set(dag.task_ids) == set(["dbt_seed", "end"] + task_group_names + ephemeral_task_names) 44 | 45 | for ephemeral_task_name in ephemeral_task_names: 46 | assert isinstance(dag.task_dict[ephemeral_task_name], EphemeralOperator) 47 | 48 | 49 | def test_no_ephemeral_dag_factory(): 50 | # given 51 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "no_ephemeral_operator") 52 | 53 | # when 54 | dag = factory.create() 55 | 56 | # then 57 | assert len(dag.tasks) == 8 58 | 59 | task_group_names = [ 60 | el 61 | for node_name in ["model1", "model4", "model6"] 62 | for el in [ 63 | task_group_prefix_builder(node_name, "test"), 64 | task_group_prefix_builder(node_name, "run"), 65 | ] 66 | ] 67 | assert set(dag.task_ids) == set(["dbt_seed", "end"] + task_group_names) 68 | 69 | for task_name in task_group_names: 70 | assert not isinstance(dag.task_dict[task_name], EphemeralOperator) 71 | 72 | 73 | def test_ephemeral_tasks(): 74 | with test_dag(): 75 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "ephemeral_operator") 76 | tasks = factory._builder.parse_manifest_into_tasks(factory._manifest_file_path()) 77 | 78 | # then 79 | assert ( 80 | task_group_prefix_builder("model1", "test") 81 | in tasks.get_task("model.dbt_test.model1").execution_airflow_task.downstream_task_ids 82 | ) 83 | assert ( 84 | task_group_prefix_builder("model1", "run") 85 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.upstream_task_ids 86 | ) 87 | 88 | assert ( 89 | task_group_prefix_builder("model1", "test") 90 | in tasks.get_task("model.dbt_test.model2").execution_airflow_task.upstream_task_ids 91 | ) 92 | assert ( 93 | "model2__ephemeral" 94 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 95 | ) 96 | 97 | assert ( 98 | "model2__ephemeral" 99 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 100 | ) 101 | assert ( 102 | "model3__ephemeral" 103 | in tasks.get_task("model.dbt_test.model5").execution_airflow_task.downstream_task_ids 104 | ) 105 | 106 | assert ( 107 | "model3__ephemeral" 108 | in tasks.get_task("model.dbt_test.model10").execution_airflow_task.upstream_task_ids 109 | ) 110 | assert ( 111 | "model9__ephemeral" 112 | in tasks.get_task("model.dbt_test.model10").execution_airflow_task.upstream_task_ids 113 | ) 114 | assert ( 115 | "model10__ephemeral" 116 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.downstream_task_ids 117 | ) 118 | assert ( 119 | "model10__ephemeral" 120 | in tasks.get_task("model.dbt_test.model9").execution_airflow_task.downstream_task_ids 121 | ) 122 | assert ( 123 | "model11__ephemeral" 124 | in tasks.get_task("model.dbt_test.model10").execution_airflow_task.downstream_task_ids 125 | ) 126 | assert ( 127 | "model10__ephemeral" 128 
| in tasks.get_task("model.dbt_test.model11").execution_airflow_task.upstream_task_ids 129 | ) 130 | 131 | 132 | def test_no_ephemeral_tasks(): 133 | with test_dag(): 134 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), "no_ephemeral_operator") 135 | tasks = factory._builder.parse_manifest_into_tasks(factory._manifest_file_path()) 136 | 137 | # then 138 | assert ( 139 | task_group_prefix_builder("model1", "test") 140 | in tasks.get_task("model.dbt_test.model1").execution_airflow_task.downstream_task_ids 141 | ) 142 | assert ( 143 | task_group_prefix_builder("model1", "run") 144 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.upstream_task_ids 145 | ) 146 | 147 | assert ( 148 | task_group_prefix_builder("model1", "test") 149 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 150 | ) 151 | assert ( 152 | task_group_prefix_builder("model4", "run") 153 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 154 | ) 155 | 156 | assert ( 157 | task_group_prefix_builder("model6", "test") 158 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 159 | ) 160 | assert ( 161 | task_group_prefix_builder("model4", "run") 162 | in tasks.get_task("model.dbt_test.model6").test_airflow_task.downstream_task_ids 163 | ) 164 | -------------------------------------------------------------------------------- /tests/test_notifications.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | from os import path 4 | from unittest.mock import MagicMock, patch 5 | 6 | import pytest 7 | from airflow.models import Connection 8 | 9 | from dbt_airflow_factory.constants import ( 10 | IS_AIRFLOW_NEWER_THAN_2_4, 11 | IS_FIRST_AIRFLOW_VERSION, 12 | ) 13 | 14 | if IS_FIRST_AIRFLOW_VERSION: 15 | from airflow.contrib.operators.slack_webhook_operator import SlackWebhookOperator 16 | else: 17 | from airflow.providers.slack.operators.slack_webhook import SlackWebhookOperator 18 | 19 | from dbt_airflow_factory.airflow_dag_factory import AirflowDagFactory 20 | from dbt_airflow_factory.notifications.handler import NotificationHandlersFactory 21 | 22 | 23 | @pytest.mark.parametrize( 24 | "config_dir", 25 | ( 26 | "notifications_slack", 27 | "notifications_teams", 28 | ), 29 | ) 30 | def test_notification_callback_creation(config_dir): 31 | # given 32 | factory = AirflowDagFactory(path.dirname(path.abspath(__file__)), config_dir) 33 | 34 | # when 35 | dag = factory.create() 36 | 37 | # then 38 | assert dag.default_args["on_failure_callback"] 39 | 40 | 41 | @patch( 42 | "airflow.hooks.base.BaseHook.get_connection" 43 | if IS_AIRFLOW_NEWER_THAN_2_4 44 | else "airflow.hooks.base_hook.BaseHook.get_connection" 45 | ) 46 | @patch( 47 | "airflow.contrib.operators.slack_webhook_operator.SlackWebhookOperator.__new__" 48 | if IS_FIRST_AIRFLOW_VERSION 49 | else "airflow.providers.slack.operators.slack_webhook.SlackWebhookOperator.__new__" 50 | ) 51 | def test_notification_send_for_slack(mock_operator_init, mock_get_connection): 52 | # given 53 | notifications_config = AirflowDagFactory( 54 | path.dirname(path.abspath(__file__)), "notifications_slack" 55 | ).airflow_config["failure_handlers"] 56 | factory = NotificationHandlersFactory() 57 | context = create_context() 58 | mock_get_connection.return_value = create_slack_connection() 59 | mock_operator = MagicMock() 60 | mock_operator_init.return_value = mock_operator 61 | 62 | # when 63 | 
factory.create_failure_handler(notifications_config)(context) 64 | 65 | # then 66 | mock_operator_init.assert_called_once_with( 67 | SlackWebhookOperator, 68 | task_id="slack_failure_notification", 69 | message=":red_circle: Task Failed.\n" 70 | "*Task*: task_id\n" 71 | "*Dag*: dag_id\n" 72 | "*Execution Time*: some date\n" 73 | "*Log Url*: log_url", 74 | http_conn_id="slack_failure", 75 | webhook_token="test_password", 76 | username="test_login", 77 | ) 78 | mock_operator.execute.assert_called_once_with(context=context) 79 | 80 | 81 | @patch( 82 | "airflow.hooks.base.BaseHook.get_connection" 83 | if IS_AIRFLOW_NEWER_THAN_2_4 84 | else "airflow.hooks.base_hook.BaseHook.get_connection" 85 | ) 86 | @patch("dbt_airflow_factory.notifications.ms_teams_webhook_hook.MSTeamsWebhookHook.run") 87 | def test_notification_send_for_teams(mock_hook_run, mock_get_connection): 88 | # given 89 | notifications_config = AirflowDagFactory( 90 | path.dirname(path.abspath(__file__)), "notifications_teams" 91 | ).airflow_config["failure_handlers"] 92 | factory = NotificationHandlersFactory() 93 | context = create_context() 94 | mock_get_connection.return_value = create_teams_connection() 95 | expected_payload_path = pathlib.Path(__file__).parent / "teams_webhook_expected_paylaod.json" 96 | with open(expected_payload_path, "rt") as f: 97 | webhook_expected_payload = json.load(f) 98 | 99 | # when 100 | factory.create_failure_handler(notifications_config)(context) 101 | 102 | # then 103 | request = mock_hook_run.call_args_list[0].kwargs 104 | webhook_post_data = json.loads(request["data"].replace("\n", "").replace(" ", "")) 105 | assert mock_hook_run.called_once 106 | assert webhook_post_data == webhook_expected_payload 107 | 108 | 109 | def create_slack_connection(): 110 | connection = MagicMock() 111 | connection.configure_mock(**{"login": "test_login", "password": "test_password"}) 112 | return connection 113 | 114 | 115 | def create_teams_connection(): 116 | connection = Connection( 117 | **{ 118 | "login": None, 119 | "password": None, 120 | "conn_type": "http", 121 | "host": "teams.com/webhook_endpoint", 122 | "schema": "https", 123 | } 124 | ) 125 | return connection 126 | 127 | 128 | def create_context(): 129 | task_instance = MagicMock() 130 | task_instance.configure_mock(**{"task_id": "task_id", "dag_id": "dag_id", "log_url": "log_url"}) 131 | return {"task_instance": task_instance, "execution_date": "some date", "ts": "ts"} 132 | -------------------------------------------------------------------------------- /tests/test_task_group.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | builder_factory, 3 | manifest_file_with_models, 4 | task_group_prefix_builder, 5 | test_dag, 6 | ) 7 | 8 | 9 | def test_task_group(): 10 | # given 11 | manifest_path = manifest_file_with_models( 12 | { 13 | "model.dbt_test.model1": [], 14 | "model.dbt_test.model2": ["model.dbt_test.model1"], 15 | "model.dbt_test.model3": ["model.dbt_test.model1"], 16 | "model.dbt_test.model4": ["model.dbt_test.model2", "model.dbt_test.model3"], 17 | } 18 | ) 19 | 20 | # when 21 | with test_dag(): 22 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 23 | 24 | # then 25 | assert ( 26 | task_group_prefix_builder("model1", "test") 27 | in tasks.get_task("model.dbt_test.model1").execution_airflow_task.downstream_task_ids 28 | ) 29 | assert ( 30 | task_group_prefix_builder("model1", "run") 31 | in 
tasks.get_task("model.dbt_test.model1").test_airflow_task.upstream_task_ids 32 | ) 33 | 34 | assert ( 35 | task_group_prefix_builder("model1", "test") 36 | in tasks.get_task("model.dbt_test.model2").execution_airflow_task.upstream_task_ids 37 | ) 38 | assert ( 39 | task_group_prefix_builder("model2", "run") 40 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 41 | ) 42 | 43 | assert ( 44 | task_group_prefix_builder("model1", "test") 45 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 46 | ) 47 | assert ( 48 | task_group_prefix_builder("model3", "run") 49 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 50 | ) 51 | 52 | assert ( 53 | task_group_prefix_builder("model2", "test") 54 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 55 | ) 56 | assert ( 57 | task_group_prefix_builder("model3", "test") 58 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 59 | ) 60 | assert ( 61 | task_group_prefix_builder("model4", "run") 62 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 63 | ) 64 | assert ( 65 | task_group_prefix_builder("model4", "run") 66 | in tasks.get_task("model.dbt_test.model3").test_airflow_task.downstream_task_ids 67 | ) 68 | 69 | 70 | def test_no_task_group(): 71 | # given 72 | manifest_path = manifest_file_with_models( 73 | { 74 | "model.dbt_test.model1": [], 75 | "model.dbt_test.model2": ["model.dbt_test.model1"], 76 | "model.dbt_test.model3": ["model.dbt_test.model1"], 77 | "model.dbt_test.model4": ["model.dbt_test.model2", "model.dbt_test.model3"], 78 | } 79 | ) 80 | 81 | # when 82 | with test_dag(): 83 | tasks = builder_factory(False).create().parse_manifest_into_tasks(manifest_path) 84 | 85 | # then 86 | assert ( 87 | "model1_test" 88 | in tasks.get_task("model.dbt_test.model1").execution_airflow_task.downstream_task_ids 89 | ) 90 | assert ( 91 | "model1_run" in tasks.get_task("model.dbt_test.model1").test_airflow_task.upstream_task_ids 92 | ) 93 | 94 | assert ( 95 | "model1_test" 96 | in tasks.get_task("model.dbt_test.model2").execution_airflow_task.upstream_task_ids 97 | ) 98 | assert ( 99 | "model2_run" 100 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 101 | ) 102 | 103 | assert ( 104 | "model1_test" 105 | in tasks.get_task("model.dbt_test.model3").execution_airflow_task.upstream_task_ids 106 | ) 107 | assert ( 108 | "model3_run" 109 | in tasks.get_task("model.dbt_test.model1").test_airflow_task.downstream_task_ids 110 | ) 111 | 112 | assert ( 113 | "model2_test" 114 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 115 | ) 116 | assert ( 117 | "model3_test" 118 | in tasks.get_task("model.dbt_test.model4").execution_airflow_task.upstream_task_ids 119 | ) 120 | assert ( 121 | "model4_run" 122 | in tasks.get_task("model.dbt_test.model2").test_airflow_task.downstream_task_ids 123 | ) 124 | assert ( 125 | "model4_run" 126 | in tasks.get_task("model.dbt_test.model3").test_airflow_task.downstream_task_ids 127 | ) 128 | -------------------------------------------------------------------------------- /tests/test_tasks.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from dbt_airflow_factory.builder_factory import DbtAirflowTasksBuilderFactory 4 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 5 | 6 | from .utils import 
( 7 | builder_factory, 8 | manifest_file_with_models, 9 | task_group_prefix_builder, 10 | test_dag, 11 | ) 12 | 13 | 14 | def test_get_dag(): 15 | # given 16 | manifest_path = manifest_file_with_models({"model.dbt_test.dim_users": []}) 17 | 18 | # when 19 | with test_dag(): 20 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 21 | 22 | # then 23 | assert tasks.length() == 1 24 | assert tasks.get_task("model.dbt_test.dim_users") is not None 25 | assert tasks.get_task("model.dbt_test.dim_users").execution_airflow_task is not None 26 | assert tasks.get_task("model.dbt_test.dim_users").test_airflow_task is not None 27 | 28 | 29 | def test_run_task(): 30 | # given 31 | manifest_path = manifest_file_with_models({"model.dbt_test.dim_users": []}) 32 | 33 | # when 34 | with test_dag(): 35 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 36 | 37 | # then 38 | run_task = tasks.get_task("model.dbt_test.dim_users").execution_airflow_task 39 | assert run_task.cmds == ["bash", "-c"] 40 | assert "dbt --no-write-json run " in run_task.arguments[0] 41 | assert "--select dim_users" in run_task.arguments[0] 42 | assert '--vars "{}"' in run_task.arguments[0] 43 | assert run_task.name == "dim-users-run" if IS_FIRST_AIRFLOW_VERSION else "run" 44 | assert run_task.task_id == task_group_prefix_builder("dim_users", "run") 45 | 46 | 47 | def test_test_task(): 48 | # given 49 | manifest_path = manifest_file_with_models({"model.dbt_test.dim_users": []}) 50 | 51 | # when 52 | with test_dag(): 53 | tasks = builder_factory().create().parse_manifest_into_tasks(manifest_path) 54 | 55 | # then 56 | test_task = tasks.get_task("model.dbt_test.dim_users").test_airflow_task 57 | assert test_task.cmds == ["bash", "-c"] 58 | assert "dbt --no-write-json test " in test_task.arguments[0] 59 | assert "--select dim_users" in test_task.arguments[0] 60 | assert '--vars "{}"' in test_task.arguments[0] 61 | assert test_task.name == "dim-users-test" if IS_FIRST_AIRFLOW_VERSION else "test" 62 | assert test_task.task_id == task_group_prefix_builder("dim_users", "test") 63 | 64 | 65 | def test_dbt_vars(): 66 | # given 67 | manifest_path = manifest_file_with_models({"model.dbt_test.dim_users": []}) 68 | factory = DbtAirflowTasksBuilderFactory(path.dirname(path.abspath(__file__)), "vars", {}) 69 | 70 | # when 71 | with test_dag(): 72 | tasks = factory.create().parse_manifest_into_tasks(manifest_path) 73 | 74 | # then 75 | run_task = tasks.get_task("model.dbt_test.dim_users").execution_airflow_task 76 | assert run_task.cmds == ["bash", "-c"] 77 | assert "dbt --no-write-json run " in run_task.arguments[0] 78 | assert '--vars "{variable_1: 123, variable_2: var2}"' in run_task.arguments[0] 79 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | from datetime import datetime 5 | 6 | from airflow import DAG 7 | 8 | from dbt_airflow_factory.builder_factory import DbtAirflowTasksBuilderFactory 9 | 10 | 11 | def manifest_file_with_models(nodes_with_dependencies: dict, extra_metadata: dict = None): 12 | content_nodes = {} 13 | for node_name in nodes_with_dependencies.keys(): 14 | content_nodes[node_name] = { 15 | "depends_on": {"nodes": nodes_with_dependencies[node_name]}, 16 | "config": {"materialized": "view"}, 17 | "name": node_name.split(".")[-1], 18 | } 19 | content = {"nodes": content_nodes, "child_map": {}} 20 | 
if extra_metadata: 21 | content.update(extra_metadata) 22 | with tempfile.NamedTemporaryFile(delete=False) as tmp: 23 | tmp.write(str.encode(json.dumps(content))) 24 | return tmp.name 25 | 26 | 27 | def builder_factory(use_task_group=True, enable_project_dependencies=False, env="dev"): 28 | return DbtAirflowTasksBuilderFactory( 29 | os.path.dirname(os.path.abspath(__file__)), 30 | env, 31 | { 32 | "enable_project_dependencies": enable_project_dependencies, 33 | "use_task_group": use_task_group, 34 | }, 35 | ) 36 | 37 | 38 | def test_dag(): 39 | return DAG("test", default_args={"start_date": datetime(2021, 10, 13)}) 40 | 41 | 42 | def task_group_prefix_builder(task_model_id: str, task_command: str) -> str: 43 | from dbt_airflow_factory.constants import IS_FIRST_AIRFLOW_VERSION 44 | 45 | return ( 46 | f"{task_model_id}_{task_command}" 47 | if IS_FIRST_AIRFLOW_VERSION 48 | else f"{task_model_id}.{task_command}" 49 | ) 50 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38 3 | 4 | [testenv] 5 | extras = 6 | tests 7 | commands= 8 | python -m pytest --cov dbt_airflow_factory --cov-report xml --cov-report term-missing --ignore=venv 9 | 10 | # Lint 11 | [flake8] 12 | exclude = .git,__pycache__,docs/source/conf.py,old,build,dist 13 | max-line-length = 120 14 | extend-ignore = E203 15 | 16 | [mypy] 17 | no_strict_optional = True 18 | ignore_missing_imports = True 19 | 20 | # Autoformatter 21 | [testenv:black] 22 | basepython = python3 23 | skip_install = true 24 | deps = 25 | black 26 | commands = 27 | black 28 | 29 | # Release tooling 30 | [testenv:build] 31 | basepython = python3 32 | skip_install = true 33 | deps = 34 | wheel 35 | setuptools 36 | commands = 37 | python setup.py sdist 38 | 39 | [testenv:release] 40 | basepython = python3 41 | skip_install = true 42 | setenv = 43 | TWINE_USERNAME = {env:TWINE_USERNAME} 44 | TWINE_PASSWORD = {env:TWINE_PASSWORD} 45 | deps = 46 | {[testenv:build]deps} 47 | twine >= 1.5.0 48 | commands = 49 | {[testenv:build]commands} 50 | twine upload --skip-existing dist/* 51 | --------------------------------------------------------------------------------