├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE.md ├── Makefile ├── README.md ├── dags └── dop │ ├── __init__.py │ ├── airflow_module │ ├── dag_builder │ │ ├── dag_builder_util.py │ │ └── transformation_dag_builder.py │ └── operator │ │ ├── common.py │ │ ├── dbt_k8_operator.py │ │ ├── dbt_operator.py │ │ ├── dbt_operator_helper.py │ │ └── run_results_schema.json │ ├── component │ ├── configuration │ │ ├── __init__.py │ │ └── env.py │ ├── helper │ │ ├── __init__.py │ │ ├── dbt_init.py │ │ └── dbt_profile.py │ ├── transformation │ │ ├── common │ │ │ ├── adapter │ │ │ │ ├── model.py │ │ │ │ ├── relation.py │ │ │ │ └── schema.py │ │ │ ├── parser │ │ │ │ └── yaml_parser.py │ │ │ └── templating │ │ │ │ ├── jinja.py │ │ │ │ └── template │ │ │ │ └── global.sql │ │ └── runner │ │ │ └── bigquery │ │ │ ├── adapter │ │ │ ├── impl.py │ │ │ ├── model.py │ │ │ └── relation.py │ │ │ └── template │ │ │ └── macro │ │ │ ├── adapter.sql │ │ │ └── materialization │ │ │ ├── table_create_or_replace.sql │ │ │ └── table_upsert.sql │ └── util │ │ ├── auth.py │ │ └── secret_manager.py │ └── definitions.py ├── docker-compose.yml ├── docs ├── a_typical_dop_orchestration_flow.png ├── dop_docker_account_impersonation.png ├── dop_service_project_architecture.png ├── example_dag_with_dbt_running.png ├── grant_service_account_user.png ├── local_airflow_ui.png ├── set-variables-ide.png ├── trigger_dag.png └── trigger_full_refresh.png ├── examples └── service_project │ ├── .gcloudignore │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── dbt_start │ ├── .gitignore │ ├── README.md │ ├── analysis │ │ └── .gitkeep │ ├── data │ │ └── .gitkeep │ ├── dbt_project.yml │ ├── macros │ │ └── .gitkeep │ ├── models │ │ ├── aggregation_a │ │ │ └── covid19_cases_by_country.sql │ │ ├── aggregation_b │ │ │ └── covid19_cases_by_country_and_region.sql │ │ ├── schema.yml │ │ └── staging │ │ │ └── stg_covid19_cases.sql │ ├── snapshots │ │ └── .gitkeep │ └── tests │ │ └── .gitkeep │ ├── dbt_start_two │ ├── .gitignore │ ├── README.md │ ├── analysis │ │ └── .gitkeep │ ├── data │ │ └── .gitkeep │ ├── dbt_project.yml │ ├── macros │ │ └── .gitkeep │ ├── models │ │ ├── aggregation_a │ │ │ └── covid19_cases_by_country.sql │ │ ├── aggregation_b │ │ │ └── covid19_cases_by_country_and_region.sql │ │ ├── schema.yml │ │ └── staging │ │ │ └── stg_covid19_cases.sql │ ├── snapshots │ │ └── .gitkeep │ └── tests │ │ └── .gitkeep │ └── embedded_dop │ ├── executor_config │ └── dbt │ │ ├── Pipfile │ │ ├── Pipfile.lock │ │ └── config.yaml │ └── orchestration │ ├── dummy_upstream_dependency │ └── config.yaml │ ├── example_complex_design │ ├── config.yaml │ └── sql │ │ ├── dim_customer.sql │ │ ├── dim_customer_assertion.sql │ │ ├── dim_customer_subscription.sql │ │ ├── dim_customer_subscription_assertion.sql │ │ ├── dim_date.sql │ │ ├── dim_product.sql │ │ ├── dim_voucher.sql │ │ ├── fact_customer_activity.sql │ │ ├── fact_newly_registered_customer.sql │ │ ├── fact_transaction.sql │ │ ├── list_of_users_require_attention.sql │ │ ├── salesforce_marketing_cloud_is_ready.sql │ │ ├── salesforce_service_cloud_is_ready.sql │ │ ├── staging_salesforce_marketing_cloud.sql │ │ ├── staging_salesforce_service_cloud.sql │ │ ├── staging_zend_desk.sql │ │ ├── zend_desk_is_ready.sql │ │ ├── zend_desk_ticket_assignments.sql │ │ ├── zend_desk_ticket_comments.sql │ │ ├── zend_desk_ticket_priority_changes.sql │ │ ├── zend_desk_ticket_summary.sql │ │ └── zend_desk_ticket_user_issues.sql │ ├── example_covid19 │ ├── config.yaml │ └── sql │ │ ├── 
assert_upstream_data_is_ready.sql │ │ ├── covid19_by_country.sql │ │ ├── covid19_by_country_and_region.sql │ │ ├── data_quality_checks.sql │ │ ├── invoke_sp_all_countries_and_regions.sql │ │ ├── sp_all_countries_and_regions.sql │ │ ├── stg_covid19.sql │ │ ├── udf_empty_str_as_null.sql │ │ ├── udf_unpivot.sql │ │ └── view_covid19_by_country_and_region.sql │ ├── example_dataflow_template │ └── config.yaml │ ├── example_external_task_sensor │ └── config.yaml │ └── example_upstream_dependency │ └── config.yaml ├── infrastructure ├── cloudbuild │ ├── build-dbt.yaml │ ├── build.yaml │ └── deploy.yaml ├── dbt-docs │ ├── README.md │ └── app-engine │ │ ├── .gcloudignore │ │ ├── app.yaml │ │ ├── main.py │ │ └── requirements.txt ├── docker │ ├── README.md │ ├── composer_1.10.10 │ │ ├── Dockerfile │ │ ├── requirements.composer.txt │ │ ├── requirements.txt │ │ └── script │ │ │ ├── entrypoint.sh │ │ │ └── exec_entrypoint.sh │ ├── composer_1.10.14 │ │ ├── Dockerfile │ │ ├── constrains-composer.txt │ │ └── requirements.txt │ ├── composer_1.10.15 │ │ ├── Dockerfile │ │ ├── constrains-composer.txt │ │ └── requirements.txt │ ├── developer_only │ │ └── .airflowignore │ └── docker-compose-dop.yml └── executor │ └── dbt │ ├── Dockerfile │ ├── README.md │ └── init.py └── tests ├── __init__.py ├── integration_tests └── .gitkeep └── unit_tests ├── component └── transformation │ └── common │ └── adapter │ └── test_schema.py └── requirements.txt /.flake8: -------------------------------------------------------------------------------- 1 | # See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#line-length for more details 2 | [flake8] 3 | max-line-length = 88 4 | select = C,E,F,W,B,B950 5 | extend-ignore = E203, E501 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | venv 3 | .installed.cfg 4 | bin 5 | develop-eggs 6 | dist 7 | downloads 8 | eggs 9 | parts 10 | src/*.egg-info 11 | lib 12 | lib64 13 | *.pyc 14 | *.pyo 15 | .python-version 16 | 17 | # folders 18 | 19 | # IDE 20 | .idea 21 | 22 | .env 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 21.5b1 10 | hooks: 11 | - id: black 12 | language_version: python3 13 | - repo: https://gitlab.com/pycqa/flake8 14 | rev: 3.9.2 15 | hooks: 16 | - id: flake8 17 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # DOP v0.3.0 — 2021-08-11 2 | 3 | ## Features 4 | 5 | * **Support for "generic" airflow operators**: you can now use regular python 6 | operators as part of your config files. 7 | 8 | * **Support for “dbt docs” command to generate documentation for all dbt 9 | tasks**: Users can now add “docs generate” as a target in their DOP 10 | configuration and additionally specify a GCS bucket with the `--bucket` 11 | and `--bucket-path` options where documents are copied to. 12 | 13 | * **Serve dbt docs**: Documents generated by dbt can be served as a web page by 14 | deploying the provided app on GAE. 
Note that deploying is an additional step 15 | that needs to be carried out after docs have been generated. See 16 | `infrastructure/dbt-docs/README.md` for details. 17 | 18 | * **dbt tasks artifacts `run_results` created by dbt tasks saved to BigQuery**: 19 | This json file contains information on completed dbt invocations and is saved 20 | in the BQ table “run_results” for analysis and debugging. 21 | 22 | * **Add support for Airflow `v1.10.14` and `v1.10.15` local environments**: 23 | Users can specify which version they want to use by setting 24 | the `AIRFLOW_VERSION` environment variable. 25 | 26 | * **Pre-commit linters**: added pre-commit hooks to ensure python, yaml and some 27 | support for plain text file consistency in formatting and style throughout DOP 28 | codebase. 29 | 30 | ## Changes 31 | 32 | * **Ensure DAGs using the same DBT project do not run concurrently**: Safety 33 | feature to safely allow selective execution of workflows by calling specific 34 | commands or tags (e.g. `dbt run --m`) within a single dbt project. This avoids 35 | creating inter-dependant workflows to avoid overriding each other's artifacts, 36 | since they will share the same target location (within the dbt container). 37 | 38 | * **Test time-partitioning**: Time-partitioning of datetime type properly 39 | validated as part of schema validation. 40 | 41 | * **Use Python 3.7 and dbt 0.19.1 in Composer K8s Operator** 42 | 43 | * **Add Dataflow example task**: with the introduction of "regular" in the yaml 44 | config Airflow Operators, it is now possible to run compute intensive Dataflow 45 | jobs. Check `example_dataflow_template` for an example on how to implement a 46 | Dataflow pipeline. 47 | 48 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Datatonic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build down up 2 | 3 | # Defaults to the latest 4 | AIRFLOW_VERSION := 1.10.15 5 | 6 | include .env 7 | export 8 | DOP_PROJECT_ID := #{REPLACE WITH A GCP PROJECT ID WHERE DOP WILL EXECUTE ALL JOBS} 9 | DOP_LOCATION := #{REPLACE WITH A GCP REGION WHERE DATA WILL BE PERSISTED BY DOP} 10 | 11 | ENVS := PROJECT_ID=$(DOP_PROJECT_ID) \ 12 | LOCATION=$(DOP_LOCATION) 13 | 14 | validate: 15 | if [ -z ${DOP_PROJECT_ID} ]; then \ 16 | echo "DOP_PROJECT_ID must be defined. Aborting";\ 17 | exit 1; \ 18 | elif [ -z ${DOP_LOCATION} ]; then \ 19 | echo "DOP_LOCATION must be defined. Aborting";\ 20 | exit 1; \ 21 | elif [ -z ${AIRFLOW_VERSION} ]; then \ 22 | echo "AIRFLOW_VERSION must be defined. Aborting";\ 23 | exit 1; \ 24 | fi 25 | 26 | build: 27 | $(ENVS) docker-compose up -d --build webserver 28 | 29 | down: validate 30 | $(ENVS) docker-compose down 31 | 32 | up: 33 | $(ENVS) docker-compose up -d 34 | 35 | logs: 36 | docker logs dop_webserver -f 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Table of contents 2 | ================= 3 | * [What is DOP](#what-is-dop) 4 | * [Design Concept](#design-concept) 5 | * [A Typical DOP Orchestration Flow](#a-typical-dop-orchestration-flow) 6 | * [Prerequisites - Run in Docker](#prerequisites---run-in-docker) 7 | * [For DOP Native Features](#for-dop-native-features) 8 | * [For DBT](#for-dbt) 9 | * [Instructions for Setting things up](#instructions-for-setting-things-up) 10 | * [Run Airflow with DOP in Docker - Mac](#run-airflow-with-dop-in-docker---mac) 11 | * [Run Airflow with DOP in Docker - Windows](#run-airflow-with-dop-in-docker---windows) 12 | * [Run on Composer](#run-on-composer) 13 | * [Prerequisites](#prerequisites) 14 | * [Create Composer Cluster](#create-composer-cluster) 15 | * [Deployment](#deployment) 16 | * [Misc](#misc) 17 | * [Service Account Impersonation](#service-account-impersonation) 18 | 19 | # What is DOP 20 | ## Design Concept 21 | DOP is designed to simplify the orchestration effort across many connected components using a configuration file without the need to write any code. 22 | We have a vision to make orchestration easier to manage and more accessible to a wider group of people. 23 | 24 | Here are some of the key design concept behind DOP, 25 | - Built on top of Apache Airflow - Utilises it’s DAG capabilities with interactive GUI 26 | - DAGs without code - YAML + SQL 27 | - Native capabilities (SQL) - Materialisation, Assertion and Invocation 28 | - Extensible via plugins - DBT job, Spark job, Egress job, Triggers, etc 29 | - Easy to setup and deploy - fully automated dev environment and easy to deploy 30 | - Open Source - open sourced under the MIT license 31 | 32 | **Please note that this project is heavily optimised to run with GCP (Google Cloud Platform) services which is our current focus. By focusing on one cloud provider, it allows us to really improve on end user experience through automation** 33 | 34 | ## A Typical DOP Orchestration Flow 35 | ![Typical DOP Flow](docs/a_typical_dop_orchestration_flow.png) 36 | 37 | # Prerequisites - Run in Docker 38 | Note that all the IAM related prerequisites will be available as a Terraform template soon! 39 | 40 | ## For DOP Native Features 41 | 1. 
Download and install Docker https://docs.docker.com/get-docker/ (if you are on Windows, please follow instruction here as there are some additional steps required for it to work https://docs.docker.com/docker-for-windows/install/) 42 | 1. Download and install Google Cloud Platform (GCP) SDK following instructions here https://cloud.google.com/sdk/docs/install. 43 | 1. Create a dedicated service account for docker with limited permissions for the `development` GCP project, the Docker instance is not designed to be connected to the production environment 44 | 1. Call it `dop-docker-user@` and create it in `https://console.cloud.google.com/iam-admin/serviceaccounts?project=` 45 | 1. Assign the `roles/bigquery.dataEditor` and `roles/bigquery.jobUser` role to the service account under `https://console.cloud.google.com/iam-admin/iam?project=` 46 | 1. Your GCP user / group will need to be given the `roles/iam.serviceAccountUser` and the `roles/iam.serviceAccountTokenCreator` role on the`development` project just for the `dop-docker-user` service account in order to enable [Service Account Impersonation](#service-account-impersonation). 47 | ![Grant service account user](docs/grant_service_account_user.png) 48 | 1. Authenticating with your GCP environment by typing in `gcloud auth application-default login` in your terminal and following instructions. Make sure you proceed to the stage where `application_default_credentials.json` is created on your machine (For windows users, make a note of the path, this will be required on a later stage) 49 | 1. Clone this repository to your machine. 50 | 51 | ## For DBT 52 | 1. Setup a service account for your GCP project called `dop-dbt-user` in `https://console.cloud.google.com/iam-admin/serviceaccounts?project=` 53 | 1. Assign the `roles/bigquery.dataEditor` and `roles/bigquery.jobUser` role to the service account at project level under `https://console.cloud.google.com/iam-admin/iam?project=` 54 | 1. Your GCP user / group will need to be given the `roles/iam.serviceAccountUser` and the `roles/iam.serviceAccountTokenCreator` role on the `development` project just for the `dop-dbt-user` service account in order to enable [Service Account Impersonation](#service-account-impersonation). 55 | 56 | # Instructions for Setting things up 57 | 58 | ## Run Airflow with DOP in Docker - Mac 59 | 60 | See [README in the service project setup](examples/service_project/README.md) and follow instructions. 61 | 62 | Once it's setup, you should see example DOP DAGs such as `dop__example_covid19` 63 | ![Airflow in Docker](docs/local_airflow_ui.png) 64 | 65 | ### Local development 66 | 67 | To simplify the development, in the root folder, there is a `Makefile` and a `docker-compose.yml` that start Postgres and Airflow locally 68 | 69 | From the root of the repo run: 70 | ```shell 71 | make build \ 72 | DOP_PROJECT_ID= \ 73 | DOP_LOCATION= 74 | ``` 75 | 76 | For subsequent runs run 77 | ```shell 78 | make up \ 79 | DOP_PROJECT_ID= \ 80 | DOP_LOCATION= 81 | ``` 82 | 83 | To shut the local environment down run: 84 | ```shell 85 | make down \ 86 | DOP_PROJECT_ID= \ 87 | DOP_LOCATION= 88 | ``` 89 | 90 | On Linux, the mounted volumes in container use the native Linux filesystem user/group permissions. 
The Airflow image is started with the user/group 50000 and doesn't have read or write access to some mounted volumes 92 | (check the volumes section in `docker-compose.yml`) 93 | 94 | ```shell 95 | $ docker exec -u airflow -it dop_webserver id 96 | uid=50000(airflow) gid=50000(airflow) groups=50000(airflow) 97 | $ docker exec -u airflow -it dop_webserver ls -ld dags 98 | drwxrwxr-x 5 1001 1001 4096 Jun 4 07:25 dags 99 | $ docker exec -u airflow -it dop_webserver ls -l /secret/gcp-credentials/application_default_credentials.json 100 | -rw------- 1 1001 1001 341 Jun 4 09:54 /secret/gcp-credentials/application_default_credentials.json 101 | ``` 102 | 103 | So, permissions must be updated manually to grant read access to the secrets file and write access to the dags folder. 104 | 105 | 106 | ## Run Airflow with DOP in Docker - Windows 107 | This is currently a work in progress; however, the instructions on what needs to be done are in the [Makefile](examples/service_project/Makefile) 108 | 109 | ## Run on Composer 110 | 111 | ### Prerequisites 112 | 1. Create a dedicated service account for Composer and call it `dop-composer-user` with the following roles at project level 113 | - roles/bigquery.dataEditor 114 | - roles/bigquery.jobUser 115 | - roles/composer.worker 116 | - roles/compute.viewer 117 | 1. Create a dedicated service account for DBT with limited permissions. 118 | 1. [Already done in here if it’s DEV] Call it `dop-dbt-user@` and create it in `https://console.cloud.google.com/iam-admin/serviceaccounts?project=` 119 | 1. [Already done in here if it’s DEV] Assign the `roles/bigquery.dataEditor` and `roles/bigquery.jobUser` role to the service account at project level under `https://console.cloud.google.com/iam-admin/iam?project=` 120 | 1. The `dop-composer-user` will need to be given the `roles/iam.serviceAccountUser` and the `roles/iam.serviceAccountTokenCreator` role just for the `dop-dbt-user` service account in order to enable [Service Account Impersonation](#service-account-impersonation). 121 | 122 | ### Create Composer Cluster 123 | 1. Use the service account already created, `dop-composer-user`, instead of the default service account 124 | 1. Use the following environment variables 125 | ``` 126 | DOP_PROJECT_ID : {REPLACE WITH THE GCP PROJECT ID WHERE DOP WILL PERSIST ALL DATA TO} 127 | DOP_LOCATION : {REPLACE WITH GCP REGION LOCATION WHERE DOP WILL PERSIST ALL DATA TO} 128 | DOP_SERVICE_PROJECT_PATH := {REPLACE WITH THE ABSOLUTE PATH OF THE Service Project, i.e. /home/airflow/gcs/dags/dop_{service project name}} 129 | DOP_INFRA_PROJECT_ID := {REPLACE WITH THE GCP INFRASTRUCTURE PROJECT ID WHERE BUILD ARTIFACTS ARE STORED, i.e. a DBT docker image stored in GCR} 130 | ``` 131 | and optionally 132 | ``` 133 | DOP_GCR_PULL_SECRET_NAME := {This may be needed if the project storing the GCR images is not the same as the one where Cloud Composer runs; however, this might be a better alternative: https://medium.com/google-cloud/using-single-docker-repository-with-multiple-gke-projects-1672689f780c} 134 | ``` 135 | 1. Add the following Python Packages 136 | ``` 137 | dataclasses==0.7 138 | ``` 139 | 1. 
Finally, create a new node pool with the following k8s label 140 | ``` 141 | key: cloud.google.com/gke-nodepool 142 | value: kubernetes-task-pool 143 | ``` 144 | 145 | ### Deployment 146 | See [Service Project README](examples/service_project/README.md#deploy-to-cloud-composer) 147 | 148 | # Misc 149 | ## Service Account Impersonation 150 | Impersonation is a GCP feature that allows a user / service account to impersonate another service account. 151 | This is a very useful feature and offers the following benefits: 152 | - When doing development locally, especially with automation involved (i.e. using Docker), it is very risky to interact with GCP services using your user account directly because it may have a lot of permissions. By impersonating another service account with fewer permissions, it is a lot safer (least privilege) 153 | - No credentials need to be downloaded; all permissions are linked to the user account. If an employee leaves the company, access to GCP will be revoked immediately because the impersonation process is no longer possible 154 | 155 | The following diagram explains how we use impersonation in DOP when it runs in Docker 156 | ![DOP Docker Account Impersonation](docs/dop_docker_account_impersonation.png) 157 | 158 | And when running DBT jobs in production, we also use this technique so that the Composer service account impersonates the `dop-dbt-user` service account, meaning service account keys are not required. 159 | 160 | There are two very good Google articles explaining how impersonation works and why to use it: 161 | - [Using ImpersonatedCredentials for Google Cloud APIs](https://medium.com/google-cloud/using-impersonatedcredentials-for-google-cloud-apis-14581ca990d8) 162 | - [Stop Downloading Google Cloud Service Account Keys!](https://medium.com/@jryancanty/stop-downloading-google-cloud-service-account-keys-1811d44a97d9) 163 | 164 | 165 | ## Pre-commit Linter 166 | The [pre-commit](https://pre-commit.com/) tool runs a number of checks against the code, enforcing that all the code pushed to the repository follows the same guidelines and best practices. In this project the checks are: 167 | * Trim trailing whitespace 168 | * Fix end-of-file 169 | * YAML file format 170 | * Python code formatter using [Black](https://black.readthedocs.io/en/stable/) 171 | * Python style guide using [Flake8](https://flake8.pycqa.org/en/latest/) 172 | 173 | To install locally, follow the [installation guide](https://pre-commit.com/index.html#install) on the pre-commit page 174 | 175 | The normal usage is to run `pre-commit run` after staging files. If the [git hook](https://pre-commit.com/index.html#3-install-the-git-hook-scripts) has been installed, pre-commit will run automatically on `git commit`. 
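
As a footnote to the [Service Account Impersonation](#service-account-impersonation) section above, the pattern can be sketched in a few lines of Python using the `google-auth` library. This is only an illustrative sketch and not the implementation DOP uses internally (e.g. in `dags/dop/util/auth.py`); the service account name simply follows the `dop-docker-user` naming convention from the prerequisites and should be adjusted to your project:

```python
import google.auth
from google.auth import impersonated_credentials
from google.cloud import bigquery

# Source credentials are your own user credentials, e.g. the ones created by
# `gcloud auth application-default login`.
source_credentials, project_id = google.auth.default()

# Impersonate the less privileged service account (illustrative name, matching
# the dop-docker-user convention used earlier in this README).
target_credentials = impersonated_credentials.Credentials(
    source_credentials=source_credentials,
    target_principal=f"dop-docker-user@{project_id}.iam.gserviceaccount.com",
    target_scopes=["https://www.googleapis.com/auth/cloud-platform"],
    lifetime=300,
)

# Any client constructed with these credentials now acts as the impersonated
# service account, without a downloaded key file.
client = bigquery.Client(project=project_id, credentials=target_credentials)
print(list(client.query("SELECT 1 AS ok").result()))
```

For the token exchange to succeed, your user needs the `roles/iam.serviceAccountTokenCreator` role on the target service account, as described in the prerequisites.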
176 | -------------------------------------------------------------------------------- /dags/dop/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/dags/dop/__init__.py -------------------------------------------------------------------------------- /dags/dop/airflow_module/dag_builder/dag_builder_util.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | 6 | 7 | def get_default_dag_start_date(tzinfo): 8 | return datetime( 9 | 1970, 1, 1, tzinfo=tzinfo 10 | ) # You cannot back fill a dag prior to this date 11 | 12 | 13 | def create_dag( 14 | dag_id, 15 | start_date, 16 | schedule_interval=None, 17 | retries=3, 18 | retry_delay=None, 19 | owner="airflow", 20 | depends_on_past=False, 21 | catchup=False, 22 | max_active_runs=5, 23 | concurrency=int(Variable.get("DAG_CONCURRENCY", default_var=2)), 24 | template_searchpath=None, 25 | user_defined_macros=None, 26 | ): 27 | default_args = { 28 | "owner": owner, 29 | "depends_on_past": depends_on_past, 30 | "start_date": start_date, 31 | "retries": retries, 32 | "retry_delay": retry_delay if retry_delay is not None else timedelta(minutes=5), 33 | } 34 | 35 | return DAG( 36 | dag_id=dag_id, 37 | default_args=default_args, 38 | schedule_interval=schedule_interval, 39 | catchup=catchup, 40 | max_active_runs=max_active_runs, 41 | concurrency=concurrency, 42 | template_searchpath=template_searchpath, 43 | user_defined_macros=user_defined_macros, 44 | ) 45 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/common.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.python_operator import PythonOperator 2 | from airflow.sensors.base_sensor_operator import BaseSensorOperator 3 | 4 | 5 | class BasePythonOperator(PythonOperator): 6 | def __init__( 7 | self, 8 | python_callable, 9 | op_args=None, 10 | op_kwargs=None, 11 | provide_context=False, 12 | templates_dict=None, 13 | templates_exts=None, 14 | *args, 15 | **kwargs 16 | ): 17 | if kwargs.get("priority_weight") is None: 18 | kwargs["priority_weight"] = 1 19 | 20 | super(BasePythonOperator, self).__init__( 21 | python_callable=python_callable, 22 | op_args=op_args, 23 | op_kwargs=op_kwargs, 24 | provide_context=provide_context, 25 | templates_dict=templates_dict, 26 | templates_exts=templates_exts, 27 | *args, 28 | **kwargs 29 | ) 30 | 31 | 32 | class AbstractBaseSensorOperator(BaseSensorOperator): 33 | def __init__(self, *args, **kwargs): 34 | super(AbstractBaseSensorOperator, self).__init__(*args, **kwargs) 35 | 36 | 37 | class TransformationOperator(BasePythonOperator): 38 | template_fields = ( 39 | "action", 40 | "target", 41 | "database", 42 | "schema", 43 | "identifier", 44 | "arguments", 45 | "sql", 46 | "templates_dict", 47 | ) 48 | 49 | def __init__( 50 | self, 51 | python_callable, 52 | op_args=None, 53 | op_kwargs=None, 54 | provide_context=False, 55 | templates_dict=None, 56 | templates_exts=None, 57 | *args, 58 | **kwargs 59 | ): 60 | super(TransformationOperator, self).__init__( 61 | python_callable=python_callable, 62 | op_args=op_args, 63 | op_kwargs=op_kwargs, 64 | provide_context=provide_context, 65 | templates_dict=templates_dict, 66 | templates_exts=templates_exts, 67 | *args, 68 
| **kwargs 69 | ) 70 | task = op_kwargs["task"] 71 | self.action = task.kind.action 72 | self.target = task.kind.target 73 | self.database = task.database 74 | self.schema = task.schema 75 | self.identifier = task.identifier 76 | self.arguments = task.options.get("arguments") 77 | self.sql = templates_dict["sql"] 78 | 79 | 80 | class MaterializationOperator(TransformationOperator): 81 | ui_color = "#f2fade" 82 | 83 | 84 | class RecreateSpOperator(TransformationOperator): 85 | ui_color = "#a0c6d9" 86 | 87 | 88 | class InvocationOperator(TransformationOperator): 89 | ui_color = "#99cee8" 90 | 91 | 92 | class AssertOperator(BasePythonOperator): 93 | ui_color = "#fcc2a7" 94 | template_fields = ("assertion_sql", "templates_dict") 95 | 96 | def __init__( 97 | self, 98 | python_callable, 99 | provide_context=False, 100 | templates_dict=None, 101 | *args, 102 | **kwargs 103 | ): 104 | super(AssertOperator, self).__init__( 105 | python_callable=python_callable, 106 | provide_context=provide_context, 107 | templates_dict=templates_dict, 108 | *args, 109 | **kwargs 110 | ) 111 | self.assertion_sql = templates_dict["sql"] 112 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/dbt_k8_operator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from typing import List, Dict 5 | 6 | from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator 7 | from airflow.sensors.base_sensor_operator import apply_defaults 8 | from dop.component.configuration.env import env_config 9 | from dop.airflow_module.operator import dbt_operator_helper 10 | 11 | # List of files generated by dbt docs generate 12 | # https://docs.getdbt.com/reference/commands/cmd-docs 13 | DBT_DOC_FILES = ["index.html", "manifest.json", "catalog.json"] 14 | DBT_DOC_FOLDER = "target" 15 | DBT_USER = "dbtuser" 16 | DBT_RUN_RESULTS_PATH = "target/run_results.json" 17 | 18 | # See: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity 19 | node_pool_affinity = { 20 | "nodeAffinity": { 21 | # requiredDuringSchedulingIgnoredDuringExecution means in order 22 | # for a pod to be scheduled on a node, the node must have the 23 | # specified labels. However, if labels on a node change at 24 | # runtime such that the affinity rules on a pod are no longer 25 | # met, the pod will still continue to run on the node. 26 | "requiredDuringSchedulingIgnoredDuringExecution": { 27 | "nodeSelectorTerms": [ 28 | { 29 | "matchExpressions": [ 30 | { 31 | # When nodepools are created in Google Kubernetes 32 | # Engine, the nodes inside of that nodepool are 33 | # automatically assigned the label 34 | # 'cloud.google.com/gke-nodepool' with the value of 35 | # the nodepool's name. 
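                                    # Note: the value below must therefore match the name of the
                                    # dedicated node pool created for Composer; the README's
                                    # "Create Composer Cluster" step uses "kubernetes-task-pool".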
36 | "key": "cloud.google.com/gke-nodepool", 37 | "operator": "In", 38 | "values": ["kubernetes-task-pool"], 39 | } 40 | ] 41 | } 42 | ] 43 | } 44 | } 45 | } 46 | 47 | 48 | def retrieve_commit_hash(): 49 | with open( 50 | os.path.sep.join([env_config.service_project_path, ".commit-hash"]) 51 | ) as fp: 52 | return fp.read() 53 | 54 | 55 | class DbtK8Operator(KubernetesPodOperator): 56 | template_fields = ( 57 | "action", 58 | "target", 59 | "dbt_project_name", 60 | "image_tag", 61 | "dbt_arguments", 62 | "gcr_pull_secret_name", 63 | "arguments", 64 | ) 65 | ui_color = "#FF694B" 66 | 67 | @apply_defaults 68 | def __init__( 69 | self, 70 | dbt_project_name: str, 71 | dbt_version: str, 72 | dbt_arguments: List[Dict], 73 | *args, 74 | **kwargs, 75 | ): 76 | """ 77 | :param dbt_project_name: the name for the dbt project name inline with what's defined in `.dbt-project-repos.json` 78 | :param dbt_version: Not used 79 | :param args: 80 | :param kwargs: must contain the Task entity 81 | """ 82 | 83 | task = kwargs["task"] 84 | self.dbt_project_name = dbt_project_name 85 | self.dbt_version = "N/A, this is fixed in the docker image" 86 | self.action = task.kind.action 87 | self.target = task.kind.target 88 | self.dbt_arguments = dbt_arguments 89 | self.gcr_pull_secret_name = env_config.gcr_pull_secret_name 90 | self.image_tag = retrieve_commit_hash() 91 | 92 | self._full_refresh = ( 93 | False # used to trigger DBT full refresh, modified via execute() override 94 | ) 95 | 96 | self.arguments = [self.parse_bash_command()] 97 | 98 | super(DbtK8Operator, self).__init__( 99 | name=kwargs["task_id"], 100 | cmds=["/bin/bash", "-c"], 101 | arguments=self.arguments, 102 | get_logs=True, 103 | namespace="default", 104 | image=f"eu.gcr.io/{env_config.infra_project_id}/dop-dbt:{self.image_tag}", 105 | is_delete_operator_pod=True, 106 | env_vars={ 107 | "DOP_PROJECT_ID": env_config.project_id, 108 | "DOP_LOCATION": env_config.location, 109 | }, 110 | image_pull_secrets=self.gcr_pull_secret_name, 111 | affinity=node_pool_affinity, 112 | *args, 113 | **kwargs, 114 | ) 115 | 116 | def execute(self, context): 117 | """ 118 | Override the parent method to ingest required contexts 119 | """ 120 | dag_run_conf = context["dag_run"].conf if context["dag_run"].conf else {} 121 | full_refresh = dag_run_conf.get("full_refresh", False) 122 | 123 | self._full_refresh = full_refresh 124 | 125 | logging.info(f"### IS FULL REFRESH ENABLED: {self._full_refresh}") 126 | 127 | self.arguments = [self.parse_bash_command(context=context)] 128 | 129 | logging.info(f"### Updated arguments: {self.arguments}") 130 | 131 | super(DbtK8Operator, self).execute(context=context) 132 | 133 | def parse_bash_command(self, context=None): 134 | full_refresh_cmd = "" 135 | if self.target != "run": 136 | full_refresh_cmd = "" 137 | elif self.dbt_arguments: 138 | if self._full_refresh and "--full-refresh" not in [ 139 | arg.get("option") for arg in self.dbt_arguments 140 | ]: 141 | full_refresh_cmd = "--full-refresh" 142 | elif self._full_refresh: 143 | full_refresh_cmd = "--full-refresh" 144 | 145 | cmd_for_additional_arguments = "" 146 | 147 | # docs arguments are only used to copy files to GCS, not in the task execution 148 | if self.dbt_arguments and self.target != "docs generate": 149 | cmd_for_additional_arguments = dbt_operator_helper.implode_arguments( 150 | dbt_arguments=self.dbt_arguments 151 | ) 152 | 153 | cmd_to_run_dbt = ( 154 | f"pipenv run dbt --no-use-colors {self.target} --project-dir ./{self.dbt_project_name}" 155 | f" --vars 
{dbt_operator_helper.parsed_cmd_airflow_context_vars(context=context)}" 156 | f" {cmd_for_additional_arguments}" 157 | f" {full_refresh_cmd};" 158 | f" gsutil cp /home/{DBT_USER}/{self.dbt_project_name}/{DBT_RUN_RESULTS_PATH} gs://{os.getenv('GCS_BUCKET')}/dbt/{DBT_RUN_RESULTS_PATH}" 159 | ) 160 | 161 | if self.target == "docs generate": 162 | command = self.copy_docs_to_gcs_command() 163 | if command: 164 | cmd_to_run_dbt += f"; {command}" 165 | 166 | return cmd_to_run_dbt 167 | 168 | def copy_docs_to_gcs_command(self): 169 | """ 170 | Generate gsutil command line to copy doc files generated with dbt docs generate to GCS 171 | """ 172 | command = [] 173 | gcs_bucket = dbt_operator_helper.extract_argument( 174 | self.dbt_arguments, "--bucket" 175 | ) 176 | if not gcs_bucket: 177 | logging.warning("No bucket argument provided. Skipping copy to GCS") 178 | return "" 179 | gcs_path = dbt_operator_helper.extract_argument( 180 | self.dbt_arguments, "--bucket-path", "" 181 | ) 182 | 183 | for doc_file in DBT_DOC_FILES: 184 | doc_file_path = ( 185 | f"/home/{DBT_USER}/{self.dbt_project_name}/{DBT_DOC_FOLDER}/{doc_file}" 186 | ) 187 | logging.info(f"Copying {doc_file} to gs://{gcs_bucket}/{gcs_path}") 188 | command.append( 189 | f"gsutil cp {doc_file_path} gs://{gcs_bucket}/{gcs_path}/{doc_file}" 190 | ) 191 | return ";".join(command) 192 | 193 | def post_execute(self, context, result=None): 194 | """ 195 | This hook is triggered right after self.execute() is called. 196 | It is passed the execution context and any results returned by the 197 | operator. 198 | """ 199 | dbt_operator_helper.save_run_results_in_bq( 200 | env_config.project_id, 201 | self.dbt_project_name, 202 | f"gs://{os.getenv('GCS_BUCKET')}/dbt/{DBT_RUN_RESULTS_PATH}", 203 | ) 204 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/dbt_operator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import List, Dict 4 | 5 | from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.sensors.base_sensor_operator import apply_defaults 8 | from dop.airflow_module.operator import dbt_operator_helper 9 | from dop.component.configuration.env import env_config 10 | 11 | # List of files generated by dbt docs generate 12 | # https://docs.getdbt.com/reference/commands/cmd-docs 13 | DBT_DOC_FILES = ["index.html", "manifest.json", "catalog.json"] 14 | DBT_DOC_FOLDER = "target" 15 | DBT_RUN_RESULTS_PATH = "target/run_results.json" 16 | 17 | 18 | class DbtOperator(BashOperator): 19 | template_fields = ( 20 | "action", 21 | "target", 22 | "dbt_project_path", 23 | "dbt_version", 24 | "dbt_arguments", 25 | "bash_command", 26 | ) 27 | ui_color = "#FF694B" 28 | 29 | @apply_defaults 30 | def __init__( 31 | self, 32 | dbt_project_name: str, 33 | dbt_version: str, 34 | dbt_arguments: List[Dict], 35 | *args, 36 | **kwargs, 37 | ): 38 | """ 39 | :param dbt_project_name: the name for the dbt project name inline with what's defined in `.dbt-project-repos.json` 40 | :param dbt_version: a supported DBT version, version must be >= 0.19.1 41 | :param args: 42 | :param kwargs: must contain the Task entity 43 | """ 44 | 45 | task = kwargs["task"] 46 | self.dbt_project_path = os.path.sep.join( 47 | [env_config.dbt_projects_path, dbt_project_name] 48 | ) 49 | self.dbt_project_name = dbt_project_name 50 | self.dbt_version = dbt_version 51 | 
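        # kind.action / kind.target come from the Task entity parsed from the
        # orchestration config; for dbt tasks the target is one of "run",
        # "test" or "docs generate" (see the schema validation in
        # component/transformation/common/adapter/schema.py).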
self.action = task.kind.action 52 | self.target = task.kind.target 53 | self.dbt_arguments = dbt_arguments 54 | 55 | self._full_refresh = ( 56 | False # used to trigger DBT full refresh, modified via execute() override 57 | ) 58 | 59 | super(DbtOperator, self).__init__( 60 | bash_command=self.parse_bash_command(), *args, **kwargs 61 | ) 62 | 63 | def execute(self, context): 64 | """ 65 | Override the parent method to ingest required contexts 66 | """ 67 | dag_run_conf = context["dag_run"].conf if context["dag_run"].conf else {} 68 | full_refresh = dag_run_conf.get("full_refresh", False) 69 | 70 | self._full_refresh = full_refresh 71 | 72 | logging.info(f"### IS FULL REFRESH ENABLED: {self._full_refresh}") 73 | 74 | self.bash_command = self.parse_bash_command(context=context) 75 | super(DbtOperator, self).execute(context=context) 76 | 77 | def parse_bash_command(self, context=None): 78 | """ 79 | Create a virtualenv and run DBT. Virtualenv is removed regardless if the script is successful or not 80 | """ 81 | 82 | full_refresh_cmd = "" 83 | if self.target != "run": 84 | full_refresh_cmd = "" 85 | elif self.dbt_arguments: 86 | if self._full_refresh and "--full-refresh" not in [ 87 | arg.get("option") for arg in self.dbt_arguments 88 | ]: 89 | full_refresh_cmd = "--full-refresh" 90 | elif self._full_refresh: 91 | full_refresh_cmd = "--full-refresh" 92 | 93 | set_err_handling = "set -xe" 94 | trap = """ 95 | trap 'catch $? $LINENO' ERR 96 | catch() { 97 | echo "Script errored, removing virtualenv" 98 | rm -rf $TMP_DIR 99 | exit 1 100 | } 101 | """ 102 | cmd_for_tmp_dir = "export TMP_DIR=$(mktemp -d)" 103 | cmd_to_print_tmp_dir = "echo TMP_DIR is: $TMP_DIR" 104 | cmd_for_virtualenv = "virtualenv -p python3 $TMP_DIR" 105 | dbt_init = f"PYTHONPATH={env_config.dag_path} python {env_config.dag_path}/dop/component/helper/dbt_init.py --tmp_dir=$TMP_DIR --project_name={self.dbt_project_name}" 106 | cmd_for_activating_virtualenv = "source $TMP_DIR/bin/activate" 107 | install_pip_deps = f"pip install dbt=={self.dbt_version}" 108 | 109 | cmd_for_additional_arguments = "" 110 | 111 | # docs arguments are only used to copy files to GCS, not in the task execution 112 | if self.dbt_arguments and self.target != "docs generate": 113 | cmd_for_additional_arguments = dbt_operator_helper.implode_arguments( 114 | dbt_arguments=self.dbt_arguments 115 | ) 116 | 117 | cmd_to_run_dbt = ( 118 | f"dbt clean --project-dir {self.dbt_project_path} --profiles-dir $TMP_DIR/.dbt" 119 | f" && dbt deps --project-dir {self.dbt_project_path}" 120 | f" && dbt --no-use-colors {self.target} --project-dir {self.dbt_project_path}" 121 | f" --profiles-dir $TMP_DIR/.dbt" 122 | f" --vars {dbt_operator_helper.parsed_cmd_airflow_context_vars(context=context)}" 123 | f" {cmd_for_additional_arguments}" 124 | f" {full_refresh_cmd}" 125 | ) 126 | 127 | cmd_to_remove_tmp_dir = "rm -rf $TMP_DIR" 128 | 129 | return "\n".join( 130 | [ 131 | set_err_handling, 132 | trap, 133 | cmd_for_tmp_dir, 134 | cmd_to_print_tmp_dir, 135 | cmd_for_virtualenv, 136 | dbt_init, # setup dbt profiles.yml & service account secret from Secret Manager 137 | cmd_for_activating_virtualenv, 138 | install_pip_deps, 139 | cmd_to_run_dbt, 140 | cmd_to_remove_tmp_dir, 141 | ] 142 | ) 143 | 144 | def post_execute(self, context, result=None): 145 | """ 146 | This hook is triggered right after self.execute() is called. 147 | It is passed the execution context and any results returned by the 148 | operator. 
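
        For a "docs generate" target, the generated documentation files are
        copied to GCS; the run_results.json artifact is then always loaded
        into BigQuery.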
149 | """ 150 | if self.target == "docs generate": 151 | gcs_bucket = dbt_operator_helper.extract_argument( 152 | self.dbt_arguments, "--bucket" 153 | ) 154 | if not gcs_bucket: 155 | logging.warning("No bucket argument provided. Skipping copy to GCS") 156 | gcs_path = dbt_operator_helper.extract_argument( 157 | self.dbt_arguments, "--bucket-path", "" 158 | ) 159 | logging.info(f"Copying dbt docs JSON files to GCS bucket {gcs_bucket}") 160 | dbt_operator_helper.copy_docs_to_gcs( 161 | gcs_bucket, gcs_path, self.dbt_project_path 162 | ) 163 | 164 | dbt_operator_helper.save_run_results_in_bq( 165 | env_config.project_id, 166 | self.dbt_project_name, 167 | f"{self.dbt_project_path}/{DBT_RUN_RESULTS_PATH}", 168 | ) 169 | 170 | def copy_docs_to_gcs(self, bucket: str, bucket_path: str, project_path: str): 171 | """ 172 | Copy doc files generated with dbt docs generate to GCS 173 | 174 | :param bucket: Bucket where the doc files will be copied 175 | :param bucket_path: Path in the bucket 176 | :param project_path: Local project folder 177 | """ 178 | hook = GoogleCloudStorageHook() 179 | for doc_file in DBT_DOC_FILES: 180 | doc_file_path = f"{project_path}/{DBT_DOC_FOLDER}/{doc_file}" 181 | if os.path.exists(doc_file_path): 182 | logging.info( 183 | f"{doc_file} found. Copying to gs://{bucket}/{bucket_path}" 184 | ) 185 | hook.upload( 186 | bucket, 187 | object=f"{bucket_path}/{doc_file}" if bucket_path else doc_file, 188 | filename=doc_file_path, 189 | mime_type="text/html" 190 | if doc_file.endswith(".html") 191 | else "application/json", 192 | ) 193 | else: 194 | logging.warning(f"{doc_file} not found. Skipping") 195 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/dbt_operator_helper.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import logging 4 | import pathlib 5 | 6 | from google.cloud import bigquery 7 | from google.cloud import storage 8 | from urllib.parse import urlparse 9 | from google.cloud.exceptions import NotFound 10 | 11 | DBT_RUN_RESULTS_TABLE = "run_results" 12 | DBT_RUN_RESULTS_SCHEMA_FILE = "run_results_schema.json" 13 | 14 | 15 | def implode_arguments(dbt_arguments, filter_func=None): 16 | filtered_dbt_arguments = ( 17 | filter_func(dbt_arguments) if filter_func else dbt_arguments 18 | ) 19 | return " ".join( 20 | [ 21 | " ".join( 22 | [ 23 | argument["option"], 24 | "" if argument.get("value") is None else argument.get("value"), 25 | ] 26 | ) 27 | for argument in filtered_dbt_arguments 28 | ] 29 | ) 30 | 31 | 32 | def parsed_cmd_airflow_context_vars(context): 33 | cmd = '"{' 34 | context_vars = ["ds", "ds_nodash", "ts", "ts_nodash", "ts_nodash_with_tz"] 35 | if context: 36 | var_list = [f"'{v}'" + f": '{context[v]}'" for v in context_vars] 37 | else: 38 | var_list = [f"'{v}'" + ": '{{ " + v + " }}'" for v in context_vars] 39 | cmd += ",".join(var_list) 40 | 41 | cmd += '}"' 42 | 43 | return cmd 44 | 45 | 46 | def extract_argument(dbt_arguments: list, name: str, default_value: str = None): 47 | """ 48 | Extract an argument from the argument list. Format is 49 | [ 50 | {'option': 'OPTION1', 'value': 'VALUE1'}, 51 | {'option': 'OPTION2', 'value': 'VALUE2'}, 52 | ... 
53 | ] 54 | 55 | :param dbt_arguments: Argument list 56 | :param name: Argument to extract 57 | :param default_value: Default value to return if not present 58 | """ 59 | return next( 60 | (arg.get("value") for arg in dbt_arguments if arg.get("option") == name), 61 | default_value, 62 | ) 63 | 64 | def save_run_results_in_bq(project_id, dbt_project_name, run_results_path): 65 | """ 66 | Load run_results json file in BigQuery. As a first step is checked if the table 67 | already exists in the schema and if not, it's created. 68 | 69 | To fit BQ schema, the field metadata.env (JSON object) must be serialised 70 | and results.message converted to string, 71 | because depending on the task it can be an integer or a string 72 | """ 73 | table_id = f"{project_id}.{dbt_project_name}.{DBT_RUN_RESULTS_TABLE}" 74 | client = bigquery.Client(project=project_id) 75 | check_run_results_table(client, table_id) 76 | 77 | run_results = {} 78 | if run_results_path.startswith("gs://"): 79 | storage_client = storage.Client() 80 | bucket, path = _parse_gcs_url(run_results_path) 81 | bucket = storage_client.get_bucket(bucket) 82 | blob = bucket.blob(path) 83 | run_results = json.loads(blob.download_as_string()) 84 | else: 85 | with open(run_results_path) as run_results_file: 86 | run_results = json.load(run_results_file) 87 | 88 | if run_results["metadata"]["env"]: 89 | run_results["metadata"]["env"] = json.dumps(run_results["metadata"]["env"]) 90 | else: 91 | del run_results["metadata"]["env"] 92 | for item in run_results["results"]: 93 | item["message"] = str(item["message"]) 94 | 95 | data_as_file = io.StringIO(json.dumps(run_results)) 96 | job_config = bigquery.LoadJobConfig( 97 | source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON 98 | ) 99 | job = client.load_table_from_file(data_as_file, table_id, job_config=job_config) 100 | try: 101 | result = job.result() # Waits for table load to complete. 102 | logging.info("Pushed {} rows into run_results table".format(result.output_rows)) 103 | except Exception: 104 | logging.info(f"Error loading run_results to BigQuery: {job.errors}") 105 | 106 | 107 | def check_run_results_table(client, table_id): 108 | """ 109 | Check if run_results table exists in BigQuery, and if not create it 110 | """ 111 | try: 112 | client.get_table(table_id) 113 | except NotFound: 114 | print("Table {} is not found.".format(table_id)) 115 | current_folder = pathlib.Path(__file__).parent.absolute() 116 | schema = client.schema_from_json( 117 | f"{current_folder}/{DBT_RUN_RESULTS_SCHEMA_FILE}" 118 | ) 119 | table = bigquery.Table(table_id, schema=schema) 120 | client.create_table(table) 121 | 122 | 123 | def _parse_gcs_url(gsurl): 124 | """ 125 | Given a Google Cloud Storage URL (gs:///), returns a 126 | tuple containing the corresponding bucket and blob. 
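
    For example, "gs://my-bucket/dbt/run_results.json" is parsed into
    ("my-bucket", "dbt/run_results.json").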
127 | """ 128 | parsed_url = urlparse(gsurl) 129 | bucket = parsed_url.netloc 130 | blob = parsed_url.path.lstrip("/") 131 | return bucket, blob 132 | 133 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/run_results_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "elapsed_time", 5 | "type": "FLOAT" 6 | }, 7 | { 8 | "fields": [ 9 | { 10 | "mode": "REPEATED", 11 | "name": "models", 12 | "type": "STRING" 13 | }, 14 | { 15 | "mode": "NULLABLE", 16 | "name": "version_check", 17 | "type": "BOOLEAN" 18 | }, 19 | { 20 | "mode": "NULLABLE", 21 | "name": "rpc_method", 22 | "type": "STRING" 23 | }, 24 | { 25 | "mode": "NULLABLE", 26 | "name": "vars", 27 | "type": "STRING" 28 | }, 29 | { 30 | "mode": "NULLABLE", 31 | "name": "log_format", 32 | "type": "STRING" 33 | }, 34 | { 35 | "mode": "NULLABLE", 36 | "name": "project_dir", 37 | "type": "STRING" 38 | }, 39 | { 40 | "mode": "NULLABLE", 41 | "name": "profiles_dir", 42 | "type": "STRING" 43 | }, 44 | { 45 | "mode": "NULLABLE", 46 | "name": "which", 47 | "type": "STRING" 48 | }, 49 | { 50 | "mode": "NULLABLE", 51 | "name": "use_cache", 52 | "type": "BOOLEAN" 53 | }, 54 | { 55 | "mode": "NULLABLE", 56 | "name": "use_colors", 57 | "type": "BOOLEAN" 58 | }, 59 | { 60 | "mode": "NULLABLE", 61 | "name": "write_json", 62 | "type": "BOOLEAN" 63 | }, 64 | { 65 | "mode": "NULLABLE", 66 | "name": "schema", 67 | "type": "BOOLEAN" 68 | }, 69 | { 70 | "mode": "NULLABLE", 71 | "name": "data", 72 | "type": "BOOLEAN" 73 | }, 74 | { 75 | "mode": "NULLABLE", 76 | "name": "full_refresh", 77 | "type": "BOOLEAN" 78 | } 79 | ], 80 | "mode": "NULLABLE", 81 | "name": "args", 82 | "type": "RECORD" 83 | }, 84 | { 85 | "fields": [ 86 | { 87 | "mode": "NULLABLE", 88 | "name": "message", 89 | "type": "STRING" 90 | }, 91 | { 92 | "fields": [ 93 | { 94 | "description": "bq-datetime", 95 | "mode": "NULLABLE", 96 | "name": "completed_at", 97 | "type": "TIMESTAMP" 98 | }, 99 | { 100 | "description": "bq-datetime", 101 | "mode": "NULLABLE", 102 | "name": "started_at", 103 | "type": "TIMESTAMP" 104 | }, 105 | { 106 | "mode": "NULLABLE", 107 | "name": "name", 108 | "type": "STRING" 109 | } 110 | ], 111 | "mode": "REPEATED", 112 | "name": "timing", 113 | "type": "RECORD" 114 | }, 115 | { 116 | "mode": "NULLABLE", 117 | "name": "thread_id", 118 | "type": "STRING" 119 | }, 120 | { 121 | "fields": [ 122 | { 123 | "mode": "NULLABLE", 124 | "name": "bytes_processed", 125 | "type": "INTEGER" 126 | }, 127 | { 128 | "mode": "NULLABLE", 129 | "name": "code", 130 | "type": "STRING" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "_message", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "rows_affected", 140 | "type": "INTEGER" 141 | } 142 | ], 143 | "mode": "NULLABLE", 144 | "name": "adapter_response", 145 | "type": "RECORD" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "unique_id", 150 | "type": "STRING" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "execution_time", 155 | "type": "FLOAT" 156 | }, 157 | { 158 | "mode": "NULLABLE", 159 | "name": "status", 160 | "type": "STRING" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "results", 165 | "type": "RECORD" 166 | }, 167 | { 168 | "fields": [ 169 | { 170 | "mode": "NULLABLE", 171 | "name": "env", 172 | "type": "STRING" 173 | }, 174 | { 175 | "mode": "NULLABLE", 176 | "name": "invocation_id", 177 | "type": "STRING" 178 
| }, 179 | { 180 | "mode": "NULLABLE", 181 | "name": "dbt_version", 182 | "type": "STRING" 183 | }, 184 | { 185 | "description": "bq-datetime", 186 | "mode": "NULLABLE", 187 | "name": "generated_at", 188 | "type": "TIMESTAMP" 189 | }, 190 | { 191 | "mode": "NULLABLE", 192 | "name": "dbt_schema_version", 193 | "type": "STRING" 194 | } 195 | ], 196 | "mode": "NULLABLE", 197 | "name": "metadata", 198 | "type": "RECORD" 199 | } 200 | ] 201 | -------------------------------------------------------------------------------- /dags/dop/component/configuration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/dags/dop/component/configuration/__init__.py -------------------------------------------------------------------------------- /dags/dop/component/configuration/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DOP_DBT_USER = "dop-dbt-user" 4 | DOP_DOCKER_USER = "dop-docker-user" 5 | 6 | 7 | class EnvConfig: 8 | def __init__(self): 9 | pass 10 | 11 | @property 12 | def environment(self): 13 | return os.environ["DOP_ENVIRONMENT"] 14 | 15 | @property 16 | def project_id(self): 17 | return os.environ["DOP_PROJECT_ID"] 18 | 19 | @property 20 | def dag_path(self): 21 | return os.path.sep.join( 22 | [self.service_project_path, "embedded_dop", "source", "dags"] 23 | ) 24 | 25 | @property 26 | def location(self): 27 | return os.environ["DOP_LOCATION"] 28 | 29 | @property 30 | def orchestration_path(self): 31 | return os.path.sep.join( 32 | [self.service_project_path, "embedded_dop", "orchestration"] 33 | ) 34 | 35 | @property 36 | def is_sandbox_environment(self): 37 | return bool(os.environ.get("DOP_SANDBOX_ENVIRONMENT", False)) 38 | 39 | @property 40 | def service_project_path(self): 41 | return os.environ["DOP_SERVICE_PROJECT_PATH"] 42 | 43 | @property 44 | def infra_project_id(self): 45 | return os.environ["DOP_INFRA_PROJECT_ID"] 46 | 47 | @property 48 | def gcr_pull_secret_name(self): 49 | return os.environ.get("DOP_GCR_PULL_SECRET_NAME", None) 50 | 51 | @property 52 | def dbt_projects_path(self): 53 | """ 54 | An Alias of service_project_path because it is also the DBT project paths by convention. 
55 | This may however be changed to have its own environment variable in the future if it is required to 56 | differentiate the DBT projects setup from the service project path itself 57 | :return: 58 | """ 59 | return self.service_project_path 60 | 61 | 62 | env_config = EnvConfig() 63 | -------------------------------------------------------------------------------- /dags/dop/component/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/dags/dop/component/helper/__init__.py -------------------------------------------------------------------------------- /dags/dop/component/helper/dbt_init.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from dop.component.helper import dbt_profile 6 | 7 | if __name__ == "__main__": 8 | logging.getLogger().setLevel(logging.INFO) 9 | parser = argparse.ArgumentParser(description="Process arguments") 10 | parser.add_argument( 11 | "--tmp_dir", 12 | required=True, 13 | type=str, 14 | help="the TMP DIR where the python virtual environment is created", 15 | ) 16 | parser.add_argument( 17 | "--project_name", required=True, type=str, help="DBT project name" 18 | ) 19 | 20 | args = parser.parse_args() 21 | 22 | os.mkdir(os.path.sep.join([args.tmp_dir, ".dbt"])) 23 | 24 | logging.info(f"Creating DBT profiles.yml in {args.tmp_dir}") 25 | dbt_profile.setup_and_save_profiles( 26 | project_name=args.project_name, profile_path=args.tmp_dir 27 | ) 28 | 29 | logging.info("Done.") 30 | -------------------------------------------------------------------------------- /dags/dop/component/helper/dbt_profile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | from dop.component.transformation.common.parser.yaml_parser import ( 5 | yaml_to_dict, 6 | dict_to_yaml, 7 | ) 8 | from dop.component.configuration.env import env_config 9 | from dop.component.configuration import env 10 | 11 | 12 | def setup_profiles(project_name): 13 | project_id = env_config.project_id 14 | location = env_config.location 15 | # the profile is generated dynamically at runtime, therefore multiple target profiles are not required 16 | target = "all" 17 | target_type = "bigquery" 18 | path_to_dbt_project_yml = os.path.sep.join( 19 | [env_config.dbt_projects_path, project_name, "dbt_project.yml"] 20 | ) 21 | 22 | bq_profile = {} 23 | 24 | if not os.path.isfile(path_to_dbt_project_yml): 25 | raise RuntimeError( 26 | f"Profile yaml `{path_to_dbt_project_yml}` does not exist, " 27 | f"was the DBT option `project` in the orchestration config set correctly?" 28 | ) 29 | 30 | with open(path_to_dbt_project_yml) as fp: 31 | profile_id = yaml_to_dict(fp.read())["profile"] 32 | bq_profile[profile_id] = { 33 | "target": target, 34 | "outputs": { 35 | target: { 36 | "type": target_type, 37 | "method": "oauth", 38 | "project": project_id, 39 | "schema": str(project_name).replace("-", "_"), 40 | # TODO: Is this the right default, should `schema` be passed in via an environment variable? 
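                    # Using method "oauth" together with impersonate_service_account
                    # (below) means dbt runs as the dop-dbt-user service account
                    # without needing a downloaded service account key.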
41 | "threads": 1, 42 | "timeout_seconds": 300, 43 | "location": location, 44 | "priority": "interactive", 45 | "impersonate_service_account": f"{env.DOP_DBT_USER}@{project_id}.iam.gserviceaccount.com", 46 | } 47 | }, 48 | } 49 | 50 | profile = dict_to_yaml(bq_profile) 51 | logging.info(f"DBT Profile: {profile}") 52 | 53 | return profile 54 | 55 | 56 | def setup_and_save_profiles(project_name, profile_path): 57 | profile_yml = setup_profiles(project_name=project_name) 58 | path_to_profile_yml = os.path.sep.join([profile_path, ".dbt", "profiles.yml"]) 59 | logging.info(f"Updating: {path_to_profile_yml}") 60 | with open(path_to_profile_yml, "w+") as fp: 61 | fp.write(profile_yml) 62 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/adapter/model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass(frozen=True) 5 | class Argument: 6 | name: str 7 | type: str 8 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/adapter/relation.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | class RelationValueError(ValueError): 5 | pass 6 | 7 | 8 | @dataclass(frozen=True) 9 | class BaseRelation: 10 | database: str 11 | schema: str 12 | identifier: str 13 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/adapter/schema.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import copy 3 | 4 | from dataclasses import dataclass 5 | from croniter import croniter 6 | from typing import List, Optional 7 | from decimal import Decimal 8 | 9 | from marshmallow import validate, post_load, Schema, fields 10 | 11 | TASK_KIND_MATERI = "materialization" 12 | TASK_KIND_ASSERT = "assertion" 13 | TASK_KIND_INVOKE = "invocation" 14 | TASK_KIND_AIRFLOW_OPERATOR = "airflow_operator" 15 | TASK_KIND_DBT = "dbt" 16 | 17 | NATIVE_TASK_KIND = [TASK_KIND_MATERI, TASK_KIND_ASSERT, TASK_KIND_INVOKE] 18 | CUSTOM_TASK_KIND = [TASK_KIND_DBT] 19 | 20 | 21 | def dbt_argument_validation_mapper(option, value): 22 | allowed_options = [ 23 | "-m", 24 | "-x", 25 | "--fail-fast", 26 | "--threads", 27 | "--exclude", 28 | "--full-refresh", 29 | "--bucket", 30 | "--bucket-path", 31 | ] 32 | if option not in allowed_options: 33 | raise DbtTaskException( 34 | f"Supported DBT command line argument options are: {allowed_options}, `{option}` supplied" 35 | ) 36 | 37 | 38 | def dbt_validation_func(task): 39 | allowed_options = ["run", "test", "docs generate"] 40 | if task.kind.target not in allowed_options: 41 | raise DbtTaskException( 42 | f"DBT task.kind.target must be one of {allowed_options}, `{task.kind.target}` supplied" 43 | ) 44 | 45 | # check version 46 | dbt_version = task.options.get("version") 47 | if not dbt_version: 48 | raise DbtTaskException("DBT version must be supplied in the configuration") 49 | 50 | v_major, v_minor, v_patch = dbt_version.split(".") 51 | if int(v_major) < 0 or Decimal(f"{v_minor}.{v_patch}") < Decimal("19.1"): 52 | raise DbtTaskException( 53 | f"DBT version must be >= 0.19.1, {dbt_version} is supplied" 54 | ) 55 | 56 | # check DBT arguments, only allow certain arguments to be used 57 | arguments = task.options.get("arguments") 58 | 59 | if arguments: 60 | for argument in 
arguments: 61 | dbt_argument_validation_mapper( 62 | option=argument.get("option"), value=argument.get("value") 63 | ) 64 | 65 | 66 | def materialization_validation_func(task): 67 | allowed_options = ["table", "view", "udf", "stored_procedure", "schema"] 68 | if task.kind.target not in allowed_options: 69 | raise MaterializationTaskException( 70 | f"Materialization task.kind.target must be one of {allowed_options}, `{task.kind.target}`supplied" 71 | ) 72 | 73 | 74 | def assertion_validation_func(task): 75 | allowed_options = ["assertion", "assertion_sensor"] 76 | if task.kind.target not in allowed_options: 77 | raise AssertionTaskException( 78 | f"Assertion task.kind.target must be one of {allowed_options}, {task.kind.target} supplied" 79 | ) 80 | 81 | 82 | def invocation_validation_func(task): 83 | allowed_options = ["stored_procedure"] 84 | if task.kind.target not in allowed_options: 85 | raise InvocationTaskException( 86 | f"Invocation task.kind.target must be one of {allowed_options}, {task.kind.target} supplied" 87 | ) 88 | 89 | 90 | def data_validation_mapper(task): 91 | if task.kind.action == TASK_KIND_DBT: 92 | return dbt_validation_func 93 | elif task.kind.action == TASK_KIND_MATERI: 94 | return materialization_validation_func 95 | elif task.kind.action == TASK_KIND_ASSERT: 96 | return assertion_validation_func 97 | elif task.kind.action == TASK_KIND_INVOKE: 98 | return invocation_validation_func 99 | 100 | return None 101 | 102 | 103 | class InvalidDagConfig(ValueError): 104 | pass 105 | 106 | 107 | class DbtTaskException(InvalidDagConfig): 108 | pass 109 | 110 | 111 | class MaterializationTaskException(InvalidDagConfig): 112 | pass 113 | 114 | 115 | class AssertionTaskException(InvalidDagConfig): 116 | pass 117 | 118 | 119 | class InvocationTaskException(InvalidDagConfig): 120 | pass 121 | 122 | 123 | class IsValidCron(validate.Validator): 124 | default_message = "Not a valid Cron Expression" 125 | 126 | def __call__(self, value) -> typing.Any: 127 | message = ( 128 | f"The schedule_interval expression `{value}` must be a valid CRON expression: " 129 | "validate it here https://crontab.guru/" 130 | ) 131 | if not croniter.is_valid(value): 132 | raise validate.ValidationError(message) 133 | 134 | return value 135 | 136 | 137 | @dataclass 138 | class Partitioning: 139 | field: str 140 | data_type: str 141 | 142 | 143 | class PartitioningSchema(Schema): 144 | field = fields.String(validate=validate.OneOf(["date"])) 145 | data_type = fields.String( 146 | validate=validate.OneOf(["timestamp", "datetime", "date"]) 147 | ) 148 | 149 | 150 | @dataclass 151 | class Kind: 152 | action: str 153 | target: str 154 | 155 | 156 | class KindSchema(Schema): 157 | action = fields.String( 158 | validate=validate.OneOf( 159 | [ 160 | TASK_KIND_MATERI, 161 | TASK_KIND_ASSERT, 162 | TASK_KIND_INVOKE, 163 | TASK_KIND_DBT, 164 | TASK_KIND_AIRFLOW_OPERATOR, 165 | ] 166 | ) 167 | ) 168 | target = fields.String() 169 | 170 | 171 | @dataclass 172 | class Task: 173 | kind: Kind 174 | database: Optional[str] 175 | schema: Optional[str] 176 | identifier: str 177 | partitioning: Optional[Partitioning] 178 | dependencies: List[str] 179 | options: dict 180 | 181 | 182 | class TaskSchema(Schema): 183 | kind = fields.Nested(KindSchema, required=True) 184 | database = fields.String(required=False) 185 | schema = fields.String(required=False) 186 | identifier = fields.String(required=True) 187 | partitioning = fields.Nested(PartitioningSchema, required=False, missing=None) 188 | dependencies = 
fields.List(cls_or_instance=fields.Str(), required=False, missing=[]) 189 | options = fields.Dict(required=False, missing={}) 190 | 191 | 192 | @dataclass 193 | class DagConfig: 194 | enabled: bool 195 | timezone: str 196 | schedule_interval: str 197 | params: Optional[dict] 198 | database: str 199 | schema: str 200 | tasks: List[Task] 201 | 202 | 203 | class DagConfigSchema(Schema): 204 | enabled = fields.Bool(required=False, missing=True) 205 | timezone = fields.Str(required=True) 206 | schedule_interval = fields.Str(validate=IsValidCron(), missing=None) 207 | params = fields.Dict(missing={}) 208 | database = fields.Str(required=True) 209 | schema = fields.Str(required=True) 210 | tasks = fields.List(cls_or_instance=fields.Nested(TaskSchema), required=True) 211 | 212 | @post_load 213 | def make_dag_config(self, data, **kwargs): 214 | data_with_objects = copy.deepcopy(data) 215 | tasks = [] 216 | for task in data_with_objects["tasks"]: 217 | task["kind"] = Kind(**task["kind"]) 218 | task["database"] = ( 219 | task["database"] 220 | if "database" in task 221 | else data_with_objects["database"] 222 | ) 223 | task["schema"] = ( 224 | task["schema"] if "schema" in task else data_with_objects["schema"] 225 | ) 226 | task["partitioning"] = ( 227 | Partitioning(**task["partitioning"]) 228 | if task["partitioning"] is not None 229 | else task["partitioning"] 230 | ) 231 | 232 | task_entity = Task(**task) 233 | 234 | # Additional validation 235 | validation_func = data_validation_mapper(task=task_entity) 236 | if validation_func: 237 | validation_func(task_entity) 238 | 239 | tasks.append(task_entity) 240 | 241 | data_with_objects["tasks"] = tasks 242 | return DagConfig(**data_with_objects) 243 | 244 | 245 | def load_dag_schema(payload) -> DagConfig: 246 | schema = DagConfigSchema() 247 | result = schema.load(payload) 248 | 249 | if result.errors: 250 | raise InvalidDagConfig(result.errors) 251 | 252 | return result.data 253 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/parser/yaml_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import yaml 4 | 5 | # Use LibYAML bindings if installed, much faster than the pure Python version 6 | # See https://pyyaml.org/wiki/PyYAMLDocumentation for details 7 | try: 8 | from yaml import CLoader as Loader 9 | except ImportError: 10 | from yaml import Loader # type: ignore 11 | 12 | 13 | def dict_to_yaml(dct: dict) -> str: 14 | """ 15 | Convert a dictionary to a YAML serialised string 16 | 17 | :param dct: Dictionary to be converted 18 | :return: serialised dictionary in YAML format 19 | """ 20 | yml: dict = yaml.load(json.dumps(dct), Loader=Loader) 21 | return yaml.dump(yml) 22 | 23 | 24 | def yaml_to_dict(yml: str) -> dict: 25 | """ 26 | Convert a YAML serialised string to a dictionary 27 | 28 | Using unsafe_load to allow complex python types in YAML. 
29 | For example in an airflow sensor operator we can define as param 30 | execution_date: !!python/object/apply:datetime.timedelta [1] 31 | 32 | As we are defining the yaml and it cannot be modified by third parties, 33 | there is no risk of injection and execution of arbitrary code 34 | 35 | :param yml: YAML serialised string to be converted 36 | :return: Converted dictionary 37 | """ 38 | return yaml.unsafe_load(yml) 39 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/templating/jinja.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | from jinja2 import FileSystemLoader, Environment 5 | from dop import definitions 6 | 7 | 8 | def log(msg, info=False): 9 | if info: 10 | logging.info(msg) 11 | else: 12 | logging.debug(msg) 13 | return "" 14 | 15 | 16 | def raise_error(error): 17 | raise RuntimeError(error) 18 | 19 | 20 | class RunnerEnvironment: 21 | def __init__(self, runner): 22 | if runner not in definitions.SUPPORTED_TRANSFORMATION_RUNNERS: 23 | raise NotImplementedError( 24 | f'Transformation Runner "{runner}" is not supported' 25 | ) 26 | 27 | self._runner = runner 28 | 29 | def _get_runner_template_paths(self): 30 | common_template_path = os.path.join( 31 | definitions.ROOT_DIR, 32 | "component", 33 | "transformation", 34 | "common", 35 | "templating", 36 | "template", 37 | ) 38 | 39 | runner_base_path = os.path.join( 40 | definitions.ROOT_DIR, 41 | "component", 42 | "transformation", 43 | "runner", 44 | self._runner, 45 | "template", 46 | "macro", 47 | ) 48 | 49 | if not os.path.isdir(runner_base_path): 50 | raise RuntimeError(f"Path `{runner_base_path}` does not exist") 51 | 52 | if not os.path.isdir(common_template_path): 53 | raise RuntimeError(f"Path `{common_template_path}` does not exist") 54 | 55 | return [common_template_path, runner_base_path] 56 | 57 | def get_env(self): 58 | template_loader = FileSystemLoader(self._get_runner_template_paths()) 59 | runner_env = Environment(loader=template_loader, extensions=["jinja2.ext.do"]) 60 | 61 | runner_env.globals["log"] = log 62 | runner_env.globals["raise"] = raise_error 63 | 64 | return runner_env 65 | 66 | 67 | def get_runner_environment(runner): 68 | return RunnerEnvironment(runner=runner).get_env() 69 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/templating/template/global.sql: -------------------------------------------------------------------------------- 1 | {% macro is_incremental() -%} 2 | {% if not dag_run.conf or dag_run.conf.get('full_refresh') != true %} 3 | true 4 | {% endif %} 5 | {%- endmacro -%} 6 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/adapter/impl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import jinja2 3 | 4 | from typing import Optional, Dict, Any, List 5 | from google.cloud import bigquery 6 | from google.cloud.exceptions import GoogleCloudError, NotFound 7 | 8 | from dop.component.configuration import env 9 | from dop.component.transformation.common.parser import yaml_parser 10 | from dop.component.transformation.common.templating import jinja 11 | from dop.component.transformation.runner.bigquery.adapter.model import ( 12 | TableOptionsConfig, 13 | PartitionConfig, 14 | UDFArgument, 15 | StoredProcedureArgument, 16 | ) 17 
| from dop.component.transformation.runner.bigquery.adapter.relation import ( 18 | BigQueryRelation as Relation, 19 | RelationHelper, 20 | ) 21 | from dop.component.util import auth 22 | 23 | 24 | def get_query_job_config( 25 | destination, 26 | write_disposition=bigquery.WriteDisposition.WRITE_EMPTY, 27 | create_disposition=bigquery.CreateDisposition.CREATE_NEVER, 28 | ): 29 | job_config = bigquery.QueryJobConfig() 30 | job_config.destination = destination 31 | job_config.write_disposition = write_disposition 32 | job_config.create_disposition = create_disposition 33 | 34 | return job_config 35 | 36 | 37 | def execute_job_with_error_logging(job): 38 | if job.dry_run: 39 | logging.info( 40 | f"Total GB it will process: {job.total_bytes_processed / 1024 / 1024 / 1024}" 41 | ) 42 | return 43 | 44 | try: 45 | job.result() 46 | logging.info("Affected: {} rows".format(job.num_dml_affected_rows)) 47 | logging.info("Job completed...") 48 | except GoogleCloudError as e: 49 | logging.error(e) 50 | logging.error(job.error_result) 51 | logging.error(job.errors) 52 | raise e 53 | 54 | 55 | def get_database(): 56 | return env.env_config.project_id 57 | 58 | 59 | def get_bq_client(project_id, location, credentials): 60 | return bigquery.client.Client( 61 | project=project_id, location=location, credentials=credentials 62 | ) 63 | 64 | 65 | def get_query_runner(options: Optional[Dict[str, Any]]): 66 | dry_run = options.get("dry_run", True) 67 | project_id = options["project_id"] 68 | location = options["location"] 69 | credentials = None 70 | 71 | if not project_id: 72 | raise ValueError("BigQuery requires a project_id") 73 | 74 | if env.env_config.is_sandbox_environment: 75 | credentials = auth.ServiceAccountImpersonationCredentialManager( 76 | source_sa_name=env.DOP_DOCKER_USER, project_id=project_id 77 | ).get_target_credentials() 78 | logging.info( 79 | f"Using service account impersonation, and service account `{env.DOP_DOCKER_USER}` is now active" 80 | ) 81 | 82 | client = get_bq_client( 83 | project_id=project_id, location=location, credentials=credentials 84 | ) 85 | jinja_environment = jinja.get_runner_environment(runner="bigquery") 86 | return QueryRunner( 87 | client=client, jinja_environment=jinja_environment, dry_run=dry_run 88 | ) 89 | 90 | 91 | class QueryRunner: 92 | def __init__( 93 | self, 94 | client: bigquery.client.Client, 95 | jinja_environment: jinja2.Environment, 96 | dry_run=True, 97 | ): 98 | self._client = client 99 | self._jinja_environment = jinja_environment 100 | self._dry_run = dry_run 101 | self._relation_helper = RelationHelper(client=self._client) 102 | 103 | def write_append(self, query, relation): 104 | job_config = get_query_job_config( 105 | destination=relation, 106 | write_disposition=bigquery.WriteDisposition.WRITE_APPEND, 107 | ) 108 | 109 | logging.info(f"Appending data to {relation} using query: {query}") 110 | 111 | query_job = self._client.query(query=query, job_config=job_config) 112 | execute_job_with_error_logging(job=query_job) 113 | 114 | def replace_or_upsert( 115 | self, query: str, relation: Relation, options: Optional[Dict[str, Any]] = None 116 | ): 117 | """ 118 | TODO: this has quite a bit of duplication, needs tidying up 119 | This does a full replacement or an upsert to the target relation. 120 | The function has the following behaviour 121 | 1. When the target table does not exist, it creates the table using schema inferred by the query results 122 | 2. 
If the target table is already there, it writes into a temp table first and then merge to the target table so the process is atomic and will break if the schema is changed 123 | 124 | :param query: SQL Query 125 | :param relation: BigQuery relation to write truncate 126 | :param options: For options such as specifying partitions, forcing full refresh etc 127 | Range based Partition: 128 | options={'partition_key': 'key', 'partition_data_type': 'int64', 'partition_range': {'start': 0, 'end': 1000, 'interval': 100 } } 129 | 130 | Date based Partition: 131 | options={'partition_key': 'key', 'partition_data_type': 'datetime'} 132 | 133 | """ 134 | 135 | def query_cleaned(q): 136 | if q.strip()[:-1] == ";": 137 | q = q.strip()[:-1] 138 | if q.strip()[:-1] == ";": 139 | raise RuntimeError( 140 | f"Query {query} should not be followed by `;` at the very end" 141 | ) 142 | 143 | return q 144 | 145 | query = query_cleaned(q=query) 146 | 147 | options = {} if not options else options 148 | full_refresh = options.get("full_refresh", False) 149 | 150 | tmp_relation = Relation( 151 | database=relation.database, 152 | schema=relation.schema, 153 | identifier="_tmp_" + relation.identifier, 154 | ) 155 | 156 | template_create_or_replace = self._jinja_environment.from_string( 157 | """ 158 | {% import 'materialization/table_create_or_replace.sql' as materialise_table %} 159 | {{ materialise_table.create_or_replace(query, relation_helper, options) }} 160 | """ 161 | ) 162 | 163 | table_options_config = TableOptionsConfig( 164 | options={ 165 | "expiration_timestamp": "TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 12 hour)" 166 | } 167 | ) 168 | 169 | partition_config = PartitionConfig.create(options=options) 170 | 171 | rendered_create_or_replace = template_create_or_replace.render( 172 | query=query, 173 | relation_helper=self._relation_helper, 174 | options={ 175 | "relation": relation, 176 | "tmp_relation": tmp_relation, 177 | "table_options_config": table_options_config, 178 | "partition_config": partition_config, 179 | "full_refresh": full_refresh, 180 | }, 181 | ) 182 | 183 | logging.info("Running Query: {}".format(rendered_create_or_replace)) 184 | 185 | job_config = bigquery.QueryJobConfig(dry_run=self._dry_run) 186 | query_job = self._client.query( 187 | query=rendered_create_or_replace, job_config=job_config 188 | ) 189 | 190 | execute_job_with_error_logging(job=query_job) 191 | 192 | template_upsert = self._jinja_environment.from_string( 193 | """ 194 | {% import 'materialization/table_upsert.sql' as materialise_table %} 195 | {{ materialise_table.upsert(query, relation_helper, options) }} 196 | """ 197 | ) 198 | 199 | rendered_upsert = template_upsert.render( 200 | query=query, 201 | relation_helper=self._relation_helper, 202 | options={ 203 | "relation": relation, 204 | "tmp_relation": tmp_relation, 205 | "partition_config": partition_config, 206 | "full_refresh": full_refresh, 207 | }, 208 | ) 209 | 210 | logging.info("Running the Upsert Query: {}".format(rendered_upsert)) 211 | 212 | job_config = bigquery.QueryJobConfig(dry_run=self._dry_run) 213 | query_job = self._client.query(query=rendered_upsert, job_config=job_config) 214 | 215 | execute_job_with_error_logging(job=query_job) 216 | 217 | def recreate_view(self, query, relation: Relation): 218 | full_table_id = f"{relation.database}.{relation.schema}.{relation.identifier}" 219 | view = bigquery.table.Table(table_ref=full_table_id) 220 | view.view_query = query 221 | 222 | logging.info("Executing query: {}".format(query)) 223 | 224 | try: 
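            # Drop the existing view first (a missing view is tolerated via the
            # NotFound handler below), then recreate it with the new view_query.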
225 | self._client.delete_table(view) 226 | except NotFound: 227 | logging.info( 228 | "View {} not found, ignore the delete operation".format(relation) 229 | ) 230 | pass 231 | 232 | self._client.create_table(view) 233 | 234 | logging.info("View: {} has been created".format(relation)) 235 | 236 | def create_schema(self, project_id, dataset_id, exists_ok=True): 237 | # TODO: add support to dataset level TTL 238 | full_dataset_id = f"{project_id}.{dataset_id}" 239 | dataset = bigquery.dataset.Dataset(dataset_ref=full_dataset_id) 240 | 241 | self._client.create_dataset(dataset=dataset, exists_ok=exists_ok) 242 | logging.info( 243 | "New Dataset {} already exists or has been created".format(full_dataset_id) 244 | ) 245 | 246 | def recreate_udf(self, arguments: List[Dict], query, relation: Relation): 247 | if type(arguments) != list: 248 | raise TypeError( 249 | "arguments for UDF must be a list of entities with `name` and `type`. " 250 | "Please refer to the documentation for an example" 251 | ) 252 | 253 | query = self.render_udf_query( 254 | arguments=arguments, query=query, relation=relation 255 | ) 256 | logging.info("Creating the UDF using: {}".format(query)) 257 | 258 | job_config = bigquery.QueryJobConfig(dry_run=self._dry_run) 259 | query_job = self._client.query(query=query, job_config=job_config) 260 | 261 | execute_job_with_error_logging(job=query_job) 262 | 263 | logging.info("UDF: {} has been created".format(relation)) 264 | 265 | def recreate_stored_procedure(self, arguments: List[Dict], query, relation): 266 | if type(arguments) != list: 267 | raise TypeError( 268 | "arguments for Stored Procedure must be a list of entities with `name` and `type`. " 269 | "Please refer to the documentation for an example" 270 | ) 271 | 272 | query = self.render_stored_procedure_query( 273 | arguments=arguments, query=query, relation=relation 274 | ) 275 | logging.info(f"Creating the Stored Procedure using query: {query}") 276 | 277 | job_config = bigquery.QueryJobConfig() 278 | query_job = self._client.query(query=query, job_config=job_config) 279 | execute_job_with_error_logging(job=query_job) 280 | 281 | logging.info("Stored Procedure: {} has been created".format(relation)) 282 | 283 | @staticmethod 284 | def render_udf_query(arguments: List[Dict], query, relation): 285 | arguments_models = [ 286 | UDFArgument(name=argument["name"], type=argument["type"]) 287 | for argument in arguments 288 | ] 289 | parsed_arguments = ",".join( 290 | [f"{argument.name} {argument.type}" for argument in arguments_models] 291 | ) 292 | rendered_query = ( 293 | f""" 294 | CREATE OR REPLACE FUNCTION {relation}({parsed_arguments}) AS 295 | ( 296 | """ 297 | + query 298 | + """ 299 | ) 300 | """ 301 | ) 302 | 303 | return rendered_query 304 | 305 | @staticmethod 306 | def render_stored_procedure_query(arguments: List[Dict], query, relation): 307 | arguments_models = [ 308 | StoredProcedureArgument(name=argument["name"], type=argument["type"]) 309 | for argument in arguments 310 | ] 311 | parsed_arguments = ",".join( 312 | [f"{argument.name} {argument.type}" for argument in arguments_models] 313 | ) 314 | rendered_query = f""" 315 | CREATE OR REPLACE PROCEDURE {relation}({parsed_arguments}) 316 | BEGIN 317 | {query}; 318 | END; 319 | """ 320 | 321 | return rendered_query 322 | 323 | def assertion(self, query): 324 | def compile_assertion_results(rows): 325 | assertion_results = [] 326 | has_failure = False 327 | reserved_keys = ["success", "description"] 328 | for row in rows: 329 | assertion_result = { 330 | 
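                    # One result entry per returned row: "success" and "description"
                    # are reserved columns, every other selected column is reported
                    # under "other_asserted_values".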
"success": None, 331 | "description": None, 332 | "other_asserted_values": {}, 333 | } 334 | if not row["success"]: 335 | has_failure = True 336 | 337 | assertion_result["success"] = row["success"] 338 | assertion_result["description"] = row["description"] 339 | for key, value in row.items(): 340 | if key not in reserved_keys: 341 | assertion_result["other_asserted_values"][key] = value 342 | 343 | assertion_results.append(assertion_result) 344 | 345 | return {"has_failure": has_failure, "assertion_results": assertion_results} 346 | 347 | job_config = bigquery.QueryJobConfig() 348 | query_job = self._client.query(query=query, job_config=job_config) 349 | 350 | logging.info("Running assertion using query: {}".format(query)) 351 | 352 | try: 353 | results = compile_assertion_results(rows=query_job.result()) 354 | 355 | logging.info( 356 | "\n\n#### Assertion Report ####\n\n" 357 | + yaml_parser.dict_to_yaml(results["assertion_results"]) 358 | + "\n\n#### Assertion Report ####\n\n" 359 | ) 360 | 361 | if results["has_failure"]: 362 | raise AssertionError( 363 | 'Assertion failed, check the "ASSERTION RESULTS" section for more details' 364 | ) 365 | 366 | except GoogleCloudError as e: 367 | logging.error(e) 368 | logging.error(query_job.error_result) 369 | logging.error(query_job.errors) 370 | raise e 371 | 372 | def call_stored_procedure(self, query): 373 | job_config = bigquery.QueryJobConfig() 374 | 375 | sp = f""" 376 | BEGIN 377 | {query} 378 | END; 379 | """ 380 | logging.info(f"Calling Stored Procedure(s) :{query}") 381 | 382 | query_job = self._client.query(query=sp, job_config=job_config) 383 | execute_job_with_error_logging(job=query_job) 384 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/adapter/model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional, Dict, Any 3 | 4 | from dop.component.transformation.common.adapter.model import Argument 5 | 6 | 7 | @dataclass 8 | class PartitionConfig: 9 | field: str 10 | data_type: str = "date" 11 | range: Optional[Dict[str, Any]] = None 12 | 13 | def render(self, alias: Optional[str] = None): 14 | column: str = self.field 15 | if alias: 16 | column = f"{alias}.{self.field}" 17 | 18 | if self.data_type in ("timestamp", "datetime"): 19 | return f"date({column})" 20 | else: 21 | return column 22 | 23 | @staticmethod 24 | def create(options): 25 | if not options.get("partition_key") or not options.get("partition_data_type"): 26 | return None 27 | elif options.get("partition_data_type") == "int64": 28 | if ( 29 | not options.get("partition_key") 30 | or not options.get("partition_range") 31 | or not options.get("partition_range").get("start") 32 | or not options.get("partition_range").get("end") 33 | or not options.get("partition_range").get("interval") 34 | ): 35 | raise RuntimeError(f"Invalid partition options: {options}") 36 | elif options.get("partition_data_type") in ["date", "timestamp", "datetime"]: 37 | if not options.get("partition_key"): 38 | raise RuntimeError(f"Invalid partition options: {options}") 39 | else: 40 | raise NotImplementedError( 41 | f'Partition data type: {options.get("partition_data_type")} is not supported' 42 | ) 43 | 44 | return PartitionConfig( 45 | field=options.get("partition_key"), 46 | data_type=options.get("partition_data_type"), 47 | range=options.get("partition_range"), 48 | ) 49 | 50 | 51 | @dataclass 52 | class 
TableOptionsConfig: 53 | options: Optional[Dict[str, Any]] = None 54 | 55 | 56 | @dataclass(frozen=True) 57 | class UDFArgument(Argument): 58 | pass 59 | 60 | 61 | @dataclass(frozen=True) 62 | class StoredProcedureArgument(Argument): 63 | pass 64 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/adapter/relation.py: -------------------------------------------------------------------------------- 1 | from dop.component.transformation.common.adapter.relation import ( 2 | BaseRelation, 3 | RelationValueError, 4 | ) 5 | 6 | from google.cloud import bigquery 7 | from dataclasses import dataclass 8 | 9 | from dop.component.transformation.runner.bigquery.adapter.model import PartitionConfig 10 | 11 | 12 | @dataclass(frozen=True) 13 | class BigQueryRelation(BaseRelation): 14 | database: str 15 | schema: str 16 | identifier: str 17 | 18 | def __post_init__(self): 19 | if ( 20 | not self.database 21 | or not self.schema 22 | or not self.identifier 23 | or any( 24 | [len(self.database) < 1, len(self.schema) < 1, len(self.identifier) < 1] 25 | ) 26 | ): 27 | raise RelationValueError( 28 | f"database: `{self.database}`, schema: `{self.schema}` " 29 | f"and identifier: `{self.identifier}` must not be empty" 30 | ) 31 | 32 | def __repr__(self): 33 | return f"`{self.database}.{self.schema}.{self.identifier}`" 34 | 35 | 36 | class RelationHelper: 37 | def __init__(self, client: bigquery.client.Client): 38 | self._client = client 39 | 40 | def check_relation_exists(self, relation: BaseRelation): 41 | query = f""" 42 | SELECT * FROM {relation.database}.{relation.schema}.INFORMATION_SCHEMA.TABLES 43 | WHERE table_name = '{relation.identifier}'; 44 | """ 45 | 46 | for _ in self._client.query(query=query): 47 | return True 48 | 49 | return False 50 | 51 | def has_same_partition_definition( 52 | self, partition_config: PartitionConfig, relation: BaseRelation 53 | ): 54 | existing_partition_definition = self.partition_definition(relation=relation) 55 | 56 | if not partition_config or not existing_partition_definition: 57 | return True 58 | elif ( 59 | partition_config.field == existing_partition_definition["column_name"] 60 | and partition_config.data_type == existing_partition_definition["data_type"] 61 | ): 62 | return True 63 | else: 64 | return False 65 | 66 | def partition_definition(self, relation: BaseRelation): 67 | query = f""" 68 | SELECT column_name, data_type FROM {relation.database}.{relation.schema}.INFORMATION_SCHEMA.COLUMNS 69 | WHERE table_name = '{relation.identifier}' AND is_partitioning_column='YES' LIMIT 1; 70 | """ 71 | 72 | for row in self._client.query(query=query): 73 | return {"column_name": row["column_name"], "data_type": row["data_type"]} 74 | 75 | return None 76 | 77 | def check_if_schemas_match( 78 | self, tmp_relation: BaseRelation, relation: BaseRelation 79 | ): 80 | query_template = """ 81 | SELECT CONCAT(coalesce(column_name,''),coalesce(is_nullable,''),coalesce(data_type,''), coalesce(is_partitioning_column, '')) as col_schema_idendifer 82 | FROM {r.database}.{r.schema}.INFORMATION_SCHEMA.COLUMNS 83 | WHERE table_name = '{r.identifier}' ORDER BY ordinal_position; 84 | """ 85 | tmp_schema = [ 86 | x["col_schema_idendifer"] 87 | for x in self._client.query(query=query_template.format(r=tmp_relation)) 88 | ] 89 | target_schema = [ 90 | x["col_schema_idendifer"] 91 | for x in self._client.query(query=query_template.format(r=relation)) 92 | ] 93 | 94 | if tmp_schema == target_schema: 95 | return True 
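        # Any difference in column name, nullability, data type, partitioning flag
        # or column order means the two schemas are not compatible.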
96 | return False 97 | 98 | def get_columns_of_relation(self, relation: BaseRelation): 99 | query = f""" 100 | SELECT column_name FROM {relation.database}.{relation.schema}.INFORMATION_SCHEMA.COLUMNS 101 | WHERE table_name = '{relation.identifier}' ORDER BY ordinal_position; 102 | """ 103 | 104 | return [x["column_name"] for x in self._client.query(query=query)] 105 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/template/macro/adapter.sql: -------------------------------------------------------------------------------- 1 | {% macro partition_by(partition_config) -%} 2 | {%- if partition_config is none -%} 3 | {% do return('') %} 4 | {%- elif partition_config.data_type | lower in ('date','timestamp','datetime') -%} 5 | partition by {{ partition_config.render() }} 6 | {%- elif partition_config.data_type | lower in ('int64') -%} 7 | {%- set range = partition_config.range -%} 8 | partition by range_bucket( 9 | {{ partition_config.field }}, 10 | generate_array({{ range.start}}, {{ range.end }}, {{ range.interval }}) 11 | ) 12 | {%- endif -%} 13 | {%- endmacro -%} 14 | 15 | {% macro table_options(table_options_config) %} 16 | OPTIONS({% for opt_key, opt_val in table_options_config.options.items() %} 17 | {{ opt_key }}={{ opt_val }}{{ "," if not loop.last }} 18 | {% endfor %}) 19 | {%- endmacro -%} 20 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/template/macro/materialization/table_create_or_replace.sql: -------------------------------------------------------------------------------- 1 | {% import 'adapter.sql' as adapter %} 2 | 3 | {% macro create_or_replace(query, relation_helper, options) %} 4 | {%- set relation = options['relation'] -%} 5 | {%- set tmp_relation = options['tmp_relation'] -%} 6 | {%- set relation_exists = relation_helper.check_relation_exists(relation) -%} 7 | {%- set table_options_config = options['table_options_config'] -%} 8 | {%- set partition_config = options['partition_config'] -%} 9 | {%- set full_refresh = options['full_refresh'] -%} 10 | 11 | {# -- Always try to drop the tmp table #} 12 | drop table if exists {{ tmp_relation }}; 13 | 14 | {# -- Drop table first if partition definition is different and it's a full refresh #} 15 | {%- if full_refresh and relation_helper.has_same_partition_definition(partition_config, relation) -%} 16 | drop table if exists {{ relation }}; 17 | {%- endif -%} 18 | 19 | {# -- Create / Replace table #} 20 | create or replace table 21 | {%- if relation_exists and not full_refresh -%} 22 | {{ tmp_relation }} 23 | {%- else -%} 24 | {{ relation }} 25 | {%- endif -%} 26 | 27 | {# -- Partition Block #} 28 | {%- if partition_config is not none -%} 29 | {{ space }} {{ adapter.partition_by(partition_config) }} 30 | {%- endif -%} 31 | 32 | {# -- Table Options Block #} 33 | {%- if table_options_config is not none -%} 34 | {{ space }} {{ adapter.table_options(table_options_config) }} 35 | {%- endif -%} 36 | 37 | {# -- Main query block #} 38 | AS ( 39 | {{ query }} 40 | ); 41 | {% endmacro %} 42 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/template/macro/materialization/table_upsert.sql: -------------------------------------------------------------------------------- 1 | {% import 'adapter.sql' as adapter %} 2 | 3 | {% macro upsert(query, relation_helper, options) %} 4 | {%- set relation = 
options['relation'] -%} 5 | {%- set tmp_relation = options['tmp_relation'] -%} 6 | {%- set relation_exists = relation_helper.check_relation_exists(relation) -%} 7 | {%- set tmp_relation_exists = relation_helper.check_relation_exists(tmp_relation) -%} 8 | {%- set columns_of_relation = relation_helper.get_columns_of_relation(relation) -%} 9 | {%- set partition_config = options['partition_config'] -%} 10 | {%- set full_refresh = options['full_refresh'] -%} 11 | {%- set incremental_time_partitioned = partition_config.data_type | lower in ('date','timestamp','datetime') -%} 12 | 13 | {# -- Only run the merge query if tmp relation is produced and we already have an existing relation. This is not applicable for full refresh #} 14 | {%- if tmp_relation_exists and relation_exists and not full_refresh -%} 15 | {%- if not tmp_relation_exists -%} 16 | {{ raise('`{{ tmp_relation }}` must exist before an upsert can be done') }} 17 | {%- endif -%} 18 | 19 | {%- if not relation_helper.check_if_schemas_match(tmp_relation, relation) -%} 20 | {{ raise('Schema is backwards incompatible, when making schema changes, a full refresh is required') }} 21 | {%- endif -%} 22 | 23 | {# -- For time based partition only, workout the partitions to be replaced #} 24 | {%- if incremental_time_partitioned -%} 25 | declare dbt_partitions_for_replacement array; 26 | set (dbt_partitions_for_replacement) = ( 27 | select as struct 28 | array_agg(distinct {{ partition_config.render() }}) 29 | from {{ tmp_relation }} 30 | ); 31 | {%- endif -%} 32 | 33 | merge into {{ relation }} as target_relation 34 | using ( 35 | select * from {{ tmp_relation }} 36 | ) as tmp_relation 37 | on FALSE 38 | 39 | when not matched by source 40 | {%- if incremental_time_partitioned -%} 41 | {{ space }} and {{ partition_config.render(alias='target_relation') }} in unnest(dbt_partitions_for_replacement) 42 | {%- endif -%} 43 | {{ space }} then delete 44 | 45 | when not matched then insert 46 | ({% for col in columns_of_relation %}`{{ col }}`{{ "," if not loop.last }}{% endfor %}) 47 | VALUES ({% for col in columns_of_relation %}`{{ col }}`{{ "," if not loop.last }}{% endfor %}); 48 | 49 | {%- endif -%} 50 | {# -- Always try to drop the tmp table #} 51 | drop table if exists {{ tmp_relation }}; 52 | {% endmacro %} 53 | -------------------------------------------------------------------------------- /dags/dop/component/util/auth.py: -------------------------------------------------------------------------------- 1 | from google.auth import impersonated_credentials, default 2 | 3 | 4 | class ServiceAccountImpersonationCredentialManager: 5 | target_scopes = ["https://www.googleapis.com/auth/cloud-platform"] 6 | 7 | def __init__(self, source_sa_name, project_id): 8 | self._source_sa_name = source_sa_name 9 | self._project_id = project_id 10 | 11 | def get_target_credentials(self): 12 | impersonated_sa = ( 13 | f"{self._source_sa_name}@{self._project_id}.iam.gserviceaccount.com" 14 | ) 15 | source_credentials, _ = default() 16 | 17 | target_credentials = impersonated_credentials.Credentials( 18 | source_credentials=source_credentials, 19 | target_principal=impersonated_sa, 20 | target_scopes=self.target_scopes, 21 | delegates=[], 22 | lifetime=500, 23 | ) 24 | 25 | return target_credentials 26 | -------------------------------------------------------------------------------- /dags/dop/component/util/secret_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from google.cloud import 
secretmanager 4 | 5 | 6 | def secret_client(credentials): 7 | return secretmanager.SecretManagerServiceClient(credentials=credentials) 8 | 9 | 10 | def access_secret(project_id, secret_id, credentials, version="latest"): 11 | # Build the resource name of the secret version. 12 | client = secret_client(credentials=credentials) 13 | name = client.secret_version_path(project_id, secret_id, version) 14 | # Access the secret version. 15 | response = client.access_secret_version(name) 16 | 17 | payload = response.payload.data.decode("UTF-8") 18 | logging.info("Accessing secret version of {}".format(response.name)) 19 | 20 | return payload 21 | -------------------------------------------------------------------------------- /dags/dop/definitions.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 4 | SUPPORTED_TRANSFORMATION_RUNNERS = ["bigquery"] 5 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | postgres: 4 | image: postgres:13.2-alpine 5 | container_name: dop_postgres 6 | restart: always 7 | environment: 8 | - POSTGRES_USER=airflow 9 | - POSTGRES_PASSWORD=airflow 10 | - POSTGRES_DB=airflow 11 | logging: 12 | options: 13 | max-size: 10m 14 | max-file: "3" 15 | 16 | webserver: 17 | build: infrastructure/docker/composer_${AIRFLOW_VERSION} 18 | container_name: dop_webserver 19 | restart: always 20 | depends_on: 21 | - postgres 22 | environment: 23 | - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow 24 | - AIRFLOW__CORE__EXECUTOR=LocalExecutor 25 | - AIRFLOW__CORE__LOGGING_LEVEL=INFO 26 | - GOOGLE_APPLICATION_CREDENTIALS=/secret/gcp-credentials/application_default_credentials.json 27 | - DOP_SANDBOX_ENVIRONMENT=true # set to true if running locally on a laptop, this enables certain features such as service account impersonation 28 | - DOP_DEVELOPER_MODE=true # enable developer mode so that WIP code can be used in docker 29 | 30 | # The following environment environment variables need to be set on both the docker local environment as well as the composer environment 31 | - DOP_SERVICE_PROJECT_PATH=/opt/airflow/dags/dop_service_project # The absolute directory of the service project path. Each DBT project in this path should be within their folder and must be valid. I.e. on Docker, this could be /opt/airflow/dags/dop/dbt-projects. 
On Composer this could be anywhere under the `/home/airflow/gcs/dags` or `/home/airflow/gcs/data` directory 32 | - DOP_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP project_id - to be used as the project where data will be consumed & persisted 33 | - DOP_LOCATION=${LOCATION?location_is_undefined} # GCP region - to be used to persist all data 34 | - DOP_INFRA_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP infrastructure project id, for local development this isn't used so leaving it as the same as gcp service project id 35 | logging: 36 | options: 37 | max-size: 10m 38 | max-file: "3" 39 | volumes: 40 | - ./examples/service_project:/opt/airflow/dags/dop_service_project 41 | - ./dags:/opt/airflow/dags 42 | - ./infrastructure/docker/developer_only/.airflowignore:/opt/airflow/dags/.airflowignore 43 | - ~/.config/gcloud/application_default_credentials.json:/secret/gcp-credentials/application_default_credentials.json # mount application default credentials only so no keys as used 44 | ports: 45 | - "8082:8080" 46 | command: webserver 47 | healthcheck: 48 | test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"] 49 | interval: 30s 50 | timeout: 30s 51 | retries: 3 52 | -------------------------------------------------------------------------------- /docs/a_typical_dop_orchestration_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/a_typical_dop_orchestration_flow.png -------------------------------------------------------------------------------- /docs/dop_docker_account_impersonation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/dop_docker_account_impersonation.png -------------------------------------------------------------------------------- /docs/dop_service_project_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/dop_service_project_architecture.png -------------------------------------------------------------------------------- /docs/example_dag_with_dbt_running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/example_dag_with_dbt_running.png -------------------------------------------------------------------------------- /docs/grant_service_account_user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/grant_service_account_user.png -------------------------------------------------------------------------------- /docs/local_airflow_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/local_airflow_ui.png -------------------------------------------------------------------------------- /docs/set-variables-ide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/set-variables-ide.png 
-------------------------------------------------------------------------------- /docs/trigger_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/trigger_dag.png -------------------------------------------------------------------------------- /docs/trigger_full_refresh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/trigger_full_refresh.png -------------------------------------------------------------------------------- /examples/service_project/.gcloudignore: -------------------------------------------------------------------------------- 1 | # Ignore the following when packaging & deploying using rsync 2 | docker-compose-dop.yml 3 | Makefile 4 | README.md 5 | .git 6 | .gitignore 7 | .gcloudignore 8 | **/__pycache__ 9 | embedded_dop/source/.git 10 | embedded_dop/source/docs 11 | embedded_dop/source/examples 12 | embedded_dop/source/tests 13 | embedded_dop/source/.gitignore 14 | embedded_dop/source/README.md 15 | embedded_dop/source/LICENSE.md 16 | dbt_start/logs 17 | dbt_start/target 18 | -------------------------------------------------------------------------------- /examples/service_project/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore the DOP source code, this is pulled in dynamically from the DOP source repository 2 | embedded_dop/source 3 | -------------------------------------------------------------------------------- /examples/service_project/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build down up up-follow logs exec restart 2 | 3 | include .env 4 | export 5 | 6 | DOP_GIT_SSH_REPO_PATH := #{REPLACE WITH THE SSH GIT REPO PATH OF DOP} 7 | DOP_PROJECT_ID := #{REPLACE WITH A GCP PROJECT ID WHERE DOP WILL EXECUTE ALL JOBS} 8 | DOP_LOCATION := #{REPLACE WITH A GCP REGION WHERE DATA WILL BE PERSISTED BY DOP} 9 | DOP_ARTIFACTS_BUCKET := #{REPLACE WITH ARTIFACT BUCKET NAME} 10 | DOP_INFRA_PROJECT_ID := #{REPLACE WITH THE GCP INFRASTRUCTURE PROJECT ID WHERE BUILD ARTIFACTS ARE STORED} 11 | 12 | REPO_BASE_NAME := $(shell basename `git rev-parse --show-toplevel`) 13 | BRANCH := $(shell git rev-parse --abbrev-ref HEAD) 14 | HASH := $(shell git rev-parse HEAD | head -c7) 15 | SERVICE_PROJECT_ABS_PATH := $(shell pwd) 16 | DATETIME := $(shell date '+%Y%m%d-%H%M%S') 17 | 18 | LOCAL_DOP_EMBEDDED_SOURCE_PATH := ./embedded_dop/source 19 | DOP_TAG_NAME := master # Designed to be overwritten if a TAG or a different branch needs to be used for deployment 20 | AIRFLOW_VERSION := 1.10.10 21 | 22 | 23 | ENVS := PROJECT_ID=$(DOP_PROJECT_ID) \ 24 | LOCATION=$(DOP_LOCATION) \ 25 | SERVICE_PROJECT_ABS_PATH=$(SERVICE_PROJECT_ABS_PATH) 26 | 27 | validate: 28 | if [ -z ${DOP_GIT_SSH_REPO_PATH} ]; then \ 29 | echo "DOP_GIT_SSH_REPO_PATH must be defined. Aborting";\ 30 | exit 1; \ 31 | elif [ -z ${DOP_PROJECT_ID} ]; then \ 32 | echo "DOP_PROJECT_ID must be defined. Aborting";\ 33 | exit 1; \ 34 | elif [ -z ${DOP_LOCATION} ]; then \ 35 | echo "DOP_LOCATION must be defined. Aborting";\ 36 | exit 1; \ 37 | elif [ -z ${DOP_ARTIFACTS_BUCKET} ]; then \ 38 | echo "DOP_ARTIFACTS_BUCKET must be defined. Aborting";\ 39 | exit 1; \ 40 | elif [ -z ${DOP_INFRA_PROJECT_ID} ]; then \ 41 | echo "DOP_INFRA_PROJECT_ID must be defined. 
Aborting";\ 42 | exit 1; \ 43 | fi 44 | 45 | validate-deploy: 46 | if [ -z ${DEPLOY_BUCKET_NAME} ]; then \ 47 | echo "DEPLOY_BUCKET_NAME must be defined. Aborting";\ 48 | exit 1; \ 49 | elif [ -z ${DOP_ARTIFACT_ID} ]; then \ 50 | echo "DOP_ARTIFACT_ID must be defined. Aborting";\ 51 | exit 1; \ 52 | fi 53 | 54 | clean: 55 | docker rm -f $(docker ps -a | grep dop_ | awk '{print $1}') 56 | 57 | git-checkout-dop: validate 58 | git clone $(DOP_GIT_SSH_REPO_PATH) $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) 2> /dev/null || git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) clean -fdx && git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) remote update && (git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) reset --hard origin/$(DOP_TAG_NAME) || git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) checkout $(DOP_TAG_NAME)) 59 | 60 | build: git-checkout-dop 61 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml up -d --build webserver 62 | 63 | down: validate 64 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml down 65 | 66 | up: git-checkout-dop 67 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml up -d 68 | 69 | up-follow: validate 70 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml up 71 | 72 | restart: 73 | make down && make up 74 | 75 | logs: 76 | docker logs dop_webserver -f 77 | 78 | exec: 79 | docker exec -it dop_webserver /bin/bash -c "source ./script/exec_entrypoint.sh; /bin/bash" 80 | 81 | executor-example-dbt-run: validate 82 | docker run --workdir "/home/dbtuser/dbt_start" --env DOP_PROJECT_ID=$(DOP_PROJECT_ID) --env DOP_LOCATION=$(DOP_LOCATION) dop-dbt:latest /bin/bash -c "pipenv run dbt run" 83 | 84 | build-dbt-image: 85 | gcloud builds submit \ 86 | --substitutions SHORT_SHA=$(HASH),_DATETIME=$(DATETIME) \ 87 | --config=$(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/cloudbuild/build-dbt.yaml \ 88 | --project=$(DOP_INFRA_PROJECT_ID) \ 89 | . 90 | 91 | build-artifact: git-checkout-dop build-dbt-image 92 | gcloud builds submit \ 93 | --substitutions SHORT_SHA=$(HASH),BRANCH_NAME=$(BRANCH),REPO_NAME=$(REPO_BASE_NAME),_CLOUDBUILD_ARTIFACTS_BUCKET_NAME=$(DOP_ARTIFACTS_BUCKET),_DATETIME=$(DATETIME) \ 94 | --config=$(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/cloudbuild/build.yaml \ 95 | --project=$(DOP_INFRA_PROJECT_ID) \ 96 | . 97 | 98 | deploy: validate validate-deploy 99 | gcloud builds submit \ 100 | --substitutions REPO_NAME=$(REPO_BASE_NAME),_DOP_ARTIFACT_ID=$(DOP_ARTIFACT_ID),_CLOUDBUILD_ARTIFACTS_BUCKET_NAME=$(DOP_ARTIFACTS_BUCKET),_DEPLOY_BUCKET_NAME=$(DEPLOY_BUCKET_NAME) \ 101 | --config=$(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/cloudbuild/deploy.yaml \ 102 | --project=$(DOP_INFRA_PROJECT_ID) \ 103 | . 
104 | -------------------------------------------------------------------------------- /examples/service_project/README.md: -------------------------------------------------------------------------------- 1 | Table of contents 2 | ================= 3 | * [DOP Service Project Architecture](#dop-service-project-architecture) 4 | * [Boilerplate structure explained](#boilerplate-structure-explained) 5 | * [DBT Projects](#dbt-projects) 6 | * [The embedded_dop directory](#the-embedded_dop-directory) 7 | * [executor_config](#executor_config) 8 | * [orchestration](#orchestration) 9 | * [source](#source) 10 | * [The Makefile](#the-makefile) 11 | * [Use DOP on Docker](#use-dop-on-docker) 12 | * [Deploy to Cloud Composer](#deploy-to-cloud-composer) 13 | * [Build Artifact](#build-artifact) 14 | * [Deploy](#deploy) 15 | * [Important steps to follow for deploying to an existing Composer Cluster](#important-steps-to-follow-for-deploying-to-an-existing-composer-cluster) 16 | * [DOP Orchestration - How To Use](#dop-orchestration---how-to-use) 17 | * [Create a new DOP Orchestration DAG](#create-a-new-dop-orchestration-dag) 18 | * [Task definitions](#task-definitions) 19 | * [Native Transformation - Materialization](#native-transformation---materialization) 20 | * [Native Transformation - Assertion](#native-transformation---assertion) 21 | * [Native Transformation - Invocation](#native-transformation---invocation) 22 | * [DBT Task](#dbt-task) 23 | * [Full Refresh](#full-refresh) 24 | 25 | The `service_project` directory can be used as a boilerplate to setup DOP on an existing GIT repository (the service project repository). 26 | You can copy and paste everything in this directory (including `.gcloudignore` and `.gitignore` as these are required for DOP to function correctly). 27 | If you already have a `Makefile` or any other conflicting files, you may need to move things around by merging those files or moving i.e. the `Makefile` into `embedded_dop` 28 | 29 | **Please note that this boilerplate is optimised for running DBT jobs inside DOP alongside native capabilities. If you don't use DBT, some automation may not work as expected.** 30 | 31 | ## DOP Service Project Architecture 32 | This explains how DOP functions and how it can be integrated into your existing git repositories 33 | ![service_project_architecture](../../docs/dop_service_project_architecture.png) 34 | 35 | ## Boilerplate structure explained 36 | 37 | ### DBT Projects 38 | Currently the setup is optimised to orchestrate and run DBT jobs on Google Cloud and this directory can be used as a template in your service project to quickly setup DOP with multiple DBT projects. 39 | 40 | For this to work, the service project repository must contain one or multiple DBT projects, each of them in their own folder. 41 | For example, 42 | ``` 43 | dbt_project_1/dbt_project.yaml 44 | dbt_project_1/... 45 | 46 | dbt_project_2/dbt_project.yaml 47 | dbt_project_2/... 48 | ``` 49 | 50 | ### The embedded_dop directory 51 | There are three main folders within this directory. 52 | 53 | #### executor_config 54 | This folder contains files required to build docker containers to be used on Cloud Composer and invoked via the Airflow K8 Pod Operator. 
55 | 56 | For example, 57 | - the `dbt/config.yaml` file contains instructions to tell the build process where to locate the DBT projects inside this repository 58 | - `Pipfile` and `Pipfile.lock` are used to maintain and lock Python dependencies so what's installed inside the docker container is always the same after each build 59 | 60 | Currently this is not used on the sandbox environment when running locally with Docker Compose, DBT is still installed on the fly using dynamically created Python Virtual environment but this may change in the future. 61 | 62 | #### orchestration 63 | This folder contains some example orchestration jobs and it shows how DOP can be used to orchestrate the flow between DBT jobs, native transformations and any executors added in the future. 64 | It's probably a good idea to look through these examples which will give you a good idea on how to orchestrate workload in DOP. 65 | 66 | #### source 67 | This is a directory reserved to store the DOP code, you won't see it in this repository because it's ignored by version control but when the build process runs, the DOP source code will be checked out to here. 68 | 69 | ### The Makefile 70 | The `Makefile` contains instructions to automate the whole initialization process for DOP including checking out the DOP repository as well as defining required variables. 71 | For the Makefile to work, placeholders (as defined in `#{}`) must be replaced with real values in the Makefile or overwritten via make command arguments. 72 | 73 | ## Use DOP on Docker 74 | You can now use DOP on your laptop (Linux / Mac only for now, Windows instructions is in the works) by following the instructions as below. 75 | 76 | Running it for the first time (this builds the docker image from scratch and may take a while, you can check where it got to with `make logs`) 77 | ``` 78 | make build 79 | ``` 80 | 81 | Once it's up and running, you can access the UI on 82 | ``` 83 | http://localhost:8082 84 | ``` 85 | 86 | Bring down the docker environment 87 | ``` 88 | make down 89 | ``` 90 | 91 | Subsequent runs to bring up the docker environment 92 | ``` 93 | make up 94 | ``` 95 | 96 | And to get into the docker container itself (useful for debugging), run 97 | ``` 98 | make exec 99 | ``` 100 | 101 | ## Deploy to Cloud Composer 102 | There is a light weight semi-automated deployment process built using Cloud Build, to deploy to an existing Composer Cluster, follow instructions as below. 103 | 104 | ### Build Artifact 105 | ``` 106 | make build-artifact 107 | ``` 108 | This will produce an artifact id pointed to the most recent build. 109 | 110 | By default this will build the artifact using the DOP `master` branch, if a different branch or tag is required, this can be overwritten by using `DOP_TAG_NAME`, i.e. `DOP_TAG_NAME=v0.1.0`. 111 | 112 | ``` 113 | make build-artifact DOP_TAG_NAME={} 114 | ``` 115 | 116 | It is **very important** to consider using a tag made on the DOP source repository for a Production deployment so that the DOP version won't accidentally change when making a service project deployment. 117 | 118 | 119 | ### Deploy 120 | ``` 121 | make deploy DEPLOY_BUCKET_NAME={} DOP_ARTIFACT_ID={} 122 | ``` 123 | `DEPLOY_BUCKET_NAME`: This is the bucket name for Cloud Composer i.e. 
`us-central1-dop-sandbox-us-xxxxxxxx-bucket` 124 | `DOP_ARTIFACT_ID`: Use the most recent artifact id produced by the `Build Artifact` step or any historical artifact ids to rollback 125 | 126 | #### Important steps to follow for deploying to an existing Composer Cluster 127 | If you are deploying DOP to an existing composer cluster which often already has other DAGs running, 128 | it is important to set some exclusions in your existing deployment process. 129 | 130 | A typical deployment to Cloud Composer involves doing a rsync to a GCS bucket, in order to make sure the DOP service project path is not removed in this process add the following exclusions in the rsync. 131 | ``` 132 | export SERVICE_PROJECT_NAME=dop_ && gsutil -m rsync -r -d -x "^$SERVICE_PROJECT_NAME" gs:///dags gs://$BUCKET_NAME/dags 133 | ``` 134 | 135 | ## DOP Orchestration - How To Use 136 | If you prefer to just give it a go without reading documentation, see the example DAG [embedded_dop/orchestration/example_covid19](embedded_dop/orchestration/example_covid19). 137 | Try playing around by changing the `config.yaml` file and see changes reflected in the Airflow GUI on your local environment 138 | 139 | ### Create a new DOP Orchestration DAG 140 | 1. Create a new folder under [embedded_dop/orchestration](embedded_dop/orchestration), the folder name will be the name of the DAG, with a `dop__` prefix. i.e. `dop__`. 141 | 1. Create a configuration file inside the new folder and call it `config.yaml`. The Airflow DAG is automatically generated by parsing the `config.yaml` file. The config file has the following structure 142 | ``` 143 | enabled: 144 | schedule_interval: 145 | timezone: 146 | schema: 147 | tasks: 148 | ``` 149 | ### Task definitions 150 | The following kinds of tasks are currently supported 151 | 152 | #### Native Transformation - Materialization 153 | This task kind is designed to persist structure or data. Supported options are 154 | ``` 155 | - identifier: 156 | kind: 157 | action: materialization 158 | target: 159 | dependencies: 160 | 161 | ``` 162 | Targets 163 | - schema (for BQ, this is creating a dataset): create a schema 164 | - udf (with dynamic arguments): create a UDF from a SQL script 165 | - table: create a table by materializing the output from a SQL script 166 | - view: create a view from a SQL script 167 | - stored_procedure (with dynamic arguments): create a stored procedure from a SQL script 168 | 169 | Features 170 | - Delta management using a date/timestamp partitioned column 171 | - Automatic schema inference by query results with schema backwards compatibility checks and stops the execution when schema is backwards incompatible 172 | - A full refresh can be triggered to do a full rebuild from sources 173 | 174 | For the Materialization task, `identifer` must match to a SQL file located in the `/sql` folder. 175 | To see a live example on how to configure each task, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 176 | 177 | #### Native Transformation - Assertion 178 | This task kind can be used to check data quality. 
177 | #### Native Transformation - Assertion 178 | This task kind can be used to check data quality. Supported options are 179 | ``` 180 | - identifier: 181 | kind: 182 | action: assertion 183 | target: assertion 184 | dependencies: 185 | 186 | ``` 187 | For the Assertion task, `identifier` must match a SQL file located in the `/sql` folder, and the `SELECT` statement of the assertion SQL must return at least two columns, as shown below, 188 | ``` 189 | SELECT 190 | AS success, 191 | AS description 192 | ``` 193 | The Airflow task will fail if `success` evaluates to `false`. 194 | 195 | To see a live example of how to configure each task, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 196 | 197 | #### Native Transformation - Invocation 198 | This task kind can be used to trigger something that has already been created, e.g. a stored procedure. Supported options are 199 | ``` 200 | - identifier: 201 | kind: 202 | action: invocation 203 | target: stored_procedure 204 | dependencies: 205 | 206 | ``` 207 | To see a live example of how to configure each task, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 208 | 209 | #### DBT Task 210 | This task kind can be used to trigger a DBT job. The DBT job runs in a Python virtual environment when executed locally, but runs in containers on Cloud Composer. 211 | You don't have to worry about this as a user because the user experience between the two environments is almost identical. 212 | ``` 213 | - identifier: 214 | kind: 215 | action: dbt 216 | target: 217 | options: 218 | project: 219 | version: 220 | arguments: 221 | - option: 222 | value: 223 | - ... 224 | dependencies: 225 | 226 | ``` 227 | Under `options` you may optionally specify arguments for a DBT job; this can be very useful for breaking a very large DBT job into smaller chunks, making it easier to maintain. 228 | 229 | Some of the ideas are 230 | - Make use of [tags](https://docs.getdbt.com/reference/resource-configs/tags). For example, use tags to split DBT tasks into logical groups and run them in separate steps. 231 | - Create a dedicated DAG for `full refresh` with tags so each DBT logical group can be refreshed separately, saving cost and making refreshes faster 232 | 233 | Keep in mind not to over-engineer the solution: only split the DBT job if it makes sense to do so and solves a real issue (e.g. rebuilding the whole thing costs too much / takes too long, or without tags the job is unmaintainable and it's very hard to identify areas of failure) 234 | 235 | To see a live example of how to configure this, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 236 | 237 | ### Full Refresh 238 | To run a full refresh (overwriting existing schema & data), you can pass in a JSON payload using the trigger DAG function in the Airflow GUI. 239 | 240 | Please note that whether tasks are native transformations or DBT jobs (or any other task with full refresh support), using the `{"full_refresh" : true}` flag will force a full refresh on all applicable tasks.
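If you prefer the command line to the GUI, the same payload can also be passed with the Airflow 1.10.x CLI; the DAG id below assumes the `example_covid19` folder, whose generated DAG would be named `dop__example_covid19`.
```
# equivalent to "Trigger DAG" in the GUI with a JSON configuration
airflow trigger_dag -c '{"full_refresh": true}' dop__example_covid19
```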
241 | 242 | ![Trigger DAG](../../docs/trigger_dag.png) 243 | 244 | ![Set DAG configuration options](../../docs/trigger_full_refresh.png) 245 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | venv 6 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/analysis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/analysis/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/data/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: 'dop_test' 5 | version: '1.0.0' 6 | config-version: 2 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_start' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `source-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | source-paths: ["models"] 15 | analysis-paths: ["analysis"] 16 | test-paths: ["tests"] 17 | data-paths: ["data"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | target-path: "target" # directory which will store compiled SQL files 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_modules" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ directory 31 | # as tables. These settings can be overridden in the individual model files 32 | # using the `{{ config(...) }}` macro. 
33 | models: 34 | dop_test: 35 | +materialized: table 36 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/macros/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/aggregation_a/covid19_cases_by_country.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/aggregation_b/covid19_cases_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, subregion1_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name, subregion1_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: bigquery-public-data 6 | schema: covid19_open_data 7 | loader: BigQuery # informational only (free text) 8 | loaded_at_field: CAST(date as timestamp) # configure for all sources 9 | 10 | # meta fields are rendered in auto-generated documentation 11 | meta: 12 | contains_pii: false 13 | owner: "@google" 14 | 15 | # Add tags to this source 16 | tags: 17 | - covid19 18 | 19 | quoting: 20 | database: false 21 | schema: false 22 | identifier: false 23 | 24 | tables: 25 | - name: covid19_open_data 26 | freshness: # make this a little more strict 27 | warn_after: {count: 1, period: day} 28 | error_after: {count: 2, period: day} 29 | 30 | models: 31 | - name: stg_covid19_cases 32 | description: covid19 global open data 33 | - name: covid19_cases_by_country 34 | description: Global cases by country 35 | columns: 36 | - name: country_name 37 | description: Country name 38 | tests: 39 | - not_null 40 | - name: covid19_cases_by_country_and_region 41 | description: Global cases by regions in countries 42 | columns: 43 | - name: 
country_name 44 | description: Country name 45 | tests: 46 | - not_null 47 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/staging/stg_covid19_cases.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | select * REPLACE(CAST(new_recovered AS INT64) as new_recovered, CAST(new_tested AS INT64) as new_tested) 12 | from `{{ source('jaffle_shop', 'covid19_open_data') }}` 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" -- Same as DML merge for a range but the first run is always historical load and second run is incremental / don't need historical DAG anymore 15 | {% endif %} 16 | 17 | ) 18 | select * from covid_cases 19 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/snapshots/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/tests/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | venv 6 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/analysis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/analysis/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/data/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: 'dop_test_two' 5 | version: '1.0.0' 6 | config-version: 2 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_start_two' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `source-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | source-paths: ["models"] 15 | analysis-paths: ["analysis"] 16 | test-paths: ["tests"] 17 | data-paths: ["data"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | target-path: "target" # directory which will store compiled SQL files 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_modules" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ directory 31 | # as tables. These settings can be overridden in the individual model files 32 | # using the `{{ config(...) }}` macro. 
33 | models: 34 | dop_test_two: 35 | +materialized: table 36 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/macros/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/aggregation_a/covid19_cases_by_country.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/aggregation_b/covid19_cases_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, subregion1_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name, subregion1_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: bigquery-public-data 6 | schema: covid19_open_data 7 | loader: BigQuery # informational only (free text) 8 | loaded_at_field: CAST(date as timestamp) # configure for all sources 9 | 10 | # meta fields are rendered in auto-generated documentation 11 | meta: 12 | contains_pii: false 13 | owner: "@google" 14 | 15 | # Add tags to this source 16 | tags: 17 | - covid19 18 | 19 | quoting: 20 | database: false 21 | schema: false 22 | identifier: false 23 | 24 | tables: 25 | - name: covid19_open_data 26 | freshness: # make this a little more strict 27 | warn_after: {count: 1, period: day} 28 | error_after: {count: 2, period: day} 29 | 30 | models: 31 | - name: stg_covid19_cases 32 | description: covid19 global open data 33 | - name: covid19_cases_by_country 34 | description: Global cases by country 35 | columns: 36 | - name: country_name 37 | description: Country name 38 | tests: 39 | - not_null 40 | - name: covid19_cases_by_country_and_region 41 | description: Global cases by regions in countries 42 | 
columns: 43 | - name: country_name 44 | description: Country name 45 | tests: 46 | - not_null 47 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/staging/stg_covid19_cases.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | select * REPLACE(CAST(new_recovered AS INT64) as new_recovered, CAST(new_tested AS INT64) as new_tested) 12 | from `{{ source('jaffle_shop', 'covid19_open_data') }}` 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" -- Same as DML merge for a range but the first run is always historical load and second run is incremental / don't need historical DAG anymore 15 | {% endif %} 16 | 17 | ) 18 | select * from covid_cases 19 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/snapshots/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/tests/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/executor_config/dbt/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | pipenv = "*" 10 | install = "*" 11 | dbt = "0.19.1" 12 | 13 | [requires] 14 | python_version = "3.7" 15 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/executor_config/dbt/config.yaml: -------------------------------------------------------------------------------- 1 | # This config file is required to initialise all DBT projects 2 | 3 | dbt_projects: 4 | - project_path: dbt_start # must match to the directory path of the DBT project under the service project. I.e. 
dbt_start or dbt/dbt_start 5 | - project_path: dbt_start_two 6 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/dummy_upstream_dependency/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 4 * * *" 3 | timezone: "Europe/London" 4 | schema: dop_sandbox_us 5 | 6 | tasks: 7 | - identifier: i_am_the_dummy_dependency 8 | kind: 9 | action: airflow_operator 10 | target: airflow.operators.dummy_operator.DummyOperator 11 | options: 12 | arguments: -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 4 * * *" 3 | timezone: "Europe/London" 4 | schema: sample_dataset 5 | params: 6 | value_a: 7 | - 1 8 | - 2 9 | - 3 10 | tasks: 11 | - identifier: dim_date 12 | kind: 13 | action: materialization 14 | target: table 15 | - identifier: salesforce_marketing_cloud_is_ready 16 | kind: 17 | action: assertion 18 | target: assertion 19 | - identifier: staging_salesforce_marketing_cloud 20 | kind: 21 | action: materialization 22 | target: table 23 | dependencies: 24 | - salesforce_marketing_cloud_is_ready 25 | - dim_date 26 | - identifier: fact_transaction 27 | kind: 28 | action: materialization 29 | target: table 30 | dependencies: 31 | - dim_customer 32 | - staging_salesforce_marketing_cloud 33 | - identifier: dim_product 34 | kind: 35 | action: materialization 36 | target: table 37 | dependencies: 38 | - staging_salesforce_marketing_cloud 39 | - dim_customer 40 | - identifier: dim_voucher 41 | kind: 42 | action: materialization 43 | target: table 44 | dependencies: 45 | - staging_salesforce_marketing_cloud 46 | - identifier: fact_customer_activity 47 | kind: 48 | action: materialization 49 | target: table 50 | dependencies: 51 | - staging_salesforce_marketing_cloud 52 | - dim_customer 53 | - identifier: salesforce_service_cloud_is_ready 54 | kind: 55 | action: assertion 56 | target: assertion 57 | - identifier: staging_salesforce_service_cloud 58 | kind: 59 | action: materialization 60 | target: table 61 | dependencies: 62 | - salesforce_service_cloud_is_ready 63 | - identifier: dim_customer 64 | kind: 65 | action: materialization 66 | target: table 67 | dependencies: 68 | - staging_salesforce_service_cloud 69 | - dim_date 70 | - identifier: dim_customer_assertion 71 | kind: 72 | action: assertion 73 | target: assertion 74 | dependencies: 75 | - dim_customer 76 | - identifier: fact_newly_registered_customer 77 | kind: 78 | action: materialization 79 | target: table 80 | dependencies: 81 | - staging_salesforce_service_cloud 82 | - identifier: dim_customer_subscription 83 | kind: 84 | action: materialization 85 | target: table 86 | dependencies: 87 | - dim_customer 88 | - identifier: dim_customer_subscription_assertion 89 | kind: 90 | action: assertion 91 | target: assertion 92 | dependencies: 93 | - dim_customer_subscription 94 | - identifier: zend_desk_is_ready 95 | kind: 96 | action: assertion 97 | target: assertion 98 | dependencies: 99 | - dim_date 100 | - identifier: staging_zend_desk 101 | kind: 102 | action: materialization 103 | target: table 104 | dependencies: 105 | - zend_desk_is_ready 106 | - identifier: zend_desk_ticket_assignments 107 | kind: 108 | action: materialization 109 | target: 
table 110 | dependencies: 111 | - staging_zend_desk 112 | - identifier: zend_desk_ticket_comments 113 | kind: 114 | action: materialization 115 | target: table 116 | dependencies: 117 | - staging_zend_desk 118 | - identifier: zend_desk_ticket_priority_changes 119 | kind: 120 | action: materialization 121 | target: table 122 | dependencies: 123 | - staging_zend_desk 124 | - identifier: zend_desk_ticket_summary 125 | kind: 126 | action: materialization 127 | target: table 128 | dependencies: 129 | - staging_zend_desk 130 | - identifier: zend_desk_ticket_user_issues 131 | kind: 132 | action: materialization 133 | target: table 134 | dependencies: 135 | - staging_zend_desk 136 | - identifier: list_of_users_require_attention 137 | kind: 138 | action: materialization 139 | target: table 140 | dependencies: 141 | - zend_desk_ticket_user_issues 142 | - zend_desk_ticket_summary 143 | - zend_desk_ticket_priority_changes 144 | - zend_desk_ticket_comments 145 | - zend_desk_ticket_assignments 146 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_assertion.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_assertion.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription_assertion.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription_assertion.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_date.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_date.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_product.sql: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_product.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_voucher.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_voucher.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_customer_activity.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_customer_activity.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_newly_registered_customer.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_newly_registered_customer.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_transaction.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_transaction.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/list_of_users_require_attention.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/list_of_users_require_attention.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_marketing_cloud_is_ready.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_marketing_cloud_is_ready.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_service_cloud_is_ready.sql: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_service_cloud_is_ready.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_marketing_cloud.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_marketing_cloud.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_service_cloud.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_service_cloud.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_zend_desk.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_zend_desk.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_is_ready.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_is_ready.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_assignments.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_assignments.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_comments.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_comments.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_priority_changes.sql: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_priority_changes.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_summary.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_summary.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_user_issues.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_user_issues.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 4 * * *" 3 | timezone: "Europe/London" 4 | schema: dop_sandbox_us 5 | params: 6 | value_a: 7 | - 1 8 | - 2 9 | - 3 10 | tasks: 11 | - identifier: assert_upstream_data_is_ready 12 | kind: 13 | action: assertion 14 | target: assertion 15 | 16 | - identifier: create_schema_dop_sandbox_us 17 | kind: 18 | action: materialization 19 | target: schema 20 | schema: dop_sandbox_us 21 | dependencies: 22 | - assert_upstream_data_is_ready 23 | 24 | - identifier: udf_empty_str_as_null 25 | kind: 26 | action: materialization 27 | target: udf 28 | options: 29 | arguments: 30 | - name: str 31 | type: STRING 32 | dependencies: 33 | - create_schema_dop_sandbox_us 34 | 35 | - identifier: udf_unpivot 36 | kind: 37 | action: materialization 38 | target: udf 39 | options: 40 | arguments: 41 | - name: x 42 | type: ANY TYPE 43 | - name: col_regex 44 | type: STRING 45 | dependencies: 46 | - create_schema_dop_sandbox_us 47 | 48 | - identifier: stg_covid19 49 | kind: 50 | action: materialization 51 | target: table 52 | partitioning: 53 | field: date 54 | data_type: date 55 | dependencies: 56 | - create_schema_dop_sandbox_us 57 | 58 | - identifier: covid19_by_country 59 | kind: 60 | action: materialization 61 | target: table 62 | partitioning: 63 | field: date 64 | data_type: date 65 | dependencies: 66 | - stg_covid19 67 | 68 | - identifier: covid19_by_country_and_region 69 | kind: 70 | action: materialization 71 | target: table 72 | partitioning: 73 | field: date 74 | data_type: date 75 | dependencies: 76 | - stg_covid19 77 | 78 | - identifier: view_covid19_by_country_and_region 79 | kind: 80 | action: materialization 81 | target: view 82 | dependencies: 83 | - covid19_by_country_and_region 84 | 85 | - identifier: data_quality_checks 86 | kind: 87 | action: assertion 88 | target: assertion 89 | dependencies: 90 | - covid19_by_country 91 | - covid19_by_country_and_region 92 | 93 | - identifier: sp_all_countries_and_regions 94 | kind: 95 | action: materialization 96 | target: stored_procedure 97 | options: 98 | arguments: 99 | - name: execution_date 100 | type: DATE 101 | dependencies: 102 | - 
stg_covid19 103 | 104 | - identifier: invoke_sp_all_countries_and_regions 105 | kind: 106 | action: invocation 107 | target: stored_procedure 108 | dependencies: 109 | - sp_all_countries_and_regions 110 | 111 | - identifier: create_schema_dbt_start 112 | kind: 113 | action: materialization 114 | target: schema 115 | schema: dbt_start 116 | dependencies: 117 | - assert_upstream_data_is_ready 118 | 119 | - identifier: dbt_start_staging 120 | kind: 121 | action: dbt 122 | target: run 123 | options: 124 | project: dbt_start 125 | version: 0.19.1 126 | arguments: 127 | - option: -m 128 | value: staging 129 | dependencies: 130 | - create_schema_dbt_start 131 | 132 | - identifier: dbt_start_aggregation_a 133 | kind: 134 | action: dbt 135 | target: run 136 | options: 137 | project: dbt_start 138 | version: 0.19.1 139 | arguments: 140 | - option: -m 141 | value: aggregation_a 142 | dependencies: 143 | - dbt_start_staging 144 | 145 | - identifier: dbt_start_aggregation_b 146 | kind: 147 | action: dbt 148 | target: run 149 | options: 150 | project: dbt_start 151 | version: 0.19.1 152 | arguments: 153 | - option: -m 154 | value: aggregation_b 155 | dependencies: 156 | - dbt_start_aggregation_a 157 | 158 | - identifier: dbt_start_test 159 | kind: 160 | action: dbt 161 | target: test 162 | options: 163 | project: dbt_start 164 | version: 0.19.1 165 | dependencies: 166 | - dbt_start_aggregation_b 167 | 168 | - identifier: dbt_start_docs 169 | kind: 170 | action: dbt 171 | target: docs generate 172 | options: 173 | project: dbt_start 174 | version: 0.19.1 175 | arguments: 176 | - option: --bucket 177 | value: datatonic-uk-dop-dev-diego 178 | - option: --bucket-path 179 | value: dbt 180 | dependencies: 181 | - dbt_start_test 182 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/assert_upstream_data_is_ready.sql: -------------------------------------------------------------------------------- 1 | {% set source_data = 'bigquery-public-data.covid19_open_data.covid19_open_data' %} 2 | 3 | with covid_cases as ( 4 | 5 | SELECT * FROM `{{ source_data }}` 6 | {% if is_incremental() %} 7 | where date >= DATE("{{ ds }}") 8 | {% endif %} 9 | 10 | ) 11 | SELECT COUNT(*) > 0 AS success, 12 | COUNT(*) AS num_of_records, 13 | 'Do we have data available in `{{ source_data }}`?' 
AS description 14 | FROM covid_cases 15 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/covid19_by_country.sql: -------------------------------------------------------------------------------- 1 | with covid_cases as ( 2 | 3 | SELECT date, country_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 4 | FROM `dop_sandbox_us.stg_covid19` 5 | {% if is_incremental() %} 6 | where date >= DATE("{{ ds }}") 7 | {% endif %} 8 | GROUP BY date, country_name 9 | 10 | ) 11 | select * from covid_cases 12 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/covid19_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | with covid_cases as ( 2 | 3 | SELECT date, country_name, subregion1_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 4 | FROM `dop_sandbox_us.stg_covid19` 5 | {% if is_incremental() %} 6 | where date >= DATE("{{ ds }}") 7 | {% endif %} 8 | GROUP BY date, country_name, subregion1_name 9 | 10 | ) 11 | select * from covid_cases 12 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/data_quality_checks.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) > 0 AS success, 2 | COUNT(*) AS num_of_records, 3 | 'Do we have data in covid19_by_country for date >= "{{ ds }}"?' 
AS description 4 | FROM `dop_sandbox_us.covid19_by_country` 5 | {% if is_incremental() %} 6 | WHERE date >= "{{ ds }}" 7 | {% endif %} 8 | 9 | UNION ALL 10 | 11 | SELECT COUNT(*) > 0 AS success, 12 | COUNT(*) AS num_of_records, 13 | 'Do we have data in covid19_by_country_and_region for date >= "{{ ds }}"' AS description 14 | FROM `dop_sandbox_us.covid19_by_country_and_region` 15 | {% if is_incremental() %} 16 | WHERE date >= "{{ ds }}" 17 | {% endif %} 18 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/invoke_sp_all_countries_and_regions.sql: -------------------------------------------------------------------------------- 1 | call {{ task['schema'] }}.sp_all_countries_and_regions(DATE('{{ ds }}')); 2 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/sp_all_countries_and_regions.sql: -------------------------------------------------------------------------------- 1 | SELECT distinct country_name, subregion1_name 2 | FROM `bigquery-public-data.covid19_open_data.covid19_open_data` 3 | WHERE date >= execution_date 4 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/stg_covid19.sql: -------------------------------------------------------------------------------- 1 | with covid_cases as ( 2 | 3 | SELECT * REPLACE(CAST(new_recovered AS INT64) as new_recovered, CAST(new_tested AS INT64) as new_tested) 4 | FROM `bigquery-public-data.covid19_open_data.covid19_open_data` 5 | {% if is_incremental() %} 6 | where date >= DATE("{{ ds }}") 7 | {% endif %} 8 | 9 | ) 10 | select * from covid_cases 11 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/udf_empty_str_as_null.sql: -------------------------------------------------------------------------------- 1 | IF(TRIM(str) = '', NULL, str) 2 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/udf_unpivot.sql: -------------------------------------------------------------------------------- 1 | ( 2 | SELECT 3 | ARRAY_AGG(STRUCT( 4 | REGEXP_EXTRACT(y, '[^"]*') AS key 5 | , REGEXP_EXTRACT(y, r':([^"]*)\"?[,}\]]') AS value 6 | )) 7 | FROM UNNEST(( 8 | SELECT REGEXP_EXTRACT_ALL(json,col_regex||r'[^:]+:\"?[^"]+\"?') arr 9 | FROM (SELECT TO_JSON_STRING(x) json))) y 10 | ) 11 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/view_covid19_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM `dop_sandbox_us.covid19_by_country_and_region` 2 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_dataflow_template/config.yaml: -------------------------------------------------------------------------------- 1 | # To run this example: 2 | # - Set enabled to true 3 | # - Add Dataflow Admin role to Composer service account 4 | # - Replace PROJECT_ID, REGION, TEMP_BUCKET and OUTPUT_BUCKET placeholders 5 | enabled: false 6 | schedule_interval: "0 * * * *" 7 | timezone: "Europe/London" 8 | 
schema: sample_dataset 9 | 10 | tasks: 11 | - identifier: dummy_start_operator 12 | kind: 13 | action: airflow_operator 14 | target: airflow.operators.dummy_operator.DummyOperator 15 | 16 | - identifier: dataflow_template 17 | kind: 18 | action: airflow_operator 19 | target: airflow.contrib.operators.dataflow_operator.DataflowTemplateOperator 20 | options: 21 | arguments: 22 | template: gs://dataflow-templates/latest/Word_Count 23 | job_name: word_count 24 | dataflow_default_options: 25 | project: PROJECT_ID 26 | region: REGION 27 | tempLocation: gs://TEMP_BUCKET/dataflow/staging/ 28 | parameters: 29 | inputFile: gs://dataflow-samples/shakespeare/kinglear.txt 30 | output: gs://OUTPUT_BUCKET/word_count/output 31 | dependencies: 32 | - dummy_start_operator 33 | 34 | - identifier: dummy_end_operator 35 | kind: 36 | action: airflow_operator 37 | target: airflow.operators.dummy_operator.DummyOperator 38 | dependencies: 39 | - dataflow_template 40 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_external_task_sensor/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 * * * *" 3 | timezone: "Europe/London" 4 | schema: sample_dataset 5 | 6 | tasks: 7 | - identifier: external_task_sensor 8 | kind: 9 | action: airflow_operator 10 | target: airflow.sensors.external_task_sensor.ExternalTaskSensor 11 | options: 12 | arguments: 13 | external_dag_id: dop__example_upstream_dependency 14 | external_task_id: upstream_dependency 15 | execution_delta: !!python/object/apply:datetime.timedelta [0, 300] 16 | 17 | - identifier: dummy_operator 18 | kind: 19 | action: airflow_operator 20 | target: airflow.operators.dummy_operator.DummyOperator 21 | dependencies: 22 | - external_task_sensor 23 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_upstream_dependency/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "55 * * * *" 3 | timezone: "Europe/London" 4 | schema: sample_dataset 5 | 6 | tasks: 7 | - identifier: upstream_dependency 8 | kind: 9 | action: airflow_operator 10 | target: airflow.operators.dummy_operator.DummyOperator 11 | -------------------------------------------------------------------------------- /infrastructure/cloudbuild/build-dbt.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | entrypoint: 'bash' 4 | args: ['-c', 'docker pull eu.gcr.io/${PROJECT_ID}/dop-dbt:latest || exit 0'] 5 | - name: 'gcr.io/cloud-builders/docker' 6 | entrypoint: 'bash' 7 | args: [ 8 | '-c', 9 | 'docker build . 
-f embedded_dop/source/infrastructure/executor/dbt/Dockerfile -t eu.gcr.io/${PROJECT_ID}/dop-dbt:${SHORT_SHA}-${_DATETIME} -t eu.gcr.io/${PROJECT_ID}/dop-dbt:latest --cache-from eu.gcr.io/${PROJECT_ID}/dop-dbt:latest' 10 | ] 11 | images: ['eu.gcr.io/${PROJECT_ID}/dop-dbt:${SHORT_SHA}-${_DATETIME}', 'eu.gcr.io/${PROJECT_ID}/dop-dbt:latest'] 12 | -------------------------------------------------------------------------------- /infrastructure/cloudbuild/build.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | id: 'Generate Artifact ID' 4 | entrypoint: '/bin/bash' 5 | args: [ 6 | "-c", 7 | 'echo -n ${REPO_NAME}_${BRANCH_NAME}_${SHORT_SHA}_${BUILD_ID} > .artifact_id' 8 | ] 9 | - name: 'gcr.io/cloud-builders/docker' 10 | id: 'Generate Commit Hash. This is used by other services, do not modify.' 11 | entrypoint: '/bin/bash' 12 | args: [ 13 | "-c", 14 | 'echo -n ${SHORT_SHA}-${_DATETIME} > .commit-hash' 15 | ] 16 | - name: 'gcr.io/cloud-builders/gsutil' 17 | id: 'Store Artifact, `dop_` is added to the REPO_NAME as a prefix to avoid naming conflict' 18 | entrypoint: '/bin/bash' 19 | args: [ 20 | "-c", 21 | 'gsutil -m rsync -r -d . gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/$(cat .artifact_id)/dags/dop_${REPO_NAME}' 22 | ] 23 | - name: 'gcr.io/cloud-builders/docker' 24 | id: 'Display Artifact ID - This can be used to deploy' 25 | entrypoint: '/bin/bash' 26 | args: [ 27 | "-c", 28 | 'cat .artifact_id' 29 | ] 30 | -------------------------------------------------------------------------------- /infrastructure/cloudbuild/deploy.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | id: 'Display Artifact ID' 4 | entrypoint: '/bin/bash' 5 | args: [ 6 | "-c", 7 | 'echo ${_DOP_ARTIFACT_ID}' 8 | ] 9 | - name: 'gcr.io/cloud-builders/gsutil' 10 | id: 'List files in the dags/ folder' 11 | entrypoint: '/bin/bash' 12 | args: [ 13 | "-c", 14 | 'gsutil ls gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/${_DOP_ARTIFACT_ID}/dags' 15 | ] 16 | - name: 'gcr.io/cloud-builders/gsutil' 17 | id: 'Deploy' 18 | entrypoint: '/bin/bash' 19 | args: [ 20 | "-c", 21 | 'gsutil -m rsync -r -d gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/${_DOP_ARTIFACT_ID}/dags/dop_${REPO_NAME} gs://${_DEPLOY_BUCKET_NAME}/dags/dop_${REPO_NAME}' 22 | ] 23 | - name: 'gcr.io/cloud-builders/gsutil' 24 | id: 'Log Deployment' 25 | entrypoint: '/bin/bash' 26 | args: [ 27 | "-c", 28 | 'touch ${_DOP_ARTIFACT_ID} && gsutil -m cp ${_DOP_ARTIFACT_ID} gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/deploys/$(date -u "+%Y-%m-%d_%H-%M-%S")/' 29 | ] 30 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/README.md: -------------------------------------------------------------------------------- 1 | # DBT docs 2 | 3 | DBT documentation is generated using `dbt docs generate`. This command generates in the `target` folder of the project HTML documentation that can be served with any web server. 4 | 5 | DBT also provides a command to serve documentation `dbt docs serve`. It starts a web server in the port 8080 (the port number can be changes using the `--port` param). 6 | 7 | More information about these commands can be found at 8 | 9 | https://docs.getdbt.com/reference/commands/cmd-docs 10 | 11 | 12 | ## Generating DBT docs 13 | 14 | DOP provides the option to generate DBT docs using a task in the `config.yaml` file. 
15 | This task generates the files and copies them to the bucket provided in the task arguments 16 | 17 | - identifier: dbt_start_docs 18 | kind: 19 | action: dbt 20 | target: docs generate 21 | options: 22 | project: PROJECT_NAME 23 | version: DBT_VERSION 24 | arguments: 25 | - option: --bucket 26 | value: DBT_DOCS_BUCKET 27 | - option: --bucket-path 28 | value: DBT_DOCS_PATH 29 | 30 | The `--bucket-path` option is optional. If not present, files will be copied to the root folder of the bucket. 31 | 32 | 33 | ## Serving a static website in Google Cloud 34 | 35 | Once the files have been generated, Google Cloud provides several options to serve them as a static website: 36 | 37 | https://cloud.google.com/architecture/web-serving-overview 38 | 39 | 40 | ### GCS static site 41 | 42 | The simplest option is to mark the bucket as public; files can then be accessed using the URL 43 | 44 | https://storage.googleapis.com/DBT_DOCS_BUCKET/DBT_DOCS_PATH/index.html 45 | 46 | This is the simplest configuration option, but the website will be public to anyone who knows the URL. 47 | 48 | To use an HTTPS custom domain, a load balancer is required: 49 | 50 | https://cloud.google.com/storage/docs/hosting-static-website 51 | 52 | GCS doesn't provide the option to have a private website requiring authentication. 53 | There is a [feature request](https://issuetracker.google.com/issues/114133245?pli=1) to implement this functionality, but it hasn't been implemented yet. 54 | 55 | To allow access only to authenticated users, an App Engine application is provided, described in the next section. 56 | 57 | 58 | ## AppEngine 59 | 60 | App Engine allows authentication, but it doesn't allow writing to the filesystem. 61 | 62 | The DBT task described above creates files in GCS, but these files cannot be copied to the local disk to be served as static resources. 63 | 64 | The chosen approach has been to create a Flask application that reads the docs files from GCS and serves them directly. 65 | This application is in the `app-engine` folder. 66 | 67 | To avoid reading the files on every request, a [cache](https://docs.python.org/3/library/functools.html#functools.lru_cache) 68 | mechanism has been implemented that is cleared periodically. 69 | 70 | The cache duration, bucket and path can be configured as environment variables. 71 | If not set in `app.yaml`, the following default values are used: 72 | 73 | - DBT_BUCKET_NAME: the default App Engine bucket (PROJECT_NAME.appspot.com) 74 | - DBT_BUCKET_PATH: empty. Files are read from the root folder of the bucket 75 | - CACHE_MAX_AGE_IN_SECONDS: 300 seconds. If the previous request was received more than 5 minutes ago, the cache is cleared 76 | 77 | 78 | This app can be deployed by running `gcloud app deploy` from the `app-engine` folder. 79 | More information about deploying to App Engine can be found at 80 | 81 | https://cloud.google.com/appengine/docs/standard/python3/testing-and-deploying-your-app 82 | 83 | 84 | By default, App Engine allows public access, but it can easily be configured to require authentication using Identity-Aware Proxy. 85 | 86 | To configure it, follow the instructions at 87 | 88 | https://cloud.google.com/iap/docs/app-engine-quickstart 89 | 90 | To give access to all users of the organisation, add `allAuthenticatedUsers` as a member with the role `IAP-secured Web App User`; to restrict access, apply the role only to the groups or users that should have access.
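As a sketch, pointing the app at a specific docs bucket can be done with environment variables in `app.yaml`; the bucket name and path below are hypothetical placeholders, and `env_variables` is standard App Engine configuration.
```
runtime: python39

env_variables:
  DBT_BUCKET_NAME: "my-dbt-docs-bucket"  # hypothetical bucket where the dbt docs were copied
  DBT_BUCKET_PATH: "dbt"                 # optional sub-path inside the bucket
```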
91 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/.gcloudignore: -------------------------------------------------------------------------------- 1 | # This file specifies files that are *not* uploaded to Google Cloud Platform 2 | # using gcloud. It follows the same syntax as .gitignore, with the addition of 3 | # "#!include" directives (which insert the entries of the given .gitignore-style 4 | # file at that point). 5 | # 6 | # For more information, run: 7 | # $ gcloud topic gcloudignore 8 | # 9 | .gcloudignore 10 | # If you would like to upload your .git directory, .gitignore file or files 11 | # from your .gitignore file, remove the corresponding line 12 | # below: 13 | .git 14 | .gitignore 15 | 16 | # Python pycache: 17 | __pycache__/ 18 | # Ignored by the build system 19 | /setup.cfg 20 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/app.yaml: -------------------------------------------------------------------------------- 1 | runtime: python39 2 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from functools import lru_cache 4 | 5 | from flask import Flask 6 | from google.cloud import storage 7 | 8 | app = Flask(__name__) 9 | 10 | # Bucket where dbt docs are stored 11 | DBT_BUCKET_NAME = os.getenv("DBT_BUCKET_NAME") 12 | if not DBT_BUCKET_NAME: 13 | project_id = os.getenv("GOOGLE_CLOUD_PROJECT") 14 | if not project_id: 15 | raise ValueError( 16 | "'GOOGLE_CLOUD_PROJECT' or 'BUCKET_NAME' env variable must be set" 17 | ) 18 | DBT_BUCKET_NAME = f"{project_id}.appspot.com" 19 | 20 | # Path in the bucket where dbt docs are stored 21 | DBT_BUCKET_PATH = os.getenv("DBT_BUCKET_PATH", "") 22 | 23 | # Bucket files will be cached during this period 24 | CACHE_MAX_AGE_IN_SECONDS = os.getenv("CACHE_MAX_AGE_IN_SECONDS", 300) 25 | 26 | storage_client = storage.Client() 27 | bucket = storage_client.bucket(DBT_BUCKET_NAME) 28 | # Used for cache expiration 29 | last_cache_reload = time.time() 30 | 31 | 32 | @app.route("/") 33 | def index(): 34 | """ 35 | Read index.html file from GCS bucket 36 | """ 37 | return read_gcs_blob("index.html") 38 | 39 | 40 | @app.route("/catalog.json") 41 | def catalog(): 42 | """ 43 | Read catalog.json file from GCS bucket 44 | """ 45 | return read_gcs_blob("catalog.json") 46 | 47 | 48 | @app.route("/manifest.json") 49 | def manifest(): 50 | """ 51 | Read manifest.json file from GCS bucket 52 | """ 53 | return read_gcs_blob("manifest.json") 54 | 55 | 56 | @lru_cache(maxsize=3) 57 | def read_gcs_blob(name): 58 | """ 59 | Read a blob from GCS 60 | 61 | :param name: blob to be read from GCS 62 | :return: blob content 63 | """ 64 | path = f"{DBT_BUCKET_PATH}/{name}" if DBT_BUCKET_PATH else name 65 | blob = bucket.blob(path) 66 | return blob.download_as_bytes().decode("utf-8") 67 | 68 | 69 | @app.before_request 70 | def before_request(): 71 | """ 72 | function to run before each request. 
Clears the cache if it has expired. 73 | """ 74 | global last_cache_reload 75 | if time.time() - last_cache_reload > CACHE_MAX_AGE_IN_SECONDS: 76 | print("Clearing cache") 77 | read_gcs_blob.cache_clear() 78 | last_cache_reload = time.time() 79 | 80 | 81 | if __name__ == "__main__": 82 | app.run(host="127.0.0.1", port=8080, debug=True) 83 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.0.1 2 | google-cloud-storage==1.38.0 3 | -------------------------------------------------------------------------------- /infrastructure/docker/README.md: -------------------------------------------------------------------------------- 1 | # Composer versions 2 | 3 | Not every deployment uses the same Composer/Airflow version. To be able to test and run 4 | those environments there are several `composer_{AIRFLOW_VERSION}` folders. 5 | 6 | Each of them contains the necessary elements to build its docker image and 7 | to be run without an entry point or a command. This way they are interchangeable 8 | when they get loaded by the `docker-compose-dop.yml`, used by the Data 9 | Engineers, or when loaded by the `docker-compose.yml` used by the core DOP 10 | developers. 11 | 12 | To use a certain version, it must be defined in one of the ways below: 13 | 14 | 1. Declared as a make variable inline in the CLI: 15 | ``` 16 | make build AIRFLOW_VERSION=1.10.15 17 | ``` 18 | 2. Exported as an environment variable: 19 | ``` 20 | export AIRFLOW_VERSION=1.10.15 21 | make build 22 | ``` 23 | 3. Defined in a `.env` file in the same folder as the Makefile to persist its 24 | value between terminal sessions and make `make` easier to call: 25 | ``` 26 | echo 'AIRFLOW_VERSION=1.10.15' >> .env 27 | make build 28 | ``` 29 | 30 | It could also be declared in the Makefile itself, but it's better to keep 31 | configuration separate from functionality. 32 | 33 | # Requirements files 34 | ## constrains-composer.txt 35 | 36 | This matches a specific Cloud Composer version, documented at https://cloud.google.com/composer/docs/concepts/versioning/composer-versions 37 | It is also used inside Docker to keep pip packages aligned with Composer as closely as possible. 38 | 39 | ## requirements.txt 40 | This contains extra pip dependencies required by the local Airflow environment in Docker.
It depends on `requirements.composer.txt` 41 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:1.10.10-2-python3.6 2 | LABEL maintainer="Datatonic" 3 | 4 | ARG AIRFLOW_HOME=/opt/airflow 5 | ENV AIRFLOW_HOME=${AIRFLOW_HOME} 6 | 7 | USER root 8 | # Install dos2unix used to resolve windows line ending issues 9 | # And gcc used in dbt packages compilation 10 | RUN apt-get update && apt-get install dos2unix gcc -y 11 | 12 | USER airflow 13 | 14 | # Install composer dependencies & additional required dependencies not included in Composer 15 | COPY composer_1.10.10/requirements.composer.txt /requirements.composer.txt 16 | COPY requirements.txt /pre-installed-requirements.txt 17 | RUN set -ex \ 18 | && pip install --user -r /pre-installed-requirements.txt 19 | 20 | COPY --chown=airflow:airflow script/entrypoint.sh ${AIRFLOW_HOME}/script/entrypoint.sh 21 | COPY --chown=airflow:airflow script/exec_entrypoint.sh ${AIRFLOW_HOME}/script/exec_entrypoint.sh 22 | 23 | # Resolve windows line ending issues 24 | RUN dos2unix -n ${AIRFLOW_HOME}/script/entrypoint.sh ${AIRFLOW_HOME}/script/entrypoint.sh 25 | RUN dos2unix -n ${AIRFLOW_HOME}/script/exec_entrypoint.sh ${AIRFLOW_HOME}/script/exec_entrypoint.sh 26 | 27 | # allow execution of entrypoint script 28 | RUN chmod +x ${AIRFLOW_HOME}/script/entrypoint.sh 29 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/requirements.composer.txt: -------------------------------------------------------------------------------- 1 | # Composer default versions 2 | # mirroring version: composer-1.12.1-airflow-1.10.10 3 | 4 | absl-py==0.9.0 5 | alembic==1.4.2 6 | amqp==2.6.0 7 | apache-airflow-backport-providers-google==2020.6.24 8 | apache-beam==2.23.0 9 | apispec==1.3.3 10 | argcomplete==1.11.1 11 | astunparse==1.6.3 12 | attrs==19.3.0 13 | avro-python3==1.9.1 14 | Babel==2.8.0 15 | bcrypt==3.1.6 16 | billiard==3.6.3.0 17 | cached-property==1.5.1 18 | cachetools==3.1.1 19 | cattrs==0.9.0 20 | celery==4.4.5 21 | certifi==2019.11.28 22 | cffi==1.14.0 23 | chardet==3.0.4 24 | click==6.7 25 | colorama==0.4.3 26 | colorlog==4.0.2 27 | configparser==3.5.3 28 | crcmod==1.7 29 | croniter==0.3.31 30 | cryptography==2.8 31 | defusedxml==0.6.0 32 | dill==0.3.1.1 33 | docutils==0.16 34 | fastavro==0.21.24 35 | fasteners==0.15 36 | Flask==1.1.1 37 | Flask-Admin==1.5.4 38 | Flask-AppBuilder==2.3.0 39 | Flask-Babel==1.0.0 40 | Flask-Bcrypt==0.7.1 41 | Flask-Caching==1.3.3 42 | Flask-JWT-Extended==3.24.1 43 | Flask-Login==0.4.1 44 | Flask-OpenID==1.2.5 45 | Flask-SQLAlchemy==2.4.1 46 | flask-swagger==0.2.13 47 | Flask-WTF==0.14.3 48 | flower==0.9.4 49 | funcsigs==1.0.2 50 | future==0.18.2 51 | gast==0.3.3 52 | google-ads==4.0.0 53 | google-api-core==1.16.0 54 | google-api-python-client==1.8.0 55 | google-apitools==0.5.31 56 | google-auth-httplib2==0.0.3 57 | google-auth-oauthlib==0.4.1 58 | google-auth==1.11.3 59 | google-cloud-automl==1.0.1 60 | google-cloud-automl==1.0.1 61 | google-cloud-bigquery-datatransfer==1.1.0 62 | google-cloud-bigquery==1.24.0 63 | google-cloud-bigtable==1.0.0 64 | google-cloud-container==0.4.0 65 | google-cloud-core==1.3.0 66 | google-cloud-datacatalog==0.7.0 67 | google-cloud-dataproc==0.5.0 68 | google-cloud-datastore==1.7.4 69 | google-cloud-dlp==0.13.0 70 | google-cloud-kms==1.4.0 71 | 
google-cloud-language==1.3.0 72 | google-cloud-logging==1.15.0 73 | google-cloud-monitoring==1.0.0 74 | google-cloud-pubsub==1.0.2 75 | google-cloud-redis==1.0.0 76 | google-cloud-redis==1.0.0 77 | google-cloud-secret-manager==0.2.0 78 | google-cloud-spanner==1.13.0 79 | google-cloud-speech==1.3.2 80 | google-cloud-storage==1.26.0 81 | google-cloud-tasks==1.5.0 82 | google-cloud-texttospeech==1.0.1 83 | google-cloud-translate==2.0.1 84 | google-cloud-videointelligence==1.13.0 85 | google-cloud-vision==1.0.0 86 | google-pasta==0.2.0 87 | google-resumable-media==0.5.0 88 | googleapis-common-protos==1.51.0 89 | graphviz==0.13.2 90 | grpc-google-iam-v1==0.12.3 91 | grpcio==1.29.0 92 | grpcio-gcp==0.2.2 93 | gunicorn==19.10.0 94 | h5py==2.10.0 95 | httplib2==0.17.0 96 | humanize==0.5.1 97 | idna==2.9 98 | importlib-metadata==1.5.0 99 | iso8601==0.1.12 100 | itsdangerous==1.1.0 101 | Jinja2==2.10.3 102 | json-merge-patch==0.2 103 | jsonschema==3.2.0 104 | Keras-Preprocessing==1.1.2 105 | kombu==4.6.10 106 | kubernetes==11.0.0 107 | lazy-object-proxy==1.4.3 108 | lockfile==0.12.2 109 | Mako==1.1.2 110 | Markdown==2.6.11 111 | MarkupSafe==1.1.1 112 | marshmallow==2.19.5 113 | marshmallow-enum==1.5.1 114 | marshmallow-sqlalchemy==0.18.0 115 | mock==2.0.0 116 | monotonic==1.5 117 | mysqlclient==1.3.14 118 | numpy==1.18.2 119 | oauthlib==3.1.0 120 | opt-einsum==3.2.1 121 | pandas==0.25.3 122 | pandas-gbq==0.13.1 123 | pendulum==1.4.4 124 | # pip==19.0.2 not applicable for docker 125 | pipdeptree==1.0.0 126 | prison==0.1.3 127 | protobuf==3.11.3 128 | psutil==5.7.0 129 | pyasn1==0.4.8 130 | psycopg2-binary==2.8.4 131 | pyarrow==0.15.1 132 | pyasn1-modules==0.2.8 133 | pycparser==2.20 134 | pydata-google-auth==0.3.0 135 | pydot==1.4.1 136 | Pygments==2.6.1 137 | PyJWT==1.7.1 138 | pymongo==3.9.0 139 | pyOpenSSL==19.1.0 140 | pyrsistent==0.15.7 141 | python-daemon==2.1.2 142 | python-dateutil==2.8.1 143 | python-editor==1.0.4 144 | python-http-client==3.2.7 145 | python3-openid==3.1.0 146 | pytz==2019.3 147 | pytzdata==2019.3 148 | PyYAML==5.3.1 149 | redis==3.5.3 150 | requests==2.23.0 151 | requests-oauthlib==1.3.0 152 | rsa==4.0 153 | scipy==1.4.1 154 | sendgrid==5.6.0 155 | setproctitle==1.1.10 156 | setuptools==47.3.1 157 | six==1.14.0 158 | SQLAlchemy==1.3.15 159 | SQLAlchemy-JSONField==0.9.0 160 | SQLAlchemy-Utils==0.36.3 161 | statsd==3.3.0 162 | tabulate==0.8.6 163 | tenacity==4.12.0 164 | tensorboard==2.2.2 165 | tensorboard-plugin-wit==1.6.0.post3 166 | tensorflow==2.2.0 167 | tensorflow-estimator==2.2.0 168 | termcolor==1.1.0 169 | text-unidecode==1.2 170 | thrift==0.13.0 171 | tornado==6.0.4 172 | typing==3.7.4.1 173 | typing-extensions==3.7.4.1 174 | tzlocal==1.5.1 175 | unicodecsv==0.14.1 176 | uritemplate==3.0.1 177 | urllib3==1.25.8 178 | vine==1.3.0 179 | virtualenv==16.2.0 180 | websocket-client==0.57.0 181 | Werkzeug==0.16.1 182 | wheel==0.34.2 183 | wrapt==1.12.1 184 | WTForms==2.2.1 185 | zipp==1.2.0 186 | zope.deprecation==4.4.0 187 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.composer.txt 2 | # Additional / modified versions 3 | dataclasses==0.7 4 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/script/entrypoint.sh: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env bash 2 | airflow initdb 3 | airflow scheduler & 4 | airflow webserver 5 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/script/exec_entrypoint.sh: -------------------------------------------------------------------------------- 1 | export POSTGRES_HOST="postgres" 2 | export POSTGRES_PORT="5432" 3 | export POSTGRES_USER="airflow" 4 | export POSTGRES_PASSWORD="airflow" 5 | export POSTGRES_DB="airflow" 6 | export POSTGRES_EXTRAS="" 7 | 8 | AIRFLOW__CORE__SQL_ALCHEMY_CONN="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 9 | AIRFLOW__CELERY__RESULT_BACKEND="db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 10 | 11 | export AIRFLOW__CORE__SQL_ALCHEMY_CONN 12 | export AIRFLOW__CELERY__RESULT_BACKEND 13 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.14/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:1.10.14-python3.6 2 | 3 | # Install composer dependencies & additional required dependencies not included in Composer 4 | COPY constrains-composer.txt requirements.txt ./ 5 | RUN set -ex && pip install --user -r requirements.txt 6 | 7 | ENTRYPOINT airflow initdb; airflow scheduler & airflow webserver 8 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.14/constrains-composer.txt: -------------------------------------------------------------------------------- 1 | # Composer default versions 2 | # mirroring version: composer-1.15.2-airflow-1.10.14 3 | 4 | absl-py==0.11.0 5 | alembic==1.4.3 6 | amqp==2.6.1 7 | apache-airflow-backport-providers-apache-beam==2021.2.5 8 | apache-airflow-backport-providers-cncf-kubernetes==2021.2.5 9 | apache-airflow-backport-providers-google==2021.2.5 10 | apache-airflow-upgrade-check==1.0.0 11 | apache-beam==2.27.0 12 | apispec==1.3.3 13 | appdirs==1.4.4 14 | argcomplete==1.12.2 15 | astunparse==1.6.3 16 | attrs==20.3.0 17 | avro-python3==1.9.2.1 18 | Babel==2.9.0 19 | bcrypt==3.2.0 20 | billiard==3.6.3.0 21 | cached-property==1.5.2 22 | cachetools==4.1.1 23 | cattrs==1.0.0 24 | celery==4.4.7 25 | certifi==2020.11.8 26 | cffi==1.14.4 27 | chardet==3.0.4 28 | click==6.7 29 | colorama==0.4.4 30 | colorlog==4.0.2 31 | configparser==3.5.3 32 | crcmod==1.7 33 | croniter==0.3.36 34 | cryptography==3.2.1 35 | dataclasses==0.8 36 | defusedxml==0.6.0 37 | dill==0.3.1.1 38 | distlib==0.3.1 39 | dnspython==1.16.0 40 | docopt==0.6.2 41 | docutils==0.15.2 42 | email-validator==1.1.2 43 | fastavro==1.2.0 44 | fasteners==0.15 45 | filelock==3.0.12 46 | Flask==1.1.2 47 | Flask-Admin==1.5.4 48 | Flask-AppBuilder==2.3.4 49 | Flask-Babel==1.0.0 50 | Flask-Bcrypt==0.7.1 51 | Flask-Caching==1.3.3 52 | Flask-JWT-Extended==3.25.0 53 | Flask-Login==0.4.1 54 | Flask-OpenID==1.2.5 55 | Flask-SQLAlchemy==2.4.4 56 | flask-swagger==0.2.14 57 | Flask-WTF==0.14.3 58 | flower==0.9.5 59 | funcsigs==1.0.2 60 | future==0.18.2 61 | gast==0.3.3 62 | google-ads==4.0.0 63 | google-api-core==1.26.0 64 | google-api-python-client==1.12.8 65 | google-apitools==0.5.31 66 | google-auth==1.24.0 67 | google-auth-httplib2==0.0.4 68 | google-auth-oauthlib==0.4.2 69 | google-cloud-automl==2.1.0 70 | google-cloud-bigquery==2.4.0 71 | google-cloud-bigquery-datatransfer==3.0.0 72 
| google-cloud-bigquery-storage==2.1.0 73 | google-cloud-bigtable==1.6.0 74 | google-cloud-build==2.0.0 75 | google-cloud-container==1.0.1 76 | google-cloud-core==1.4.3 77 | google-cloud-datacatalog==3.0.0 78 | google-cloud-dataproc==2.2.0 79 | google-cloud-datastore==1.15.3 80 | google-cloud-dlp==1.0.0 81 | google-cloud-kms==2.2.0 82 | google-cloud-language==1.3.0 83 | google-cloud-logging==2.2.0 84 | google-cloud-memcache==0.2.0 85 | google-cloud-monitoring==2.0.0 86 | google-cloud-os-login==2.1.0 87 | google-cloud-pubsub==2.1.0 88 | google-cloud-pubsublite==0.1.0 89 | google-cloud-redis==2.0.0 90 | google-cloud-secret-manager==1.0.0 91 | google-cloud-spanner==1.19.1 92 | google-cloud-speech==1.3.2 93 | google-cloud-storage==1.33.0 94 | google-cloud-tasks==2.1.0 95 | google-cloud-texttospeech==1.0.1 96 | google-cloud-translate==1.7.0 97 | google-cloud-videointelligence==1.16.1 98 | google-cloud-vision==1.0.0 99 | google-cloud-workflows==0.2.0 100 | google-crc32c==1.0.0 101 | google-pasta==0.2.0 102 | google-resumable-media==1.1.0 103 | googleapis-common-protos==1.52.0 104 | graphviz==0.15 105 | grpc-google-iam-v1==0.12.3 106 | grpcio==1.33.2 107 | grpcio-gcp==0.2.2 108 | gunicorn==20.0.4 109 | h5py==2.10.0 110 | hdfs==2.5.8 111 | httplib2==0.17.4 112 | humanize==3.1.0 113 | idna==2.8 114 | importlib-metadata==2.1.0 115 | importlib-resources==1.5.0 116 | iso8601==0.1.13 117 | itsdangerous==1.1.0 118 | Jinja2==2.11.2 119 | json-merge-patch==0.2 120 | jsonschema==3.2.0 121 | Keras-Preprocessing==1.1.2 122 | kombu==4.6.11 123 | kubernetes==11.0.0 124 | lazy-object-proxy==1.4.3 125 | libcst==0.3.14 126 | lockfile==0.12.2 127 | Mako==1.1.3 128 | Markdown==2.6.11 129 | MarkupSafe==1.1.1 130 | marshmallow==2.21.0 131 | marshmallow-enum==1.5.1 132 | marshmallow-sqlalchemy==0.23.1 133 | mock==2.0.0 134 | monotonic==1.5 135 | mypy-extensions==0.4.3 136 | mysqlclient==1.3.14 137 | natsort==7.1.0 138 | numpy==1.19.4 139 | oauth2client==4.1.3 140 | oauthlib==3.1.0 141 | opt-einsum==3.3.0 142 | overrides==3.1.0 143 | packaging==20.7 144 | pandas==1.1.4 145 | pandas-gbq==0.14.1 146 | pbr==5.5.1 147 | pendulum==1.4.4 148 | pep562==1.0 149 | pip==20.1.1 150 | pipdeptree==1.0.0 151 | prison==0.1.3 152 | prometheus-client==0.8.0 153 | proto-plus==1.11.0 154 | protobuf==3.14.0 155 | psutil==5.7.3 156 | psycopg2-binary==2.8.6 157 | pyarrow==2.0.0 158 | pyasn1==0.4.8 159 | pyasn1-modules==0.2.8 160 | pycparser==2.20 161 | pydata-google-auth==1.1.0 162 | pydot==1.4.1 163 | Pygments==2.7.2 164 | PyJWT==1.7.1 165 | pymongo==3.10.1 166 | pyOpenSSL==20.0.0 167 | pyparsing==2.4.7 168 | pyrsistent==0.17.3 169 | python-daemon==2.2.4 170 | python-dateutil==2.8.1 171 | python-editor==1.0.4 172 | python-http-client==3.3.1 173 | python-nvd3==0.15.0 174 | python-slugify==4.0.1 175 | python3-openid==3.2.0 176 | pytz==2020.4 177 | pytzdata==2020.1 178 | PyYAML==5.3.1 179 | redis==3.5.3 180 | requests==2.25.0 181 | requests-oauthlib==1.3.0 182 | rsa==4.6 183 | scipy==1.4.1 184 | sendgrid==5.6.0 185 | setproctitle==1.2 186 | setuptools==51.0.0 187 | six==1.15.0 188 | SQLAlchemy==1.3.20 189 | SQLAlchemy-JSONField==0.9.0 190 | SQLAlchemy-Utils==0.36.8 191 | statsd==3.3.0 192 | tabulate==0.8.7 193 | tenacity==4.12.0 194 | tensorboard==2.2.2 195 | tensorboard-plugin-wit==1.7.0 196 | tensorflow==2.2.0 197 | tensorflow-estimator==2.2.0 198 | termcolor==1.1.0 199 | text-unidecode==1.3 200 | thrift==0.13.0 201 | tornado==5.1.1 202 | typing==3.7.4.3 203 | typing-extensions==3.7.4.3 204 | typing-inspect==0.6.0 205 | tzlocal==1.5.1 206 | 
unicodecsv==0.14.1 207 | uritemplate==3.0.1 208 | urllib3==1.25.11 209 | vine==1.3.0 210 | virtualenv==20.2.1 211 | websocket-client==0.54.0 212 | Werkzeug==0.16.1 213 | wheel==0.36.1 214 | wrapt==1.12.1 215 | WTForms==2.3.3 216 | zipp==3.4.0 217 | zope.deprecation==4.4.0 218 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.14/requirements.txt: -------------------------------------------------------------------------------- 1 | -r constrains-composer.txt 2 | # Additional / modified versions 3 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.15/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:1.10.15-python3.6 2 | 3 | # Install composer dependencies & additional required dependencies not included in Composer 4 | COPY constrains-composer.txt requirements.txt ./ 5 | RUN set -ex && pip install --user -r requirements.txt 6 | 7 | ENTRYPOINT airflow db init; airflow scheduler & airflow webserver 8 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.15/constrains-composer.txt: -------------------------------------------------------------------------------- 1 | # Composer default versions 2 | # mirroring version: composer-1.16.1-airflow-1.10.15 3 | 4 | absl-py==0.12.0 5 | alembic==1.5.7 6 | amqp==2.6.1 7 | apache-airflow-backport-providers-apache-beam==2021.3.13 8 | apache-airflow-backport-providers-cncf-kubernetes==2021.3.3 9 | apache-airflow-backport-providers-google==2021.3.3 10 | apache-beam==2.27.0 11 | apispec==1.3.3 12 | appdirs==1.4.4 13 | argcomplete==1.12.2 14 | astunparse==1.6.3 15 | attrs==20.3.0 16 | avro-python3==1.9.2.1 17 | Babel==2.9.0 18 | bcrypt==3.2.0 19 | billiard==3.6.3.0 20 | cached-property==1.5.2 21 | cachetools==4.2.1 22 | cattrs==1.0.0 23 | celery==4.4.7 24 | certifi==2020.12.5 25 | cffi==1.14.5 26 | chardet==4.0.0 27 | click==6.7 28 | colorama==0.4.4 29 | colorlog==4.0.2 30 | configparser==3.5.3 31 | crcmod==1.7 32 | croniter==0.3.37 33 | cryptography==3.4.6 34 | dataclasses==0.8 35 | defusedxml==0.7.1 36 | dill==0.3.1.1 37 | distlib==0.3.1 38 | dnspython==2.1.0 39 | docopt==0.6.2 40 | docutils==0.16 41 | email-validator==1.1.2 42 | fastavro==1.3.4 43 | fasteners==0.16 44 | filelock==3.0.12 45 | Flask==1.1.2 46 | Flask-Admin==1.5.4 47 | Flask-AppBuilder==2.3.4 48 | Flask-Babel==1.0.0 49 | Flask-Bcrypt==0.7.1 50 | Flask-Caching==1.3.3 51 | Flask-JWT-Extended==3.25.1 52 | Flask-Login==0.4.1 53 | Flask-OpenID==1.2.5 54 | Flask-SQLAlchemy==2.5.1 55 | flask-swagger==0.2.14 56 | Flask-WTF==0.14.3 57 | flower==0.9.7 58 | funcsigs==1.0.2 59 | future==0.18.2 60 | gast==0.3.3 61 | google-ads==4.0.0 62 | google-api-core==1.26.1 63 | google-api-python-client==1.12.8 64 | google-apitools==0.5.31 65 | google-auth==1.28.0 66 | google-auth-httplib2==0.1.0 67 | google-auth-oauthlib==0.4.3 68 | google-cloud-automl==2.2.0 69 | google-cloud-bigquery==2.13.0 70 | google-cloud-bigquery-datatransfer==3.1.0 71 | google-cloud-bigquery-storage==2.1.0 72 | google-cloud-bigtable==1.7.0 73 | google-cloud-build==2.0.0 74 | google-cloud-container==1.0.1 75 | google-cloud-core==1.6.0 76 | google-cloud-datacatalog==3.1.0 77 | google-cloud-dataproc==2.3.0 78 | google-cloud-datastore==1.15.3 79 | google-cloud-dlp==1.0.0 80 | google-cloud-kms==2.2.0 81 | google-cloud-language==1.3.0 82 | google-cloud-logging==2.2.0 83 | 
google-cloud-memcache==0.3.0 84 | google-cloud-monitoring==2.0.0 85 | google-cloud-os-login==2.1.0 86 | google-cloud-pubsub==2.3.0 87 | google-cloud-pubsublite==0.3.0 88 | google-cloud-redis==2.1.0 89 | google-cloud-secret-manager==1.0.0 90 | google-cloud-spanner==1.19.1 91 | google-cloud-speech==1.3.2 92 | google-cloud-storage==1.36.2 93 | google-cloud-tasks==2.2.0 94 | google-cloud-texttospeech==1.0.1 95 | google-cloud-translate==1.7.0 96 | google-cloud-videointelligence==1.16.1 97 | google-cloud-vision==1.0.0 98 | google-cloud-workflows==0.2.0 99 | google-crc32c==1.1.2 100 | google-pasta==0.2.0 101 | google-resumable-media==1.2.0 102 | googleapis-common-protos==1.53.0 103 | graphviz==0.16 104 | greenlet==1.0.0 105 | grpc-google-iam-v1==0.12.3 106 | grpcio==1.36.1 107 | grpcio-gcp==0.2.2 108 | gunicorn==20.0.4 109 | h5py==2.10.0 110 | hdfs==2.6.0 111 | httplib2==0.17.4 112 | humanize==3.3.0 113 | idna==2.8 114 | importlib-metadata==2.1.1 115 | importlib-resources==1.5.0 116 | iso8601==0.1.14 117 | itsdangerous==1.1.0 118 | Jinja2==2.11.3 119 | json-merge-patch==0.2 120 | jsonschema==3.2.0 121 | Keras-Preprocessing==1.1.2 122 | kombu==4.6.11 123 | kubernetes==11.0.0 124 | lazy-object-proxy==1.4.3 125 | libcst==0.3.17 126 | lockfile==0.12.2 127 | Mako==1.1.4 128 | Markdown==2.6.11 129 | MarkupSafe==1.1.1 130 | marshmallow==2.21.0 131 | marshmallow-enum==1.5.1 132 | marshmallow-sqlalchemy==0.23.1 133 | mock==2.0.0 134 | monotonic==1.5 135 | mypy-extensions==0.4.3 136 | mysqlclient==1.3.14 137 | natsort==7.1.1 138 | numpy==1.19.5 139 | oauth2client==4.1.3 140 | oauthlib==3.1.0 141 | opt-einsum==3.3.0 142 | overrides==3.1.0 143 | packaging==20.9 144 | pandas==1.1.5 145 | pandas-gbq==0.14.1 146 | pbr==5.5.1 147 | pendulum==1.4.4 148 | pep562==1.0 149 | pip==20.1.1 150 | pipdeptree==1.0.0 151 | prison==0.1.3 152 | prometheus-client==0.8.0 153 | proto-plus==1.18.1 154 | protobuf==3.15.6 155 | psutil==5.8.0 156 | psycopg2-binary==2.8.6 157 | pyarrow==2.0.0 158 | pyasn1==0.4.8 159 | pyasn1-modules==0.2.8 160 | pycparser==2.20 161 | pydata-google-auth==1.1.0 162 | pydot==1.4.2 163 | Pygments==2.8.1 164 | PyJWT==1.7.1 165 | pymongo==3.11.3 166 | pyOpenSSL==20.0.1 167 | pyparsing==2.4.7 168 | pyrsistent==0.17.3 169 | python-daemon==2.3.0 170 | python-dateutil==2.8.1 171 | python-editor==1.0.4 172 | python-http-client==3.3.2 173 | python-nvd3==0.15.0 174 | python-slugify==4.0.1 175 | python3-openid==3.2.0 176 | pytz==2021.1 177 | pytzdata==2020.1 178 | PyYAML==5.4.1 179 | redis==3.5.3 180 | requests==2.25.1 181 | requests-oauthlib==1.3.0 182 | rsa==4.7.2 183 | scipy==1.4.1 184 | sendgrid==5.6.0 185 | setproctitle==1.2.2 186 | setuptools==54.2.0 187 | six==1.15.0 188 | SQLAlchemy==1.3.20 189 | SQLAlchemy-JSONField==0.9.0 190 | SQLAlchemy-Utils==0.36.8 191 | starkbank-ecdsa==1.1.0 192 | statsd==3.3.0 193 | tabulate==0.8.9 194 | tenacity==4.12.0 195 | tensorboard==2.2.2 196 | tensorboard-plugin-wit==1.8.0 197 | tensorflow==2.2.0 198 | tensorflow-estimator==2.2.0 199 | termcolor==1.1.0 200 | text-unidecode==1.3 201 | thrift==0.13.0 202 | tornado==5.1.1 203 | typing==3.7.4.3 204 | typing-extensions==3.7.4.3 205 | typing-inspect==0.6.0 206 | tzlocal==1.5.1 207 | unicodecsv==0.14.1 208 | uritemplate==3.0.1 209 | urllib3==1.26.4 210 | vine==1.3.0 211 | virtualenv==20.4.3 212 | websocket-client==0.58.0 213 | Werkzeug==0.16.1 214 | wheel==0.36.2 215 | wrapt==1.12.1 216 | WTForms==2.3.3 217 | zipp==3.4.1 218 | zope.deprecation==4.4.0 219 | 
-------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.15/requirements.txt: -------------------------------------------------------------------------------- 1 | -r constrains-composer.txt 2 | # Additional / modified versions 3 | -------------------------------------------------------------------------------- /infrastructure/docker/developer_only/.airflowignore: -------------------------------------------------------------------------------- 1 | dop_service_project 2 | -------------------------------------------------------------------------------- /infrastructure/docker/docker-compose-dop.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | postgres: 4 | image: postgres:13.2-alpine 5 | container_name: dop_postgres 6 | restart: always 7 | environment: 8 | - POSTGRES_USER=airflow 9 | - POSTGRES_PASSWORD=airflow 10 | - POSTGRES_DB=airflow 11 | logging: 12 | options: 13 | max-size: 10m 14 | max-file: "3" 15 | 16 | webserver: 17 | build: composer_${AIRFLOW_VERSION} 18 | container_name: dop_webserver 19 | restart: always 20 | entrypoint: ./script/entrypoint.sh 21 | depends_on: 22 | - postgres 23 | environment: 24 | - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow 25 | - AIRFLOW__CORE__EXECUTOR=LocalExecutor 26 | - AIRFLOW__CORE__LOGGING_LEVEL=INFO 27 | - GOOGLE_APPLICATION_CREDENTIALS=/secret/gcp-credentials/application_default_credentials.json 28 | - DOP_SANDBOX_ENVIRONMENT=true # set to true if running locally on a laptop; this enables certain features such as service account impersonation 29 | 30 | # The following environment variables need to be set on both the local Docker environment and the Composer environment 31 | - DOP_SERVICE_PROJECT_PATH=/opt/airflow/dags/dop_service_project # The absolute path of the service project. Each DBT project in this path must be in its own folder and must be valid, e.g. on Docker this could be /opt/airflow/dags/dop/dbt-projects.
On Composer, it could be anywhere under the `/home/airflow/gcs/dags` or `/home/airflow/gcs/data` directory 32 | - DOP_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP project_id - to be used as the project where data will be consumed & persisted 33 | - DOP_LOCATION=${LOCATION?location_is_undefined} # GCP region - to be used to persist all data 34 | - DOP_INFRA_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP infrastructure project id; not used for local development, so it is left the same as the GCP service project id 35 | logging: 36 | options: 37 | max-size: 10m 38 | max-file: "3" 39 | volumes: 40 | - ${SERVICE_PROJECT_ABS_PATH}:/opt/airflow/dags/dop_service_project 41 | - ~/.config/gcloud/application_default_credentials.json:/secret/gcp-credentials/application_default_credentials.json # mount application default credentials only, so no keys are used 42 | # - ${SERVICE_PROJECT_ABS_PATH}/plugins:/usr/local/airflow/plugins 43 | ports: 44 | - "8082:8080" 45 | command: webserver 46 | healthcheck: 47 | test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"] 48 | interval: 30s 49 | timeout: 30s 50 | retries: 3 51 | -------------------------------------------------------------------------------- /infrastructure/executor/dbt/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk:slim 2 | 3 | ENV LANG C.UTF-8 4 | ENV LC_ALL C.UTF-8 5 | 6 | ARG DBT_HOME=/home/dbtuser 7 | ARG BUILD_DIR=/tmp/dbt_build_tmp 8 | 9 | RUN apt-get update && apt-get install -y git 10 | 11 | RUN set -ex \ 12 | && pip3 install PyYAML \ 13 | && pip3 install pipenv 14 | 15 | RUN groupadd -g 999 dbtuser && useradd -r -u 999 -g dbtuser dbtuser 16 | WORKDIR ${DBT_HOME} 17 | 18 | RUN chown -R dbtuser:dbtuser ${DBT_HOME} 19 | 20 | USER dbtuser 21 | RUN mkdir ${DBT_HOME}/.dbt 22 | 23 | RUN mkdir ${BUILD_DIR} 24 | 25 | # Install the pinned pip dependencies from the Pipfile / Pipfile.lock 26 | COPY --chown=dbtuser:dbtuser ./embedded_dop/executor_config/dbt/Pipfile ./embedded_dop/executor_config/dbt/Pipfile.lock ./ 27 | RUN pipenv sync 28 | 29 | # Store the whole service project repository in the temporary build dir, build what's required and then delete everything else 30 | COPY --chown=dbtuser:dbtuser ./ ${BUILD_DIR} 31 | RUN ls -al ${BUILD_DIR} 32 | 33 | # initialise dbt 34 | RUN DBT_HOME=${DBT_HOME} BUILD_DIR=${BUILD_DIR} python3 ${BUILD_DIR}/embedded_dop/source/infrastructure/executor/dbt/init.py 35 | 36 | # remove the build dir 37 | RUN rm -rf ${BUILD_DIR} 38 | -------------------------------------------------------------------------------- /infrastructure/executor/dbt/README.md: -------------------------------------------------------------------------------- 1 | # Building the DBT executor docker image 2 | This builds the production-ready Docker image to run with the Kubernetes Pod Operator. 3 | 4 | ## How to start the build process 5 | See the Makefile in the `examples/service_project` folder for more details.
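For local experimentation, a hedged sketch of a manual build is shown below. It only assumes what the Dockerfile above implies - the build context must be the service project root, with the DOP source embedded under `embedded_dop/source` - and the image tag is a made-up example; the supported path remains the Makefile / Cloud Build setup mentioned above:

```
# Run from the service project root (the Docker build context), so that
# embedded_dop/executor_config and embedded_dop/source are available to COPY
docker build \
  -f embedded_dop/source/infrastructure/executor/dbt/Dockerfile \
  -t dbt-executor:local .
```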
6 | 7 | TODO: this container image is only used for production / cloud composer, but it would be better to bring this more inline with the local docker environment without compromising usability 8 | -------------------------------------------------------------------------------- /infrastructure/executor/dbt/init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import json 4 | import shutil 5 | import subprocess 6 | 7 | DOP_DBT_USER = "dop-dbt-user" 8 | 9 | try: 10 | from yaml import CLoader as Loader, CDumper as Dumper 11 | except ImportError: 12 | from yaml import Loader, Dumper # noqa: F401 13 | 14 | 15 | def copy_and_overwrite(from_path, to_path): 16 | if os.path.exists(to_path): 17 | shutil.rmtree(to_path) 18 | shutil.copytree(from_path, to_path) 19 | 20 | 21 | def yaml_to_dict(y): 22 | yml = yaml.load(y, Loader=Loader) 23 | return yml 24 | 25 | 26 | def dict_to_yaml(d): 27 | yml = yaml.load(json.dumps(d), Loader=Loader) 28 | return yaml.dump(yml) 29 | 30 | 31 | def build_profile_file_content(profile_ids): 32 | # the profile is generated dynamically at runtime, therefore multiple target profiles are not required 33 | target = "all" 34 | target_type = "bigquery" 35 | 36 | bq_profile = {} 37 | 38 | for profile_id in profile_ids: 39 | bq_profile[profile_id] = { 40 | "target": target, 41 | "outputs": { 42 | target: { 43 | "type": target_type, 44 | "method": "oauth", 45 | "project": '{{ env_var("DOP_PROJECT_ID") }}', 46 | "schema": '{{ env_var("DOP_DBT_SCHEMA", "' 47 | + str(profile_id).replace("-", "_") 48 | + '") }}', 49 | "threads": 1, 50 | "timeout_seconds": 300, 51 | "location": '{{ env_var("DOP_LOCATION") }}', 52 | "priority": "interactive", 53 | "impersonate_service_account": f"{DOP_DBT_USER}" 54 | + '@{{ env_var("DOP_PROJECT_ID") }}.iam.gserviceaccount.com', 55 | } 56 | }, 57 | } 58 | 59 | profile = dict_to_yaml(bq_profile) 60 | 61 | return profile 62 | 63 | 64 | def save_profile_yml(dbt_home, file_content): 65 | with open(os.path.sep.join([dbt_home, ".dbt", "profiles.yml"]), "w+") as fp: 66 | fp.write(file_content) 67 | 68 | 69 | dbt_home = os.environ["DBT_HOME"] 70 | build_dir = os.environ["BUILD_DIR"] 71 | 72 | print(f"DBT_HOME: {dbt_home}") 73 | print(f"BUILD_DIR: {build_dir}") 74 | 75 | dop_config_path = os.path.sep.join( 76 | [build_dir, "embedded_dop", "executor_config", "dbt", "config.yaml"] 77 | ) 78 | 79 | with open(dop_config_path) as fp_config: 80 | dop_config = yaml_to_dict(fp_config.read()) 81 | # validation 82 | if not dop_config.get("dbt_projects"): 83 | raise RuntimeError("The `dbt_projects` section must be defined") 84 | 85 | dbt_configs = dop_config.get("dbt_projects") 86 | profile_ids = [] 87 | dbt_projects_path = [] 88 | for dbt_config in dbt_configs: 89 | # validation 90 | if not dbt_config.get("project_path"): 91 | raise RuntimeError("`project_path` must be defined for DBT") 92 | 93 | project_path = dbt_config.get("project_path") 94 | 95 | project_yml_path = os.path.sep.join( 96 | [build_dir, project_path, "dbt_project.yml"] 97 | ) 98 | with open(project_yml_path) as fp_yml: 99 | profile_ids.append(yaml_to_dict(fp_yml.read()).get("profile")) 100 | 101 | # copy dbt projects to the dbt home location 102 | to_path = os.path.sep.join([dbt_home, project_path]) 103 | copy_and_overwrite( 104 | from_path=os.path.sep.join([build_dir, project_path]), to_path=to_path 105 | ) 106 | 107 | dbt_projects_path.append(to_path) 108 | 109 | # create the profiles yml file for all dbt projects 110 | if 
profile_ids: 111 | file_content = build_profile_file_content(profile_ids=profile_ids) 112 | 113 | print(f"profiles.yml: \n{file_content}") 114 | 115 | save_profile_yml(dbt_home=dbt_home, file_content=file_content) 116 | 117 | for dbt_project_path in dbt_projects_path: 118 | for dbt_cmd in ["clean", "deps"]: 119 | proc = subprocess.Popen( 120 | ["pipenv", "run", "dbt", dbt_cmd], 121 | stdout=subprocess.PIPE, 122 | cwd=dbt_project_path, 123 | ) 124 | while True: 125 | line = proc.stdout.readline() 126 | if not line: 127 | break 128 | print(line.rstrip()) 129 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration_tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/tests/integration_tests/.gitkeep -------------------------------------------------------------------------------- /tests/unit_tests/component/transformation/common/adapter/test_schema.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dags.dop.component.transformation.common.adapter import ( 3 | schema as transformation_schema, 4 | ) 5 | from dags.dop.component.transformation.common.adapter.schema import InvalidDagConfig 6 | 7 | 8 | def test_dag_config_overall_validation(): 9 | invalid_payload = {} 10 | schema = transformation_schema.DagConfigSchema() 11 | errors = schema.load(invalid_payload).errors 12 | 13 | assert errors.get("timezone") 14 | assert errors.get("tasks") 15 | 16 | 17 | def test_dag_config_cron_validation(): 18 | invalid_payload = {"schedule_interval": "0 *"} 19 | schema = transformation_schema.DagConfigSchema() 20 | errors = schema.load(invalid_payload).errors 21 | 22 | assert errors.get("schedule_interval") 23 | 24 | 25 | def test_task_overall_config_validation(): 26 | invalid_payload = { 27 | "schedule_interval": "0 1 * * *", 28 | "timezone": "Europe/London", 29 | "tasks": [{}], 30 | } 31 | schema = transformation_schema.DagConfigSchema() 32 | errors = schema.load(invalid_payload).errors 33 | 34 | assert errors["tasks"][0].get("identifier") 35 | assert errors["tasks"][0].get("kind") 36 | 37 | 38 | def test_task_kind_config_validation(): 39 | invalid_payload = { 40 | "schedule_interval": "0 1 * * *", 41 | "timezone": "Europe/London", 42 | "tasks": [ 43 | { 44 | "identifier": "stg_covid19", 45 | "kind": { 46 | "action": "materialization_invalid", 47 | "target": "table_invalid", 48 | }, 49 | } 50 | ], 51 | } 52 | schema = transformation_schema.DagConfigSchema() 53 | errors = schema.load(invalid_payload).errors 54 | 55 | assert errors["tasks"][0].get("kind").get("action") == ["Not a valid choice."] 56 | assert errors["tasks"][0].get("kind").get("target") is None 57 | 58 | 59 | def test_partitioning_validation_with_invalid_field(): 60 | payload = generate_valid_schema() 61 | payload["tasks"] = [ 62 | {"partitioning": {"field": "this_is_wrong", "data_type": "date"}} 63 | ] 64 | 65 | with pytest.raises(InvalidDagConfig): 66 | transformation_schema.load_dag_schema(payload) 67 | 68 | 69 | def test_partitioning_validation_with_invalid_data_type(): 70 | payload = generate_valid_schema() 71 | 
payload["tasks"] = [ 72 | {"partitioning": {"field": "my_field", "data_type": "an_invalid_type"}} 73 | ] 74 | 75 | with pytest.raises(InvalidDagConfig): 76 | transformation_schema.load_dag_schema(payload) 77 | 78 | 79 | def test_partitioning_validation_for_timestamp_type(): 80 | valid_payload = generate_valid_schema() 81 | valid_payload["tasks"] = [ 82 | { 83 | "partitioning": {"field": "date", "data_type": "timestamp"}, 84 | "identifier": "stg_covid19", 85 | "schema": "dop_sandbox_us", 86 | "kind": {"action": "materialization", "target": "table"}, 87 | "dependencies": ["a", "b", "c"], 88 | } 89 | ] 90 | 91 | dag_config = transformation_schema.load_dag_schema(valid_payload) 92 | 93 | assert isinstance( 94 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 95 | ) 96 | assert dag_config.tasks[0].partitioning.data_type == "timestamp" 97 | 98 | 99 | def test_partitioning_validation_for_datetime_type(): 100 | valid_payload = generate_valid_schema() 101 | valid_payload["tasks"] = [ 102 | { 103 | "partitioning": {"field": "date", "data_type": "datetime"}, 104 | "identifier": "stg_covid19", 105 | "schema": "dop_sandbox_us", 106 | "kind": {"action": "materialization", "target": "table"}, 107 | "dependencies": ["a", "b", "c"], 108 | } 109 | ] 110 | 111 | dag_config = transformation_schema.load_dag_schema(valid_payload) 112 | 113 | assert isinstance( 114 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 115 | ) 116 | assert dag_config.tasks[0].partitioning.data_type == "datetime" 117 | 118 | 119 | def test_partitioning_validation_for_date_type(): 120 | valid_payload = generate_valid_schema() 121 | valid_payload["tasks"] = [ 122 | { 123 | "partitioning": {"field": "date", "data_type": "date"}, 124 | "identifier": "stg_covid19", 125 | "schema": "dop_sandbox_us", 126 | "kind": {"action": "materialization", "target": "table"}, 127 | "dependencies": ["a", "b", "c"], 128 | } 129 | ] 130 | 131 | dag_config = transformation_schema.load_dag_schema(valid_payload) 132 | 133 | assert isinstance( 134 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 135 | ) 136 | assert dag_config.tasks[0].partitioning.data_type == "date" 137 | 138 | 139 | def test_schema_deserialization(): 140 | payload = generate_valid_schema() 141 | 142 | try: 143 | transformation_schema.load_dag_schema(payload) 144 | except transformation_schema.InvalidDagConfig as e: 145 | pytest.fail(f"Should not raise exception InvalidDagConfig, error: {e}") 146 | 147 | dag_config = transformation_schema.load_dag_schema(payload) 148 | assert isinstance(dag_config, transformation_schema.DagConfig) 149 | assert isinstance(dag_config.tasks[0], transformation_schema.Task) 150 | assert isinstance(dag_config.tasks[0].kind, transformation_schema.Kind) 151 | assert isinstance( 152 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 153 | ) 154 | assert dag_config.tasks[0].dependencies == ["a", "b", "c"] 155 | 156 | 157 | def generate_valid_schema(): 158 | return { 159 | "schedule_interval": "0 1 * * *", 160 | "timezone": "Europe/London", 161 | "params": {"value_a": [1, 2, 3]}, 162 | "database": "sandbox", 163 | "schema": "dop_sandbox_us", 164 | "tasks": [ 165 | { 166 | "partitioning": {"field": "date", "data_type": "date"}, 167 | "identifier": "stg_covid19", 168 | "schema": "dop_sandbox_us", 169 | "kind": {"action": "materialization", "target": "table"}, 170 | "dependencies": ["a", "b", "c"], 171 | } 172 | ], 173 | } 174 | 
-------------------------------------------------------------------------------- /tests/unit_tests/requirements.txt: -------------------------------------------------------------------------------- 1 | # Test libraries 2 | pytest==6.2.4 3 | 4 | # Dependencies extracted from infrastructure/docker/requirements.txt 5 | croniter==0.3.31 6 | marshmallow==2.19.5 7 | --------------------------------------------------------------------------------
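As a closing note, a minimal sketch for running the unit tests locally might look like the commands below, assuming they are executed from the repository root and that these pinned test dependencies are installed into the active Python environment:

```
pip install -r tests/unit_tests/requirements.txt
python -m pytest tests/unit_tests
```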