├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE.md ├── Makefile ├── README.md ├── dags └── dop │ ├── __init__.py │ ├── airflow_module │ ├── dag_builder │ │ ├── dag_builder_util.py │ │ └── transformation_dag_builder.py │ └── operator │ │ ├── common.py │ │ ├── dbt_k8_operator.py │ │ ├── dbt_operator.py │ │ ├── dbt_operator_helper.py │ │ └── run_results_schema.json │ ├── component │ ├── configuration │ │ ├── __init__.py │ │ └── env.py │ ├── helper │ │ ├── __init__.py │ │ ├── dbt_init.py │ │ └── dbt_profile.py │ ├── transformation │ │ ├── common │ │ │ ├── adapter │ │ │ │ ├── model.py │ │ │ │ ├── relation.py │ │ │ │ └── schema.py │ │ │ ├── parser │ │ │ │ └── yaml_parser.py │ │ │ └── templating │ │ │ │ ├── jinja.py │ │ │ │ └── template │ │ │ │ └── global.sql │ │ └── runner │ │ │ └── bigquery │ │ │ ├── adapter │ │ │ ├── impl.py │ │ │ ├── model.py │ │ │ └── relation.py │ │ │ └── template │ │ │ └── macro │ │ │ ├── adapter.sql │ │ │ └── materialization │ │ │ ├── table_create_or_replace.sql │ │ │ └── table_upsert.sql │ └── util │ │ ├── auth.py │ │ └── secret_manager.py │ └── definitions.py ├── docker-compose.yml ├── docs ├── a_typical_dop_orchestration_flow.png ├── dop_docker_account_impersonation.png ├── dop_service_project_architecture.png ├── example_dag_with_dbt_running.png ├── grant_service_account_user.png ├── local_airflow_ui.png ├── set-variables-ide.png ├── trigger_dag.png └── trigger_full_refresh.png ├── examples └── service_project │ ├── .gcloudignore │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── dbt_start │ ├── .gitignore │ ├── README.md │ ├── analysis │ │ └── .gitkeep │ ├── data │ │ └── .gitkeep │ ├── dbt_project.yml │ ├── macros │ │ └── .gitkeep │ ├── models │ │ ├── aggregation_a │ │ │ └── covid19_cases_by_country.sql │ │ ├── aggregation_b │ │ │ └── covid19_cases_by_country_and_region.sql │ │ ├── schema.yml │ │ └── staging │ │ │ └── stg_covid19_cases.sql │ ├── snapshots │ │ └── .gitkeep │ └── tests │ │ └── .gitkeep │ ├── dbt_start_two │ ├── .gitignore │ ├── README.md │ ├── analysis │ │ └── .gitkeep │ ├── data │ │ └── .gitkeep │ ├── dbt_project.yml │ ├── macros │ │ └── .gitkeep │ ├── models │ │ ├── aggregation_a │ │ │ └── covid19_cases_by_country.sql │ │ ├── aggregation_b │ │ │ └── covid19_cases_by_country_and_region.sql │ │ ├── schema.yml │ │ └── staging │ │ │ └── stg_covid19_cases.sql │ ├── snapshots │ │ └── .gitkeep │ └── tests │ │ └── .gitkeep │ └── embedded_dop │ ├── executor_config │ └── dbt │ │ ├── Pipfile │ │ ├── Pipfile.lock │ │ └── config.yaml │ └── orchestration │ ├── dummy_upstream_dependency │ └── config.yaml │ ├── example_complex_design │ ├── config.yaml │ └── sql │ │ ├── dim_customer.sql │ │ ├── dim_customer_assertion.sql │ │ ├── dim_customer_subscription.sql │ │ ├── dim_customer_subscription_assertion.sql │ │ ├── dim_date.sql │ │ ├── dim_product.sql │ │ ├── dim_voucher.sql │ │ ├── fact_customer_activity.sql │ │ ├── fact_newly_registered_customer.sql │ │ ├── fact_transaction.sql │ │ ├── list_of_users_require_attention.sql │ │ ├── salesforce_marketing_cloud_is_ready.sql │ │ ├── salesforce_service_cloud_is_ready.sql │ │ ├── staging_salesforce_marketing_cloud.sql │ │ ├── staging_salesforce_service_cloud.sql │ │ ├── staging_zend_desk.sql │ │ ├── zend_desk_is_ready.sql │ │ ├── zend_desk_ticket_assignments.sql │ │ ├── zend_desk_ticket_comments.sql │ │ ├── zend_desk_ticket_priority_changes.sql │ │ ├── zend_desk_ticket_summary.sql │ │ └── zend_desk_ticket_user_issues.sql │ ├── example_covid19 │ ├── config.yaml │ └── sql │ │ ├── 
assert_upstream_data_is_ready.sql │ │ ├── covid19_by_country.sql │ │ ├── covid19_by_country_and_region.sql │ │ ├── data_quality_checks.sql │ │ ├── invoke_sp_all_countries_and_regions.sql │ │ ├── sp_all_countries_and_regions.sql │ │ ├── stg_covid19.sql │ │ ├── udf_empty_str_as_null.sql │ │ ├── udf_unpivot.sql │ │ └── view_covid19_by_country_and_region.sql │ ├── example_dataflow_template │ └── config.yaml │ ├── example_external_task_sensor │ └── config.yaml │ └── example_upstream_dependency │ └── config.yaml ├── infrastructure ├── cloudbuild │ ├── build-dbt.yaml │ ├── build.yaml │ └── deploy.yaml ├── dbt-docs │ ├── README.md │ └── app-engine │ │ ├── .gcloudignore │ │ ├── app.yaml │ │ ├── main.py │ │ └── requirements.txt ├── docker │ ├── README.md │ ├── composer_1.10.10 │ │ ├── Dockerfile │ │ ├── requirements.composer.txt │ │ ├── requirements.txt │ │ └── script │ │ │ ├── entrypoint.sh │ │ │ └── exec_entrypoint.sh │ ├── composer_1.10.14 │ │ ├── Dockerfile │ │ ├── constrains-composer.txt │ │ └── requirements.txt │ ├── composer_1.10.15 │ │ ├── Dockerfile │ │ ├── constrains-composer.txt │ │ └── requirements.txt │ ├── developer_only │ │ └── .airflowignore │ └── docker-compose-dop.yml └── executor │ └── dbt │ ├── Dockerfile │ ├── README.md │ └── init.py └── tests ├── __init__.py ├── integration_tests └── .gitkeep └── unit_tests ├── component └── transformation │ └── common │ └── adapter │ └── test_schema.py └── requirements.txt /.flake8: -------------------------------------------------------------------------------- 1 | # See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#line-length for more details 2 | [flake8] 3 | max-line-length = 88 4 | select = C,E,F,W,B,B950 5 | extend-ignore = E203, E501 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | venv 3 | .installed.cfg 4 | bin 5 | develop-eggs 6 | dist 7 | downloads 8 | eggs 9 | parts 10 | src/*.egg-info 11 | lib 12 | lib64 13 | *.pyc 14 | *.pyo 15 | .python-version 16 | 17 | # folders 18 | 19 | # IDE 20 | .idea 21 | 22 | .env 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 21.5b1 10 | hooks: 11 | - id: black 12 | language_version: python3 13 | - repo: https://gitlab.com/pycqa/flake8 14 | rev: 3.9.2 15 | hooks: 16 | - id: flake8 17 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # DOP v0.3.0 — 2021-08-11 2 | 3 | ## Features 4 | 5 | * **Support for "generic" airflow operators**: you can now use regular python 6 | operators as part of your config files. 7 | 8 | * **Support for “dbt docs” command to generate documentation for all dbt 9 | tasks**: Users can now add “docs generate” as a target in their DOP 10 | configuration and additionally specify a GCS bucket with the `--bucket` 11 | and `--bucket-path` options where documents are copied to. 12 | 13 | * **Serve dbt docs**: Documents generated by dbt can be served as a web page by 14 | deploying the provided app on GAE. 
Note that deploying is an additional step 15 | that needs to be carried out after docs have been generated. See 16 | `infrastructure/dbt-docs/README.md` for details. 17 | 18 | * **dbt tasks artifacts `run_results` created by dbt tasks saved to BigQuery**: 19 | This json file contains information on completed dbt invocations and is saved 20 | in the BQ table “run_results” for analysis and debugging. 21 | 22 | * **Add support for Airflow `v1.10.14` and `v1.10.15` local environments**: 23 | Users can specify which version they want to use by setting 24 | the `AIRFLOW_VERSION` environment variable. 25 | 26 | * **Pre-commit linters**: added pre-commit hooks to ensure python, yaml and some 27 | support for plain text file consistency in formatting and style throughout DOP 28 | codebase. 29 | 30 | ## Changes 31 | 32 | * **Ensure DAGs using the same DBT project do not run concurrently**: Safety 33 | feature to safely allow selective execution of workflows by calling specific 34 | commands or tags (e.g. `dbt run --m`) within a single dbt project. This avoids 35 | creating inter-dependant workflows to avoid overriding each other's artifacts, 36 | since they will share the same target location (within the dbt container). 37 | 38 | * **Test time-partitioning**: Time-partitioning of datetime type properly 39 | validated as part of schema validation. 40 | 41 | * **Use Python 3.7 and dbt 0.19.1 in Composer K8s Operator** 42 | 43 | * **Add Dataflow example task**: with the introduction of "regular" in the yaml 44 | config Airflow Operators, it is now possible to run compute intensive Dataflow 45 | jobs. Check `example_dataflow_template` for an example on how to implement a 46 | Dataflow pipeline. 47 | 48 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Datatonic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build down up 2 | 3 | # Defaults to the latest 4 | AIRFLOW_VERSION := 1.10.15 5 | 6 | include .env 7 | export 8 | DOP_PROJECT_ID := #{REPLACE WITH A GCP PROJECT ID WHERE DOP WILL EXECUTE ALL JOBS} 9 | DOP_LOCATION := #{REPLACE WITH A GCP REGION WHERE DATA WILL BE PERSISTED BY DOP} 10 | 11 | ENVS := PROJECT_ID=$(DOP_PROJECT_ID) \ 12 | LOCATION=$(DOP_LOCATION) 13 | 14 | validate: 15 | if [ -z ${DOP_PROJECT_ID} ]; then \ 16 | echo "DOP_PROJECT_ID must be defined. Aborting";\ 17 | exit 1; \ 18 | elif [ -z ${DOP_LOCATION} ]; then \ 19 | echo "DOP_LOCATION must be defined. Aborting";\ 20 | exit 1; \ 21 | elif [ -z ${AIRFLOW_VERSION} ]; then \ 22 | echo "AIRFLOW_VERSION must be defined. Aborting";\ 23 | exit 1; \ 24 | fi 25 | 26 | build: 27 | $(ENVS) docker-compose up -d --build webserver 28 | 29 | down: validate 30 | $(ENVS) docker-compose down 31 | 32 | up: 33 | $(ENVS) docker-compose up -d 34 | 35 | logs: 36 | docker logs dop_webserver -f 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Table of contents 2 | ================= 3 | * [What is DOP](#what-is-dop) 4 | * [Design Concept](#design-concept) 5 | * [A Typical DOP Orchestration Flow](#a-typical-dop-orchestration-flow) 6 | * [Prerequisites - Run in Docker](#prerequisites---run-in-docker) 7 | * [For DOP Native Features](#for-dop-native-features) 8 | * [For DBT](#for-dbt) 9 | * [Instructions for Setting things up](#instructions-for-setting-things-up) 10 | * [Run Airflow with DOP in Docker - Mac](#run-airflow-with-dop-in-docker---mac) 11 | * [Run Airflow with DOP in Docker - Windows](#run-airflow-with-dop-in-docker---windows) 12 | * [Run on Composer](#run-on-composer) 13 | * [Prerequisites](#prerequisites) 14 | * [Create Composer Cluster](#create-composer-cluster) 15 | * [Deployment](#deployment) 16 | * [Misc](#misc) 17 | * [Service Account Impersonation](#service-account-impersonation) 18 | 19 | # What is DOP 20 | ## Design Concept 21 | DOP is designed to simplify the orchestration effort across many connected components using a configuration file without the need to write any code. 22 | We have a vision to make orchestration easier to manage and more accessible to a wider group of people. 23 | 24 | Here are some of the key design concept behind DOP, 25 | - Built on top of Apache Airflow - Utilises it’s DAG capabilities with interactive GUI 26 | - DAGs without code - YAML + SQL 27 | - Native capabilities (SQL) - Materialisation, Assertion and Invocation 28 | - Extensible via plugins - DBT job, Spark job, Egress job, Triggers, etc 29 | - Easy to setup and deploy - fully automated dev environment and easy to deploy 30 | - Open Source - open sourced under the MIT license 31 | 32 | **Please note that this project is heavily optimised to run with GCP (Google Cloud Platform) services which is our current focus. By focusing on one cloud provider, it allows us to really improve on end user experience through automation** 33 | 34 | ## A Typical DOP Orchestration Flow 35 | ![Typical DOP Flow](docs/a_typical_dop_orchestration_flow.png) 36 | 37 | # Prerequisites - Run in Docker 38 | Note that all the IAM related prerequisites will be available as a Terraform template soon! 39 | 40 | ## For DOP Native Features 41 | 1. 
Download and install Docker https://docs.docker.com/get-docker/ (if you are on Windows, please follow instruction here as there are some additional steps required for it to work https://docs.docker.com/docker-for-windows/install/) 42 | 1. Download and install Google Cloud Platform (GCP) SDK following instructions here https://cloud.google.com/sdk/docs/install. 43 | 1. Create a dedicated service account for docker with limited permissions for the `development` GCP project, the Docker instance is not designed to be connected to the production environment 44 | 1. Call it `dop-docker-user@` and create it in `https://console.cloud.google.com/iam-admin/serviceaccounts?project=` 45 | 1. Assign the `roles/bigquery.dataEditor` and `roles/bigquery.jobUser` role to the service account under `https://console.cloud.google.com/iam-admin/iam?project=` 46 | 1. Your GCP user / group will need to be given the `roles/iam.serviceAccountUser` and the `roles/iam.serviceAccountTokenCreator` role on the`development` project just for the `dop-docker-user` service account in order to enable [Service Account Impersonation](#service-account-impersonation). 47 | ![Grant service account user](docs/grant_service_account_user.png) 48 | 1. Authenticating with your GCP environment by typing in `gcloud auth application-default login` in your terminal and following instructions. Make sure you proceed to the stage where `application_default_credentials.json` is created on your machine (For windows users, make a note of the path, this will be required on a later stage) 49 | 1. Clone this repository to your machine. 50 | 51 | ## For DBT 52 | 1. Setup a service account for your GCP project called `dop-dbt-user` in `https://console.cloud.google.com/iam-admin/serviceaccounts?project=` 53 | 1. Assign the `roles/bigquery.dataEditor` and `roles/bigquery.jobUser` role to the service account at project level under `https://console.cloud.google.com/iam-admin/iam?project=` 54 | 1. Your GCP user / group will need to be given the `roles/iam.serviceAccountUser` and the `roles/iam.serviceAccountTokenCreator` role on the `development` project just for the `dop-dbt-user` service account in order to enable [Service Account Impersonation](#service-account-impersonation). 55 | 56 | # Instructions for Setting things up 57 | 58 | ## Run Airflow with DOP in Docker - Mac 59 | 60 | See [README in the service project setup](examples/service_project/README.md) and follow instructions. 61 | 62 | Once it's setup, you should see example DOP DAGs such as `dop__example_covid19` 63 | ![Airflow in Docker](docs/local_airflow_ui.png) 64 | 65 | ### Local development 66 | 67 | To simplify the development, in the root folder, there is a `Makefile` and a `docker-compose.yml` that start Postgres and Airflow locally 68 | 69 | From the root of the repo run: 70 | ```shell 71 | make build \ 72 | DOP_PROJECT_ID= \ 73 | DOP_LOCATION= 74 | ``` 75 | 76 | For subsequent runs run 77 | ```shell 78 | make up \ 79 | DOP_PROJECT_ID= \ 80 | DOP_LOCATION= 81 | ``` 82 | 83 | To shut the local environment down run: 84 | ```shell 85 | make down \ 86 | DOP_PROJECT_ID= \ 87 | DOP_LOCATION= 88 | ``` 89 | 90 | On Linux, the mounted volumes in container use the native Linux filesystem user/group permissions. 
The Airflow image is started with the user/group 50000 and doesn't have read or write access to some mounted volumes 92 | (check the volumes section in `docker-compose.yml`) 93 | 94 | ```shell 95 | $ docker exec -u airflow -it dop_webserver id 96 | uid=50000(airflow) gid=50000(airflow) groups=50000(airflow) 97 | $ docker exec -u airflow -it dop_webserver ls -ld dags 98 | drwxrwxr-x 5 1001 1001 4096 Jun 4 07:25 dags 99 | $ docker exec -u airflow -it dop_webserver ls -l /secret/gcp-credentials/application_default_credentials.json 100 | -rw------- 1 1001 1001 341 Jun 4 09:54 /secret/gcp-credentials/application_default_credentials.json 101 | ``` 102 | 103 | So, permissions must be updated manually to grant read access to the secrets file and write access to the dags folder. 104 | 105 | 106 | ## Run Airflow with DOP in Docker - Windows 107 | This is currently a work in progress; however, the instructions on what needs to be done are in the [Makefile](examples/service_project/Makefile) 108 | 109 | ## Run on Composer 110 | 111 | ### Prerequisites 112 | 1. Create a dedicated service account for Composer and call it `dop-composer-user` with the following roles at project level 113 | - roles/bigquery.dataEditor 114 | - roles/bigquery.jobUser 115 | - roles/composer.worker 116 | - roles/compute.viewer 117 | 1. Create a dedicated service account for DBT with limited permissions. 118 | 1. [Already done in here if it’s DEV] Call it `dop-dbt-user@` and create it in `https://console.cloud.google.com/iam-admin/serviceaccounts?project=` 119 | 1. [Already done in here if it’s DEV] Assign the `roles/bigquery.dataEditor` and `roles/bigquery.jobUser` role to the service account at project level under `https://console.cloud.google.com/iam-admin/iam?project=` 120 | 1. The `dop-composer-user` will need to be given the `roles/iam.serviceAccountUser` and the `roles/iam.serviceAccountTokenCreator` role just for the `dop-dbt-user` service account in order to enable [Service Account Impersonation](#service-account-impersonation). 121 | 122 | ### Create Composer Cluster 123 | 1. Use the service account already created, `dop-composer-user`, instead of the default service account 124 | 1. Use the following environment variables 125 | ``` 126 | DOP_PROJECT_ID : {REPLACE WITH THE GCP PROJECT ID WHERE DOP WILL PERSIST ALL DATA TO} 127 | DOP_LOCATION : {REPLACE WITH GCP REGION LOCATION WHERE DOP WILL PERSIST ALL DATA TO} 128 | DOP_SERVICE_PROJECT_PATH := {REPLACE WITH THE ABSOLUTE PATH OF THE Service Project, i.e. /home/airflow/gcs/dags/dop_{service project name}} 129 | DOP_INFRA_PROJECT_ID := {REPLACE WITH THE GCP INFRASTRUCTURE PROJECT ID WHERE BUILD ARTIFACTS ARE STORED, i.e. a DBT docker image stored in GCR} 130 | ``` 131 | and optionally 132 | ``` 133 | DOP_GCR_PULL_SECRET_NAME := {This may be needed if the project storing the GCR images is not the same as the one where Cloud Composer runs; however, this might be a better alternative: https://medium.com/google-cloud/using-single-docker-repository-with-multiple-gke-projects-1672689f780c} 134 | ``` 135 | 1. Add the following Python Packages 136 | ``` 137 | dataclasses==0.7 138 | ``` 139 | 1. 
Finally, create a new node pool with the following k8s label 140 | ``` 141 | key: cloud.google.com/gke-nodepool 142 | value: kubernetes-task-pool 143 | ``` 144 | 145 | ### Deployment 146 | See [Service Project README](examples/service_project/README.md#deploy-to-cloud-composer) 147 | 148 | # Misc 149 | ## Service Account Impersonation 150 | Impersonation is a GCP feature that allows a user / service account to impersonate another service account. 151 | This is a very useful feature and offers the following benefits: 152 | - When doing development locally, especially with automation involved (i.e. using Docker), it is very risky to interact with GCP services using your user account directly because it may have a lot of permissions. By impersonating another service account with fewer permissions, it is a lot safer (least privilege) 153 | - No credentials need to be downloaded; all permissions are linked to the user account. If an employee leaves the company, access to GCP will be revoked immediately because the impersonation process is no longer possible 154 | 155 | The following diagram explains how we use impersonation in DOP when it runs in Docker 156 | ![DOP Docker Account Impersonation](docs/dop_docker_account_impersonation.png) 157 | 158 | And when running DBT jobs in production, we also use this technique so that the Composer service account impersonates the `dop-dbt-user` service account, meaning service account keys are not required. 159 | 160 | There are two very good Google articles explaining how impersonation works and why to use it: 161 | - [Using ImpersonatedCredentials for Google Cloud APIs](https://medium.com/google-cloud/using-impersonatedcredentials-for-google-cloud-apis-14581ca990d8) 162 | - [Stop Downloading Google Cloud Service Account Keys!](https://medium.com/@jryancanty/stop-downloading-google-cloud-service-account-keys-1811d44a97d9) 163 | 164 | 165 | ## Pre-commit Linter 166 | The [pre-commit](https://pre-commit.com/) tool runs a number of checks against the code, enforcing that all the code pushed to the repository follows the same guidelines and best practices. In this project the checks are: 167 | * Trim trailing whitespace 168 | * Fix end-of-file 169 | * YAML file format 170 | * Python code formatter using [Black](https://black.readthedocs.io/en/stable/) 171 | * Python style guide using [Flake8](https://flake8.pycqa.org/en/latest/) 172 | 173 | To install locally, follow the [installation guide](https://pre-commit.com/index.html#install) on the pre-commit page 174 | 175 | The normal usage is to run `pre-commit run` after staging files. If the [git hook](https://pre-commit.com/index.html#3-install-the-git-hook-scripts) has been installed, pre-commit will run automatically on `git commit`. 
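
As a footnote to the [Service Account Impersonation](#service-account-impersonation) section above, the pattern can be sketched in a few lines of Python using the `google-auth` library. This is only an illustrative sketch and not the implementation DOP uses internally (e.g. in `dags/dop/util/auth.py`); the service account name simply follows the `dop-docker-user` naming convention from the prerequisites and should be adjusted to your project:

```python
import google.auth
from google.auth import impersonated_credentials
from google.cloud import bigquery

# Source credentials are your own user credentials, e.g. the ones created by
# `gcloud auth application-default login`.
source_credentials, project_id = google.auth.default()

# Impersonate the less privileged service account (illustrative name, matching
# the dop-docker-user convention used earlier in this README).
target_credentials = impersonated_credentials.Credentials(
    source_credentials=source_credentials,
    target_principal=f"dop-docker-user@{project_id}.iam.gserviceaccount.com",
    target_scopes=["https://www.googleapis.com/auth/cloud-platform"],
    lifetime=300,
)

# Any client constructed with these credentials now acts as the impersonated
# service account, without a downloaded key file.
client = bigquery.Client(project=project_id, credentials=target_credentials)
print(list(client.query("SELECT 1 AS ok").result()))
```

For the token exchange to succeed, your user needs the `roles/iam.serviceAccountTokenCreator` role on the target service account, as described in the prerequisites.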
176 | -------------------------------------------------------------------------------- /dags/dop/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/dags/dop/__init__.py -------------------------------------------------------------------------------- /dags/dop/airflow_module/dag_builder/dag_builder_util.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | 6 | 7 | def get_default_dag_start_date(tzinfo): 8 | return datetime( 9 | 1970, 1, 1, tzinfo=tzinfo 10 | ) # You cannot back fill a dag prior to this date 11 | 12 | 13 | def create_dag( 14 | dag_id, 15 | start_date, 16 | schedule_interval=None, 17 | retries=3, 18 | retry_delay=None, 19 | owner="airflow", 20 | depends_on_past=False, 21 | catchup=False, 22 | max_active_runs=5, 23 | concurrency=int(Variable.get("DAG_CONCURRENCY", default_var=2)), 24 | template_searchpath=None, 25 | user_defined_macros=None, 26 | ): 27 | default_args = { 28 | "owner": owner, 29 | "depends_on_past": depends_on_past, 30 | "start_date": start_date, 31 | "retries": retries, 32 | "retry_delay": retry_delay if retry_delay is not None else timedelta(minutes=5), 33 | } 34 | 35 | return DAG( 36 | dag_id=dag_id, 37 | default_args=default_args, 38 | schedule_interval=schedule_interval, 39 | catchup=catchup, 40 | max_active_runs=max_active_runs, 41 | concurrency=concurrency, 42 | template_searchpath=template_searchpath, 43 | user_defined_macros=user_defined_macros, 44 | ) 45 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/common.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.python_operator import PythonOperator 2 | from airflow.sensors.base_sensor_operator import BaseSensorOperator 3 | 4 | 5 | class BasePythonOperator(PythonOperator): 6 | def __init__( 7 | self, 8 | python_callable, 9 | op_args=None, 10 | op_kwargs=None, 11 | provide_context=False, 12 | templates_dict=None, 13 | templates_exts=None, 14 | *args, 15 | **kwargs 16 | ): 17 | if kwargs.get("priority_weight") is None: 18 | kwargs["priority_weight"] = 1 19 | 20 | super(BasePythonOperator, self).__init__( 21 | python_callable=python_callable, 22 | op_args=op_args, 23 | op_kwargs=op_kwargs, 24 | provide_context=provide_context, 25 | templates_dict=templates_dict, 26 | templates_exts=templates_exts, 27 | *args, 28 | **kwargs 29 | ) 30 | 31 | 32 | class AbstractBaseSensorOperator(BaseSensorOperator): 33 | def __init__(self, *args, **kwargs): 34 | super(AbstractBaseSensorOperator, self).__init__(*args, **kwargs) 35 | 36 | 37 | class TransformationOperator(BasePythonOperator): 38 | template_fields = ( 39 | "action", 40 | "target", 41 | "database", 42 | "schema", 43 | "identifier", 44 | "arguments", 45 | "sql", 46 | "templates_dict", 47 | ) 48 | 49 | def __init__( 50 | self, 51 | python_callable, 52 | op_args=None, 53 | op_kwargs=None, 54 | provide_context=False, 55 | templates_dict=None, 56 | templates_exts=None, 57 | *args, 58 | **kwargs 59 | ): 60 | super(TransformationOperator, self).__init__( 61 | python_callable=python_callable, 62 | op_args=op_args, 63 | op_kwargs=op_kwargs, 64 | provide_context=provide_context, 65 | templates_dict=templates_dict, 66 | templates_exts=templates_exts, 67 | *args, 68 
| **kwargs 69 | ) 70 | task = op_kwargs["task"] 71 | self.action = task.kind.action 72 | self.target = task.kind.target 73 | self.database = task.database 74 | self.schema = task.schema 75 | self.identifier = task.identifier 76 | self.arguments = task.options.get("arguments") 77 | self.sql = templates_dict["sql"] 78 | 79 | 80 | class MaterializationOperator(TransformationOperator): 81 | ui_color = "#f2fade" 82 | 83 | 84 | class RecreateSpOperator(TransformationOperator): 85 | ui_color = "#a0c6d9" 86 | 87 | 88 | class InvocationOperator(TransformationOperator): 89 | ui_color = "#99cee8" 90 | 91 | 92 | class AssertOperator(BasePythonOperator): 93 | ui_color = "#fcc2a7" 94 | template_fields = ("assertion_sql", "templates_dict") 95 | 96 | def __init__( 97 | self, 98 | python_callable, 99 | provide_context=False, 100 | templates_dict=None, 101 | *args, 102 | **kwargs 103 | ): 104 | super(AssertOperator, self).__init__( 105 | python_callable=python_callable, 106 | provide_context=provide_context, 107 | templates_dict=templates_dict, 108 | *args, 109 | **kwargs 110 | ) 111 | self.assertion_sql = templates_dict["sql"] 112 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/dbt_k8_operator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from typing import List, Dict 5 | 6 | from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator 7 | from airflow.sensors.base_sensor_operator import apply_defaults 8 | from dop.component.configuration.env import env_config 9 | from dop.airflow_module.operator import dbt_operator_helper 10 | 11 | # List of files generated by dbt docs generate 12 | # https://docs.getdbt.com/reference/commands/cmd-docs 13 | DBT_DOC_FILES = ["index.html", "manifest.json", "catalog.json"] 14 | DBT_DOC_FOLDER = "target" 15 | DBT_USER = "dbtuser" 16 | DBT_RUN_RESULTS_PATH = "target/run_results.json" 17 | 18 | # See: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity 19 | node_pool_affinity = { 20 | "nodeAffinity": { 21 | # requiredDuringSchedulingIgnoredDuringExecution means in order 22 | # for a pod to be scheduled on a node, the node must have the 23 | # specified labels. However, if labels on a node change at 24 | # runtime such that the affinity rules on a pod are no longer 25 | # met, the pod will still continue to run on the node. 26 | "requiredDuringSchedulingIgnoredDuringExecution": { 27 | "nodeSelectorTerms": [ 28 | { 29 | "matchExpressions": [ 30 | { 31 | # When nodepools are created in Google Kubernetes 32 | # Engine, the nodes inside of that nodepool are 33 | # automatically assigned the label 34 | # 'cloud.google.com/gke-nodepool' with the value of 35 | # the nodepool's name. 
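                                    # Note: the value below must therefore match the name of the
                                    # dedicated node pool created for Composer; the README's
                                    # "Create Composer Cluster" step uses "kubernetes-task-pool".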
36 | "key": "cloud.google.com/gke-nodepool", 37 | "operator": "In", 38 | "values": ["kubernetes-task-pool"], 39 | } 40 | ] 41 | } 42 | ] 43 | } 44 | } 45 | } 46 | 47 | 48 | def retrieve_commit_hash(): 49 | with open( 50 | os.path.sep.join([env_config.service_project_path, ".commit-hash"]) 51 | ) as fp: 52 | return fp.read() 53 | 54 | 55 | class DbtK8Operator(KubernetesPodOperator): 56 | template_fields = ( 57 | "action", 58 | "target", 59 | "dbt_project_name", 60 | "image_tag", 61 | "dbt_arguments", 62 | "gcr_pull_secret_name", 63 | "arguments", 64 | ) 65 | ui_color = "#FF694B" 66 | 67 | @apply_defaults 68 | def __init__( 69 | self, 70 | dbt_project_name: str, 71 | dbt_version: str, 72 | dbt_arguments: List[Dict], 73 | *args, 74 | **kwargs, 75 | ): 76 | """ 77 | :param dbt_project_name: the name for the dbt project name inline with what's defined in `.dbt-project-repos.json` 78 | :param dbt_version: Not used 79 | :param args: 80 | :param kwargs: must contain the Task entity 81 | """ 82 | 83 | task = kwargs["task"] 84 | self.dbt_project_name = dbt_project_name 85 | self.dbt_version = "N/A, this is fixed in the docker image" 86 | self.action = task.kind.action 87 | self.target = task.kind.target 88 | self.dbt_arguments = dbt_arguments 89 | self.gcr_pull_secret_name = env_config.gcr_pull_secret_name 90 | self.image_tag = retrieve_commit_hash() 91 | 92 | self._full_refresh = ( 93 | False # used to trigger DBT full refresh, modified via execute() override 94 | ) 95 | 96 | self.arguments = [self.parse_bash_command()] 97 | 98 | super(DbtK8Operator, self).__init__( 99 | name=kwargs["task_id"], 100 | cmds=["/bin/bash", "-c"], 101 | arguments=self.arguments, 102 | get_logs=True, 103 | namespace="default", 104 | image=f"eu.gcr.io/{env_config.infra_project_id}/dop-dbt:{self.image_tag}", 105 | is_delete_operator_pod=True, 106 | env_vars={ 107 | "DOP_PROJECT_ID": env_config.project_id, 108 | "DOP_LOCATION": env_config.location, 109 | }, 110 | image_pull_secrets=self.gcr_pull_secret_name, 111 | affinity=node_pool_affinity, 112 | *args, 113 | **kwargs, 114 | ) 115 | 116 | def execute(self, context): 117 | """ 118 | Override the parent method to ingest required contexts 119 | """ 120 | dag_run_conf = context["dag_run"].conf if context["dag_run"].conf else {} 121 | full_refresh = dag_run_conf.get("full_refresh", False) 122 | 123 | self._full_refresh = full_refresh 124 | 125 | logging.info(f"### IS FULL REFRESH ENABLED: {self._full_refresh}") 126 | 127 | self.arguments = [self.parse_bash_command(context=context)] 128 | 129 | logging.info(f"### Updated arguments: {self.arguments}") 130 | 131 | super(DbtK8Operator, self).execute(context=context) 132 | 133 | def parse_bash_command(self, context=None): 134 | full_refresh_cmd = "" 135 | if self.target != "run": 136 | full_refresh_cmd = "" 137 | elif self.dbt_arguments: 138 | if self._full_refresh and "--full-refresh" not in [ 139 | arg.get("option") for arg in self.dbt_arguments 140 | ]: 141 | full_refresh_cmd = "--full-refresh" 142 | elif self._full_refresh: 143 | full_refresh_cmd = "--full-refresh" 144 | 145 | cmd_for_additional_arguments = "" 146 | 147 | # docs arguments are only used to copy files to GCS, not in the task execution 148 | if self.dbt_arguments and self.target != "docs generate": 149 | cmd_for_additional_arguments = dbt_operator_helper.implode_arguments( 150 | dbt_arguments=self.dbt_arguments 151 | ) 152 | 153 | cmd_to_run_dbt = ( 154 | f"pipenv run dbt --no-use-colors {self.target} --project-dir ./{self.dbt_project_name}" 155 | f" --vars 
{dbt_operator_helper.parsed_cmd_airflow_context_vars(context=context)}" 156 | f" {cmd_for_additional_arguments}" 157 | f" {full_refresh_cmd};" 158 | f" gsutil cp /home/{DBT_USER}/{self.dbt_project_name}/{DBT_RUN_RESULTS_PATH} gs://{os.getenv('GCS_BUCKET')}/dbt/{DBT_RUN_RESULTS_PATH}" 159 | ) 160 | 161 | if self.target == "docs generate": 162 | command = self.copy_docs_to_gcs_command() 163 | if command: 164 | cmd_to_run_dbt += f"; {command}" 165 | 166 | return cmd_to_run_dbt 167 | 168 | def copy_docs_to_gcs_command(self): 169 | """ 170 | Generate gsutil command line to copy doc files generated with dbt docs generate to GCS 171 | """ 172 | command = [] 173 | gcs_bucket = dbt_operator_helper.extract_argument( 174 | self.dbt_arguments, "--bucket" 175 | ) 176 | if not gcs_bucket: 177 | logging.warning("No bucket argument provided. Skipping copy to GCS") 178 | return "" 179 | gcs_path = dbt_operator_helper.extract_argument( 180 | self.dbt_arguments, "--bucket-path", "" 181 | ) 182 | 183 | for doc_file in DBT_DOC_FILES: 184 | doc_file_path = ( 185 | f"/home/{DBT_USER}/{self.dbt_project_name}/{DBT_DOC_FOLDER}/{doc_file}" 186 | ) 187 | logging.info(f"Copying {doc_file} to gs://{gcs_bucket}/{gcs_path}") 188 | command.append( 189 | f"gsutil cp {doc_file_path} gs://{gcs_bucket}/{gcs_path}/{doc_file}" 190 | ) 191 | return ";".join(command) 192 | 193 | def post_execute(self, context, result=None): 194 | """ 195 | This hook is triggered right after self.execute() is called. 196 | It is passed the execution context and any results returned by the 197 | operator. 198 | """ 199 | dbt_operator_helper.save_run_results_in_bq( 200 | env_config.project_id, 201 | self.dbt_project_name, 202 | f"gs://{os.getenv('GCS_BUCKET')}/dbt/{DBT_RUN_RESULTS_PATH}", 203 | ) 204 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/dbt_operator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import List, Dict 4 | 5 | from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.sensors.base_sensor_operator import apply_defaults 8 | from dop.airflow_module.operator import dbt_operator_helper 9 | from dop.component.configuration.env import env_config 10 | 11 | # List of files generated by dbt docs generate 12 | # https://docs.getdbt.com/reference/commands/cmd-docs 13 | DBT_DOC_FILES = ["index.html", "manifest.json", "catalog.json"] 14 | DBT_DOC_FOLDER = "target" 15 | DBT_RUN_RESULTS_PATH = "target/run_results.json" 16 | 17 | 18 | class DbtOperator(BashOperator): 19 | template_fields = ( 20 | "action", 21 | "target", 22 | "dbt_project_path", 23 | "dbt_version", 24 | "dbt_arguments", 25 | "bash_command", 26 | ) 27 | ui_color = "#FF694B" 28 | 29 | @apply_defaults 30 | def __init__( 31 | self, 32 | dbt_project_name: str, 33 | dbt_version: str, 34 | dbt_arguments: List[Dict], 35 | *args, 36 | **kwargs, 37 | ): 38 | """ 39 | :param dbt_project_name: the name for the dbt project name inline with what's defined in `.dbt-project-repos.json` 40 | :param dbt_version: a supported DBT version, version must be >= 0.19.1 41 | :param args: 42 | :param kwargs: must contain the Task entity 43 | """ 44 | 45 | task = kwargs["task"] 46 | self.dbt_project_path = os.path.sep.join( 47 | [env_config.dbt_projects_path, dbt_project_name] 48 | ) 49 | self.dbt_project_name = dbt_project_name 50 | self.dbt_version = dbt_version 51 | 
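        # kind.action / kind.target come from the Task entity parsed from the
        # orchestration config; for dbt tasks the target is one of "run",
        # "test" or "docs generate" (see the schema validation in
        # component/transformation/common/adapter/schema.py).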
self.action = task.kind.action 52 | self.target = task.kind.target 53 | self.dbt_arguments = dbt_arguments 54 | 55 | self._full_refresh = ( 56 | False # used to trigger DBT full refresh, modified via execute() override 57 | ) 58 | 59 | super(DbtOperator, self).__init__( 60 | bash_command=self.parse_bash_command(), *args, **kwargs 61 | ) 62 | 63 | def execute(self, context): 64 | """ 65 | Override the parent method to ingest required contexts 66 | """ 67 | dag_run_conf = context["dag_run"].conf if context["dag_run"].conf else {} 68 | full_refresh = dag_run_conf.get("full_refresh", False) 69 | 70 | self._full_refresh = full_refresh 71 | 72 | logging.info(f"### IS FULL REFRESH ENABLED: {self._full_refresh}") 73 | 74 | self.bash_command = self.parse_bash_command(context=context) 75 | super(DbtOperator, self).execute(context=context) 76 | 77 | def parse_bash_command(self, context=None): 78 | """ 79 | Create a virtualenv and run DBT. Virtualenv is removed regardless if the script is successful or not 80 | """ 81 | 82 | full_refresh_cmd = "" 83 | if self.target != "run": 84 | full_refresh_cmd = "" 85 | elif self.dbt_arguments: 86 | if self._full_refresh and "--full-refresh" not in [ 87 | arg.get("option") for arg in self.dbt_arguments 88 | ]: 89 | full_refresh_cmd = "--full-refresh" 90 | elif self._full_refresh: 91 | full_refresh_cmd = "--full-refresh" 92 | 93 | set_err_handling = "set -xe" 94 | trap = """ 95 | trap 'catch $? $LINENO' ERR 96 | catch() { 97 | echo "Script errored, removing virtualenv" 98 | rm -rf $TMP_DIR 99 | exit 1 100 | } 101 | """ 102 | cmd_for_tmp_dir = "export TMP_DIR=$(mktemp -d)" 103 | cmd_to_print_tmp_dir = "echo TMP_DIR is: $TMP_DIR" 104 | cmd_for_virtualenv = "virtualenv -p python3 $TMP_DIR" 105 | dbt_init = f"PYTHONPATH={env_config.dag_path} python {env_config.dag_path}/dop/component/helper/dbt_init.py --tmp_dir=$TMP_DIR --project_name={self.dbt_project_name}" 106 | cmd_for_activating_virtualenv = "source $TMP_DIR/bin/activate" 107 | install_pip_deps = f"pip install dbt=={self.dbt_version}" 108 | 109 | cmd_for_additional_arguments = "" 110 | 111 | # docs arguments are only used to copy files to GCS, not in the task execution 112 | if self.dbt_arguments and self.target != "docs generate": 113 | cmd_for_additional_arguments = dbt_operator_helper.implode_arguments( 114 | dbt_arguments=self.dbt_arguments 115 | ) 116 | 117 | cmd_to_run_dbt = ( 118 | f"dbt clean --project-dir {self.dbt_project_path} --profiles-dir $TMP_DIR/.dbt" 119 | f" && dbt deps --project-dir {self.dbt_project_path}" 120 | f" && dbt --no-use-colors {self.target} --project-dir {self.dbt_project_path}" 121 | f" --profiles-dir $TMP_DIR/.dbt" 122 | f" --vars {dbt_operator_helper.parsed_cmd_airflow_context_vars(context=context)}" 123 | f" {cmd_for_additional_arguments}" 124 | f" {full_refresh_cmd}" 125 | ) 126 | 127 | cmd_to_remove_tmp_dir = "rm -rf $TMP_DIR" 128 | 129 | return "\n".join( 130 | [ 131 | set_err_handling, 132 | trap, 133 | cmd_for_tmp_dir, 134 | cmd_to_print_tmp_dir, 135 | cmd_for_virtualenv, 136 | dbt_init, # setup dbt profiles.yml & service account secret from Secret Manager 137 | cmd_for_activating_virtualenv, 138 | install_pip_deps, 139 | cmd_to_run_dbt, 140 | cmd_to_remove_tmp_dir, 141 | ] 142 | ) 143 | 144 | def post_execute(self, context, result=None): 145 | """ 146 | This hook is triggered right after self.execute() is called. 147 | It is passed the execution context and any results returned by the 148 | operator. 
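
        For a "docs generate" target, the generated documentation files are
        copied to GCS; the run_results.json artifact is then always loaded
        into BigQuery.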
149 | """ 150 | if self.target == "docs generate": 151 | gcs_bucket = dbt_operator_helper.extract_argument( 152 | self.dbt_arguments, "--bucket" 153 | ) 154 | if not gcs_bucket: 155 | logging.warning("No bucket argument provided. Skipping copy to GCS") 156 | gcs_path = dbt_operator_helper.extract_argument( 157 | self.dbt_arguments, "--bucket-path", "" 158 | ) 159 | logging.info(f"Copying dbt docs JSON files to GCS bucket {gcs_bucket}") 160 | dbt_operator_helper.copy_docs_to_gcs( 161 | gcs_bucket, gcs_path, self.dbt_project_path 162 | ) 163 | 164 | dbt_operator_helper.save_run_results_in_bq( 165 | env_config.project_id, 166 | self.dbt_project_name, 167 | f"{self.dbt_project_path}/{DBT_RUN_RESULTS_PATH}", 168 | ) 169 | 170 | def copy_docs_to_gcs(self, bucket: str, bucket_path: str, project_path: str): 171 | """ 172 | Copy doc files generated with dbt docs generate to GCS 173 | 174 | :param bucket: Bucket where the doc files will be copied 175 | :param bucket_path: Path in the bucket 176 | :param project_path: Local project folder 177 | """ 178 | hook = GoogleCloudStorageHook() 179 | for doc_file in DBT_DOC_FILES: 180 | doc_file_path = f"{project_path}/{DBT_DOC_FOLDER}/{doc_file}" 181 | if os.path.exists(doc_file_path): 182 | logging.info( 183 | f"{doc_file} found. Copying to gs://{bucket}/{bucket_path}" 184 | ) 185 | hook.upload( 186 | bucket, 187 | object=f"{bucket_path}/{doc_file}" if bucket_path else doc_file, 188 | filename=doc_file_path, 189 | mime_type="text/html" 190 | if doc_file.endswith(".html") 191 | else "application/json", 192 | ) 193 | else: 194 | logging.warning(f"{doc_file} not found. Skipping") 195 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/dbt_operator_helper.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import logging 4 | import pathlib 5 | 6 | from google.cloud import bigquery 7 | from google.cloud import storage 8 | from urllib.parse import urlparse 9 | from google.cloud.exceptions import NotFound 10 | 11 | DBT_RUN_RESULTS_TABLE = "run_results" 12 | DBT_RUN_RESULTS_SCHEMA_FILE = "run_results_schema.json" 13 | 14 | 15 | def implode_arguments(dbt_arguments, filter_func=None): 16 | filtered_dbt_arguments = ( 17 | filter_func(dbt_arguments) if filter_func else dbt_arguments 18 | ) 19 | return " ".join( 20 | [ 21 | " ".join( 22 | [ 23 | argument["option"], 24 | "" if argument.get("value") is None else argument.get("value"), 25 | ] 26 | ) 27 | for argument in filtered_dbt_arguments 28 | ] 29 | ) 30 | 31 | 32 | def parsed_cmd_airflow_context_vars(context): 33 | cmd = '"{' 34 | context_vars = ["ds", "ds_nodash", "ts", "ts_nodash", "ts_nodash_with_tz"] 35 | if context: 36 | var_list = [f"'{v}'" + f": '{context[v]}'" for v in context_vars] 37 | else: 38 | var_list = [f"'{v}'" + ": '{{ " + v + " }}'" for v in context_vars] 39 | cmd += ",".join(var_list) 40 | 41 | cmd += '}"' 42 | 43 | return cmd 44 | 45 | 46 | def extract_argument(dbt_arguments: list, name: str, default_value: str = None): 47 | """ 48 | Extract an argument from the argument list. Format is 49 | [ 50 | {'option': 'OPTION1', 'value': 'VALUE1'}, 51 | {'option': 'OPTION2', 'value': 'VALUE2'}, 52 | ... 
53 | ] 54 | 55 | :param dbt_arguments: Argument list 56 | :param name: Argument to extract 57 | :param default_value: Default value to return if not present 58 | """ 59 | return next( 60 | (arg.get("value") for arg in dbt_arguments if arg.get("option") == name), 61 | default_value, 62 | ) 63 | 64 | def save_run_results_in_bq(project_id, dbt_project_name, run_results_path): 65 | """ 66 | Load run_results json file in BigQuery. As a first step is checked if the table 67 | already exists in the schema and if not, it's created. 68 | 69 | To fit BQ schema, the field metadata.env (JSON object) must be serialised 70 | and results.message converted to string, 71 | because depending on the task it can be an integer or a string 72 | """ 73 | table_id = f"{project_id}.{dbt_project_name}.{DBT_RUN_RESULTS_TABLE}" 74 | client = bigquery.Client(project=project_id) 75 | check_run_results_table(client, table_id) 76 | 77 | run_results = {} 78 | if run_results_path.startswith("gs://"): 79 | storage_client = storage.Client() 80 | bucket, path = _parse_gcs_url(run_results_path) 81 | bucket = storage_client.get_bucket(bucket) 82 | blob = bucket.blob(path) 83 | run_results = json.loads(blob.download_as_string()) 84 | else: 85 | with open(run_results_path) as run_results_file: 86 | run_results = json.load(run_results_file) 87 | 88 | if run_results["metadata"]["env"]: 89 | run_results["metadata"]["env"] = json.dumps(run_results["metadata"]["env"]) 90 | else: 91 | del run_results["metadata"]["env"] 92 | for item in run_results["results"]: 93 | item["message"] = str(item["message"]) 94 | 95 | data_as_file = io.StringIO(json.dumps(run_results)) 96 | job_config = bigquery.LoadJobConfig( 97 | source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON 98 | ) 99 | job = client.load_table_from_file(data_as_file, table_id, job_config=job_config) 100 | try: 101 | result = job.result() # Waits for table load to complete. 102 | logging.info("Pushed {} rows into run_results table".format(result.output_rows)) 103 | except Exception: 104 | logging.info(f"Error loading run_results to BigQuery: {job.errors}") 105 | 106 | 107 | def check_run_results_table(client, table_id): 108 | """ 109 | Check if run_results table exists in BigQuery, and if not create it 110 | """ 111 | try: 112 | client.get_table(table_id) 113 | except NotFound: 114 | print("Table {} is not found.".format(table_id)) 115 | current_folder = pathlib.Path(__file__).parent.absolute() 116 | schema = client.schema_from_json( 117 | f"{current_folder}/{DBT_RUN_RESULTS_SCHEMA_FILE}" 118 | ) 119 | table = bigquery.Table(table_id, schema=schema) 120 | client.create_table(table) 121 | 122 | 123 | def _parse_gcs_url(gsurl): 124 | """ 125 | Given a Google Cloud Storage URL (gs:///), returns a 126 | tuple containing the corresponding bucket and blob. 
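
    For example, "gs://my-bucket/dbt/run_results.json" is parsed into
    ("my-bucket", "dbt/run_results.json").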
127 | """ 128 | parsed_url = urlparse(gsurl) 129 | bucket = parsed_url.netloc 130 | blob = parsed_url.path.lstrip("/") 131 | return bucket, blob 132 | 133 | -------------------------------------------------------------------------------- /dags/dop/airflow_module/operator/run_results_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "elapsed_time", 5 | "type": "FLOAT" 6 | }, 7 | { 8 | "fields": [ 9 | { 10 | "mode": "REPEATED", 11 | "name": "models", 12 | "type": "STRING" 13 | }, 14 | { 15 | "mode": "NULLABLE", 16 | "name": "version_check", 17 | "type": "BOOLEAN" 18 | }, 19 | { 20 | "mode": "NULLABLE", 21 | "name": "rpc_method", 22 | "type": "STRING" 23 | }, 24 | { 25 | "mode": "NULLABLE", 26 | "name": "vars", 27 | "type": "STRING" 28 | }, 29 | { 30 | "mode": "NULLABLE", 31 | "name": "log_format", 32 | "type": "STRING" 33 | }, 34 | { 35 | "mode": "NULLABLE", 36 | "name": "project_dir", 37 | "type": "STRING" 38 | }, 39 | { 40 | "mode": "NULLABLE", 41 | "name": "profiles_dir", 42 | "type": "STRING" 43 | }, 44 | { 45 | "mode": "NULLABLE", 46 | "name": "which", 47 | "type": "STRING" 48 | }, 49 | { 50 | "mode": "NULLABLE", 51 | "name": "use_cache", 52 | "type": "BOOLEAN" 53 | }, 54 | { 55 | "mode": "NULLABLE", 56 | "name": "use_colors", 57 | "type": "BOOLEAN" 58 | }, 59 | { 60 | "mode": "NULLABLE", 61 | "name": "write_json", 62 | "type": "BOOLEAN" 63 | }, 64 | { 65 | "mode": "NULLABLE", 66 | "name": "schema", 67 | "type": "BOOLEAN" 68 | }, 69 | { 70 | "mode": "NULLABLE", 71 | "name": "data", 72 | "type": "BOOLEAN" 73 | }, 74 | { 75 | "mode": "NULLABLE", 76 | "name": "full_refresh", 77 | "type": "BOOLEAN" 78 | } 79 | ], 80 | "mode": "NULLABLE", 81 | "name": "args", 82 | "type": "RECORD" 83 | }, 84 | { 85 | "fields": [ 86 | { 87 | "mode": "NULLABLE", 88 | "name": "message", 89 | "type": "STRING" 90 | }, 91 | { 92 | "fields": [ 93 | { 94 | "description": "bq-datetime", 95 | "mode": "NULLABLE", 96 | "name": "completed_at", 97 | "type": "TIMESTAMP" 98 | }, 99 | { 100 | "description": "bq-datetime", 101 | "mode": "NULLABLE", 102 | "name": "started_at", 103 | "type": "TIMESTAMP" 104 | }, 105 | { 106 | "mode": "NULLABLE", 107 | "name": "name", 108 | "type": "STRING" 109 | } 110 | ], 111 | "mode": "REPEATED", 112 | "name": "timing", 113 | "type": "RECORD" 114 | }, 115 | { 116 | "mode": "NULLABLE", 117 | "name": "thread_id", 118 | "type": "STRING" 119 | }, 120 | { 121 | "fields": [ 122 | { 123 | "mode": "NULLABLE", 124 | "name": "bytes_processed", 125 | "type": "INTEGER" 126 | }, 127 | { 128 | "mode": "NULLABLE", 129 | "name": "code", 130 | "type": "STRING" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "_message", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "rows_affected", 140 | "type": "INTEGER" 141 | } 142 | ], 143 | "mode": "NULLABLE", 144 | "name": "adapter_response", 145 | "type": "RECORD" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "unique_id", 150 | "type": "STRING" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "execution_time", 155 | "type": "FLOAT" 156 | }, 157 | { 158 | "mode": "NULLABLE", 159 | "name": "status", 160 | "type": "STRING" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "results", 165 | "type": "RECORD" 166 | }, 167 | { 168 | "fields": [ 169 | { 170 | "mode": "NULLABLE", 171 | "name": "env", 172 | "type": "STRING" 173 | }, 174 | { 175 | "mode": "NULLABLE", 176 | "name": "invocation_id", 177 | "type": "STRING" 178 
| }, 179 | { 180 | "mode": "NULLABLE", 181 | "name": "dbt_version", 182 | "type": "STRING" 183 | }, 184 | { 185 | "description": "bq-datetime", 186 | "mode": "NULLABLE", 187 | "name": "generated_at", 188 | "type": "TIMESTAMP" 189 | }, 190 | { 191 | "mode": "NULLABLE", 192 | "name": "dbt_schema_version", 193 | "type": "STRING" 194 | } 195 | ], 196 | "mode": "NULLABLE", 197 | "name": "metadata", 198 | "type": "RECORD" 199 | } 200 | ] 201 | -------------------------------------------------------------------------------- /dags/dop/component/configuration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/dags/dop/component/configuration/__init__.py -------------------------------------------------------------------------------- /dags/dop/component/configuration/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DOP_DBT_USER = "dop-dbt-user" 4 | DOP_DOCKER_USER = "dop-docker-user" 5 | 6 | 7 | class EnvConfig: 8 | def __init__(self): 9 | pass 10 | 11 | @property 12 | def environment(self): 13 | return os.environ["DOP_ENVIRONMENT"] 14 | 15 | @property 16 | def project_id(self): 17 | return os.environ["DOP_PROJECT_ID"] 18 | 19 | @property 20 | def dag_path(self): 21 | return os.path.sep.join( 22 | [self.service_project_path, "embedded_dop", "source", "dags"] 23 | ) 24 | 25 | @property 26 | def location(self): 27 | return os.environ["DOP_LOCATION"] 28 | 29 | @property 30 | def orchestration_path(self): 31 | return os.path.sep.join( 32 | [self.service_project_path, "embedded_dop", "orchestration"] 33 | ) 34 | 35 | @property 36 | def is_sandbox_environment(self): 37 | return bool(os.environ.get("DOP_SANDBOX_ENVIRONMENT", False)) 38 | 39 | @property 40 | def service_project_path(self): 41 | return os.environ["DOP_SERVICE_PROJECT_PATH"] 42 | 43 | @property 44 | def infra_project_id(self): 45 | return os.environ["DOP_INFRA_PROJECT_ID"] 46 | 47 | @property 48 | def gcr_pull_secret_name(self): 49 | return os.environ.get("DOP_GCR_PULL_SECRET_NAME", None) 50 | 51 | @property 52 | def dbt_projects_path(self): 53 | """ 54 | An Alias of service_project_path because it is also the DBT project paths by convention. 
55 | This may however be changed to have its own environment variable in the future if it is required to 56 | differentiate the DBT projects setup from the service project path itself 57 | :return: 58 | """ 59 | return self.service_project_path 60 | 61 | 62 | env_config = EnvConfig() 63 | -------------------------------------------------------------------------------- /dags/dop/component/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/dags/dop/component/helper/__init__.py -------------------------------------------------------------------------------- /dags/dop/component/helper/dbt_init.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from dop.component.helper import dbt_profile 6 | 7 | if __name__ == "__main__": 8 | logging.getLogger().setLevel(logging.INFO) 9 | parser = argparse.ArgumentParser(description="Process arguments") 10 | parser.add_argument( 11 | "--tmp_dir", 12 | required=True, 13 | type=str, 14 | help="the TMP DIR where the python virtual environment is created", 15 | ) 16 | parser.add_argument( 17 | "--project_name", required=True, type=str, help="DBT project name" 18 | ) 19 | 20 | args = parser.parse_args() 21 | 22 | os.mkdir(os.path.sep.join([args.tmp_dir, ".dbt"])) 23 | 24 | logging.info(f"Creating DBT profiles.yml in {args.tmp_dir}") 25 | dbt_profile.setup_and_save_profiles( 26 | project_name=args.project_name, profile_path=args.tmp_dir 27 | ) 28 | 29 | logging.info("Done.") 30 | -------------------------------------------------------------------------------- /dags/dop/component/helper/dbt_profile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | from dop.component.transformation.common.parser.yaml_parser import ( 5 | yaml_to_dict, 6 | dict_to_yaml, 7 | ) 8 | from dop.component.configuration.env import env_config 9 | from dop.component.configuration import env 10 | 11 | 12 | def setup_profiles(project_name): 13 | project_id = env_config.project_id 14 | location = env_config.location 15 | # the profile is generated dynamically at runtime, therefore multiple target profiles are not required 16 | target = "all" 17 | target_type = "bigquery" 18 | path_to_dbt_project_yml = os.path.sep.join( 19 | [env_config.dbt_projects_path, project_name, "dbt_project.yml"] 20 | ) 21 | 22 | bq_profile = {} 23 | 24 | if not os.path.isfile(path_to_dbt_project_yml): 25 | raise RuntimeError( 26 | f"Profile yaml `{path_to_dbt_project_yml}` does not exist, " 27 | f"was the DBT option `project` in the orchestration config set correctly?" 28 | ) 29 | 30 | with open(path_to_dbt_project_yml) as fp: 31 | profile_id = yaml_to_dict(fp.read())["profile"] 32 | bq_profile[profile_id] = { 33 | "target": target, 34 | "outputs": { 35 | target: { 36 | "type": target_type, 37 | "method": "oauth", 38 | "project": project_id, 39 | "schema": str(project_name).replace("-", "_"), 40 | # TODO: Is this the right default, should `schema` be passed in via an environment variable? 
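                    # Using method "oauth" together with impersonate_service_account
                    # (below) means dbt runs as the dop-dbt-user service account
                    # without needing a downloaded service account key.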
41 | "threads": 1, 42 | "timeout_seconds": 300, 43 | "location": location, 44 | "priority": "interactive", 45 | "impersonate_service_account": f"{env.DOP_DBT_USER}@{project_id}.iam.gserviceaccount.com", 46 | } 47 | }, 48 | } 49 | 50 | profile = dict_to_yaml(bq_profile) 51 | logging.info(f"DBT Profile: {profile}") 52 | 53 | return profile 54 | 55 | 56 | def setup_and_save_profiles(project_name, profile_path): 57 | profile_yml = setup_profiles(project_name=project_name) 58 | path_to_profile_yml = os.path.sep.join([profile_path, ".dbt", "profiles.yml"]) 59 | logging.info(f"Updating: {path_to_profile_yml}") 60 | with open(path_to_profile_yml, "w+") as fp: 61 | fp.write(profile_yml) 62 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/adapter/model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass(frozen=True) 5 | class Argument: 6 | name: str 7 | type: str 8 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/adapter/relation.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | class RelationValueError(ValueError): 5 | pass 6 | 7 | 8 | @dataclass(frozen=True) 9 | class BaseRelation: 10 | database: str 11 | schema: str 12 | identifier: str 13 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/adapter/schema.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import copy 3 | 4 | from dataclasses import dataclass 5 | from croniter import croniter 6 | from typing import List, Optional 7 | from decimal import Decimal 8 | 9 | from marshmallow import validate, post_load, Schema, fields 10 | 11 | TASK_KIND_MATERI = "materialization" 12 | TASK_KIND_ASSERT = "assertion" 13 | TASK_KIND_INVOKE = "invocation" 14 | TASK_KIND_AIRFLOW_OPERATOR = "airflow_operator" 15 | TASK_KIND_DBT = "dbt" 16 | 17 | NATIVE_TASK_KIND = [TASK_KIND_MATERI, TASK_KIND_ASSERT, TASK_KIND_INVOKE] 18 | CUSTOM_TASK_KIND = [TASK_KIND_DBT] 19 | 20 | 21 | def dbt_argument_validation_mapper(option, value): 22 | allowed_options = [ 23 | "-m", 24 | "-x", 25 | "--fail-fast", 26 | "--threads", 27 | "--exclude", 28 | "--full-refresh", 29 | "--bucket", 30 | "--bucket-path", 31 | ] 32 | if option not in allowed_options: 33 | raise DbtTaskException( 34 | f"Supported DBT command line argument options are: {allowed_options}, `{option}` supplied" 35 | ) 36 | 37 | 38 | def dbt_validation_func(task): 39 | allowed_options = ["run", "test", "docs generate"] 40 | if task.kind.target not in allowed_options: 41 | raise DbtTaskException( 42 | f"DBT task.kind.target must be one of {allowed_options}, `{task.kind.target}` supplied" 43 | ) 44 | 45 | # check version 46 | dbt_version = task.options.get("version") 47 | if not dbt_version: 48 | raise DbtTaskException("DBT version must be supplied in the configuration") 49 | 50 | v_major, v_minor, v_patch = dbt_version.split(".") 51 | if int(v_major) < 0 or Decimal(f"{v_minor}.{v_patch}") < Decimal("19.1"): 52 | raise DbtTaskException( 53 | f"DBT version must be >= 0.19.1, {dbt_version} is supplied" 54 | ) 55 | 56 | # check DBT arguments, only allow certain arguments to be used 57 | arguments = task.options.get("arguments") 58 | 59 | if arguments: 60 | for argument in 
arguments: 61 | dbt_argument_validation_mapper( 62 | option=argument.get("option"), value=argument.get("value") 63 | ) 64 | 65 | 66 | def materialization_validation_func(task): 67 | allowed_options = ["table", "view", "udf", "stored_procedure", "schema"] 68 | if task.kind.target not in allowed_options: 69 | raise MaterializationTaskException( 70 | f"Materialization task.kind.target must be one of {allowed_options}, `{task.kind.target}`supplied" 71 | ) 72 | 73 | 74 | def assertion_validation_func(task): 75 | allowed_options = ["assertion", "assertion_sensor"] 76 | if task.kind.target not in allowed_options: 77 | raise AssertionTaskException( 78 | f"Assertion task.kind.target must be one of {allowed_options}, {task.kind.target} supplied" 79 | ) 80 | 81 | 82 | def invocation_validation_func(task): 83 | allowed_options = ["stored_procedure"] 84 | if task.kind.target not in allowed_options: 85 | raise InvocationTaskException( 86 | f"Invocation task.kind.target must be one of {allowed_options}, {task.kind.target} supplied" 87 | ) 88 | 89 | 90 | def data_validation_mapper(task): 91 | if task.kind.action == TASK_KIND_DBT: 92 | return dbt_validation_func 93 | elif task.kind.action == TASK_KIND_MATERI: 94 | return materialization_validation_func 95 | elif task.kind.action == TASK_KIND_ASSERT: 96 | return assertion_validation_func 97 | elif task.kind.action == TASK_KIND_INVOKE: 98 | return invocation_validation_func 99 | 100 | return None 101 | 102 | 103 | class InvalidDagConfig(ValueError): 104 | pass 105 | 106 | 107 | class DbtTaskException(InvalidDagConfig): 108 | pass 109 | 110 | 111 | class MaterializationTaskException(InvalidDagConfig): 112 | pass 113 | 114 | 115 | class AssertionTaskException(InvalidDagConfig): 116 | pass 117 | 118 | 119 | class InvocationTaskException(InvalidDagConfig): 120 | pass 121 | 122 | 123 | class IsValidCron(validate.Validator): 124 | default_message = "Not a valid Cron Expression" 125 | 126 | def __call__(self, value) -> typing.Any: 127 | message = ( 128 | f"The schedule_interval expression `{value}` must be a valid CRON expression: " 129 | "validate it here https://crontab.guru/" 130 | ) 131 | if not croniter.is_valid(value): 132 | raise validate.ValidationError(message) 133 | 134 | return value 135 | 136 | 137 | @dataclass 138 | class Partitioning: 139 | field: str 140 | data_type: str 141 | 142 | 143 | class PartitioningSchema(Schema): 144 | field = fields.String(validate=validate.OneOf(["date"])) 145 | data_type = fields.String( 146 | validate=validate.OneOf(["timestamp", "datetime", "date"]) 147 | ) 148 | 149 | 150 | @dataclass 151 | class Kind: 152 | action: str 153 | target: str 154 | 155 | 156 | class KindSchema(Schema): 157 | action = fields.String( 158 | validate=validate.OneOf( 159 | [ 160 | TASK_KIND_MATERI, 161 | TASK_KIND_ASSERT, 162 | TASK_KIND_INVOKE, 163 | TASK_KIND_DBT, 164 | TASK_KIND_AIRFLOW_OPERATOR, 165 | ] 166 | ) 167 | ) 168 | target = fields.String() 169 | 170 | 171 | @dataclass 172 | class Task: 173 | kind: Kind 174 | database: Optional[str] 175 | schema: Optional[str] 176 | identifier: str 177 | partitioning: Optional[Partitioning] 178 | dependencies: List[str] 179 | options: dict 180 | 181 | 182 | class TaskSchema(Schema): 183 | kind = fields.Nested(KindSchema, required=True) 184 | database = fields.String(required=False) 185 | schema = fields.String(required=False) 186 | identifier = fields.String(required=True) 187 | partitioning = fields.Nested(PartitioningSchema, required=False, missing=None) 188 | dependencies = 
fields.List(cls_or_instance=fields.Str(), required=False, missing=[]) 189 | options = fields.Dict(required=False, missing={}) 190 | 191 | 192 | @dataclass 193 | class DagConfig: 194 | enabled: bool 195 | timezone: str 196 | schedule_interval: str 197 | params: Optional[dict] 198 | database: str 199 | schema: str 200 | tasks: List[Task] 201 | 202 | 203 | class DagConfigSchema(Schema): 204 | enabled = fields.Bool(required=False, missing=True) 205 | timezone = fields.Str(required=True) 206 | schedule_interval = fields.Str(validate=IsValidCron(), missing=None) 207 | params = fields.Dict(missing={}) 208 | database = fields.Str(required=True) 209 | schema = fields.Str(required=True) 210 | tasks = fields.List(cls_or_instance=fields.Nested(TaskSchema), required=True) 211 | 212 | @post_load 213 | def make_dag_config(self, data, **kwargs): 214 | data_with_objects = copy.deepcopy(data) 215 | tasks = [] 216 | for task in data_with_objects["tasks"]: 217 | task["kind"] = Kind(**task["kind"]) 218 | task["database"] = ( 219 | task["database"] 220 | if "database" in task 221 | else data_with_objects["database"] 222 | ) 223 | task["schema"] = ( 224 | task["schema"] if "schema" in task else data_with_objects["schema"] 225 | ) 226 | task["partitioning"] = ( 227 | Partitioning(**task["partitioning"]) 228 | if task["partitioning"] is not None 229 | else task["partitioning"] 230 | ) 231 | 232 | task_entity = Task(**task) 233 | 234 | # Additional validation 235 | validation_func = data_validation_mapper(task=task_entity) 236 | if validation_func: 237 | validation_func(task_entity) 238 | 239 | tasks.append(task_entity) 240 | 241 | data_with_objects["tasks"] = tasks 242 | return DagConfig(**data_with_objects) 243 | 244 | 245 | def load_dag_schema(payload) -> DagConfig: 246 | schema = DagConfigSchema() 247 | result = schema.load(payload) 248 | 249 | if result.errors: 250 | raise InvalidDagConfig(result.errors) 251 | 252 | return result.data 253 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/parser/yaml_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import yaml 4 | 5 | # Use LibYAML bindings if installed, much faster than the pure Python version 6 | # See https://pyyaml.org/wiki/PyYAMLDocumentation for details 7 | try: 8 | from yaml import CLoader as Loader 9 | except ImportError: 10 | from yaml import Loader # type: ignore 11 | 12 | 13 | def dict_to_yaml(dct: dict) -> str: 14 | """ 15 | Convert a dictionary to a YAML serialised string 16 | 17 | :param dct: Dictionary to be converted 18 | :return: serialised dictionary in YAML format 19 | """ 20 | yml: dict = yaml.load(json.dumps(dct), Loader=Loader) 21 | return yaml.dump(yml) 22 | 23 | 24 | def yaml_to_dict(yml: str) -> dict: 25 | """ 26 | Convert a YAML serialised string to a dictionary 27 | 28 | Using unsafe_load to allow complex python types in YAML. 
29 | For example in an airflow sensor operator we can define as param 30 | execution_date: !!python/object/apply:datetime.timedelta [1] 31 | 32 | As we are defining the yaml and it cannot be modified by third parties, 33 | there is no risk of injection and execution of arbitrary code 34 | 35 | :param yml: YAML serialised string to be converted 36 | :return: Converted dictionary 37 | """ 38 | return yaml.unsafe_load(yml) 39 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/templating/jinja.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | from jinja2 import FileSystemLoader, Environment 5 | from dop import definitions 6 | 7 | 8 | def log(msg, info=False): 9 | if info: 10 | logging.info(msg) 11 | else: 12 | logging.debug(msg) 13 | return "" 14 | 15 | 16 | def raise_error(error): 17 | raise RuntimeError(error) 18 | 19 | 20 | class RunnerEnvironment: 21 | def __init__(self, runner): 22 | if runner not in definitions.SUPPORTED_TRANSFORMATION_RUNNERS: 23 | raise NotImplementedError( 24 | f'Transformation Runner "{runner}" is not supported' 25 | ) 26 | 27 | self._runner = runner 28 | 29 | def _get_runner_template_paths(self): 30 | common_template_path = os.path.join( 31 | definitions.ROOT_DIR, 32 | "component", 33 | "transformation", 34 | "common", 35 | "templating", 36 | "template", 37 | ) 38 | 39 | runner_base_path = os.path.join( 40 | definitions.ROOT_DIR, 41 | "component", 42 | "transformation", 43 | "runner", 44 | self._runner, 45 | "template", 46 | "macro", 47 | ) 48 | 49 | if not os.path.isdir(runner_base_path): 50 | raise RuntimeError(f"Path `{runner_base_path}` does not exist") 51 | 52 | if not os.path.isdir(common_template_path): 53 | raise RuntimeError(f"Path `{common_template_path}` does not exist") 54 | 55 | return [common_template_path, runner_base_path] 56 | 57 | def get_env(self): 58 | template_loader = FileSystemLoader(self._get_runner_template_paths()) 59 | runner_env = Environment(loader=template_loader, extensions=["jinja2.ext.do"]) 60 | 61 | runner_env.globals["log"] = log 62 | runner_env.globals["raise"] = raise_error 63 | 64 | return runner_env 65 | 66 | 67 | def get_runner_environment(runner): 68 | return RunnerEnvironment(runner=runner).get_env() 69 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/common/templating/template/global.sql: -------------------------------------------------------------------------------- 1 | {% macro is_incremental() -%} 2 | {% if not dag_run.conf or dag_run.conf.get('full_refresh') != true %} 3 | true 4 | {% endif %} 5 | {%- endmacro -%} 6 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/adapter/impl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import jinja2 3 | 4 | from typing import Optional, Dict, Any, List 5 | from google.cloud import bigquery 6 | from google.cloud.exceptions import GoogleCloudError, NotFound 7 | 8 | from dop.component.configuration import env 9 | from dop.component.transformation.common.parser import yaml_parser 10 | from dop.component.transformation.common.templating import jinja 11 | from dop.component.transformation.runner.bigquery.adapter.model import ( 12 | TableOptionsConfig, 13 | PartitionConfig, 14 | UDFArgument, 15 | StoredProcedureArgument, 16 | ) 17 
| from dop.component.transformation.runner.bigquery.adapter.relation import ( 18 | BigQueryRelation as Relation, 19 | RelationHelper, 20 | ) 21 | from dop.component.util import auth 22 | 23 | 24 | def get_query_job_config( 25 | destination, 26 | write_disposition=bigquery.WriteDisposition.WRITE_EMPTY, 27 | create_disposition=bigquery.CreateDisposition.CREATE_NEVER, 28 | ): 29 | job_config = bigquery.QueryJobConfig() 30 | job_config.destination = destination 31 | job_config.write_disposition = write_disposition 32 | job_config.create_disposition = create_disposition 33 | 34 | return job_config 35 | 36 | 37 | def execute_job_with_error_logging(job): 38 | if job.dry_run: 39 | logging.info( 40 | f"Total GB it will process: {job.total_bytes_processed / 1024 / 1024 / 1024}" 41 | ) 42 | return 43 | 44 | try: 45 | job.result() 46 | logging.info("Affected: {} rows".format(job.num_dml_affected_rows)) 47 | logging.info("Job completed...") 48 | except GoogleCloudError as e: 49 | logging.error(e) 50 | logging.error(job.error_result) 51 | logging.error(job.errors) 52 | raise e 53 | 54 | 55 | def get_database(): 56 | return env.env_config.project_id 57 | 58 | 59 | def get_bq_client(project_id, location, credentials): 60 | return bigquery.client.Client( 61 | project=project_id, location=location, credentials=credentials 62 | ) 63 | 64 | 65 | def get_query_runner(options: Optional[Dict[str, Any]]): 66 | dry_run = options.get("dry_run", True) 67 | project_id = options["project_id"] 68 | location = options["location"] 69 | credentials = None 70 | 71 | if not project_id: 72 | raise ValueError("BigQuery requires a project_id") 73 | 74 | if env.env_config.is_sandbox_environment: 75 | credentials = auth.ServiceAccountImpersonationCredentialManager( 76 | source_sa_name=env.DOP_DOCKER_USER, project_id=project_id 77 | ).get_target_credentials() 78 | logging.info( 79 | f"Using service account impersonation, and service account `{env.DOP_DOCKER_USER}` is now active" 80 | ) 81 | 82 | client = get_bq_client( 83 | project_id=project_id, location=location, credentials=credentials 84 | ) 85 | jinja_environment = jinja.get_runner_environment(runner="bigquery") 86 | return QueryRunner( 87 | client=client, jinja_environment=jinja_environment, dry_run=dry_run 88 | ) 89 | 90 | 91 | class QueryRunner: 92 | def __init__( 93 | self, 94 | client: bigquery.client.Client, 95 | jinja_environment: jinja2.Environment, 96 | dry_run=True, 97 | ): 98 | self._client = client 99 | self._jinja_environment = jinja_environment 100 | self._dry_run = dry_run 101 | self._relation_helper = RelationHelper(client=self._client) 102 | 103 | def write_append(self, query, relation): 104 | job_config = get_query_job_config( 105 | destination=relation, 106 | write_disposition=bigquery.WriteDisposition.WRITE_APPEND, 107 | ) 108 | 109 | logging.info(f"Appending data to {relation} using query: {query}") 110 | 111 | query_job = self._client.query(query=query, job_config=job_config) 112 | execute_job_with_error_logging(job=query_job) 113 | 114 | def replace_or_upsert( 115 | self, query: str, relation: Relation, options: Optional[Dict[str, Any]] = None 116 | ): 117 | """ 118 | TODO: this has quite a bit of duplication, needs tidying up 119 | This does a full replacement or an upsert to the target relation. 120 | The function has the following behaviour 121 | 1. When the target table does not exist, it creates the table using schema inferred by the query results 122 | 2. 
If the target table is already there, it writes into a temp table first and then merge to the target table so the process is atomic and will break if the schema is changed 123 | 124 | :param query: SQL Query 125 | :param relation: BigQuery relation to write truncate 126 | :param options: For options such as specifying partitions, forcing full refresh etc 127 | Range based Partition: 128 | options={'partition_key': 'key', 'partition_data_type': 'int64', 'partition_range': {'start': 0, 'end': 1000, 'interval': 100 } } 129 | 130 | Date based Partition: 131 | options={'partition_key': 'key', 'partition_data_type': 'datetime'} 132 | 133 | """ 134 | 135 | def query_cleaned(q): 136 | if q.strip()[:-1] == ";": 137 | q = q.strip()[:-1] 138 | if q.strip()[:-1] == ";": 139 | raise RuntimeError( 140 | f"Query {query} should not be followed by `;` at the very end" 141 | ) 142 | 143 | return q 144 | 145 | query = query_cleaned(q=query) 146 | 147 | options = {} if not options else options 148 | full_refresh = options.get("full_refresh", False) 149 | 150 | tmp_relation = Relation( 151 | database=relation.database, 152 | schema=relation.schema, 153 | identifier="_tmp_" + relation.identifier, 154 | ) 155 | 156 | template_create_or_replace = self._jinja_environment.from_string( 157 | """ 158 | {% import 'materialization/table_create_or_replace.sql' as materialise_table %} 159 | {{ materialise_table.create_or_replace(query, relation_helper, options) }} 160 | """ 161 | ) 162 | 163 | table_options_config = TableOptionsConfig( 164 | options={ 165 | "expiration_timestamp": "TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 12 hour)" 166 | } 167 | ) 168 | 169 | partition_config = PartitionConfig.create(options=options) 170 | 171 | rendered_create_or_replace = template_create_or_replace.render( 172 | query=query, 173 | relation_helper=self._relation_helper, 174 | options={ 175 | "relation": relation, 176 | "tmp_relation": tmp_relation, 177 | "table_options_config": table_options_config, 178 | "partition_config": partition_config, 179 | "full_refresh": full_refresh, 180 | }, 181 | ) 182 | 183 | logging.info("Running Query: {}".format(rendered_create_or_replace)) 184 | 185 | job_config = bigquery.QueryJobConfig(dry_run=self._dry_run) 186 | query_job = self._client.query( 187 | query=rendered_create_or_replace, job_config=job_config 188 | ) 189 | 190 | execute_job_with_error_logging(job=query_job) 191 | 192 | template_upsert = self._jinja_environment.from_string( 193 | """ 194 | {% import 'materialization/table_upsert.sql' as materialise_table %} 195 | {{ materialise_table.upsert(query, relation_helper, options) }} 196 | """ 197 | ) 198 | 199 | rendered_upsert = template_upsert.render( 200 | query=query, 201 | relation_helper=self._relation_helper, 202 | options={ 203 | "relation": relation, 204 | "tmp_relation": tmp_relation, 205 | "partition_config": partition_config, 206 | "full_refresh": full_refresh, 207 | }, 208 | ) 209 | 210 | logging.info("Running the Upsert Query: {}".format(rendered_upsert)) 211 | 212 | job_config = bigquery.QueryJobConfig(dry_run=self._dry_run) 213 | query_job = self._client.query(query=rendered_upsert, job_config=job_config) 214 | 215 | execute_job_with_error_logging(job=query_job) 216 | 217 | def recreate_view(self, query, relation: Relation): 218 | full_table_id = f"{relation.database}.{relation.schema}.{relation.identifier}" 219 | view = bigquery.table.Table(table_ref=full_table_id) 220 | view.view_query = query 221 | 222 | logging.info("Executing query: {}".format(query)) 223 | 224 | try: 
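            # Drop the existing view first (a missing view is tolerated via the
            # NotFound handler below), then recreate it with the new view_query.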
225 | self._client.delete_table(view) 226 | except NotFound: 227 | logging.info( 228 | "View {} not found, ignore the delete operation".format(relation) 229 | ) 230 | pass 231 | 232 | self._client.create_table(view) 233 | 234 | logging.info("View: {} has been created".format(relation)) 235 | 236 | def create_schema(self, project_id, dataset_id, exists_ok=True): 237 | # TODO: add support to dataset level TTL 238 | full_dataset_id = f"{project_id}.{dataset_id}" 239 | dataset = bigquery.dataset.Dataset(dataset_ref=full_dataset_id) 240 | 241 | self._client.create_dataset(dataset=dataset, exists_ok=exists_ok) 242 | logging.info( 243 | "New Dataset {} already exists or has been created".format(full_dataset_id) 244 | ) 245 | 246 | def recreate_udf(self, arguments: List[Dict], query, relation: Relation): 247 | if type(arguments) != list: 248 | raise TypeError( 249 | "arguments for UDF must be a list of entities with `name` and `type`. " 250 | "Please refer to the documentation for an example" 251 | ) 252 | 253 | query = self.render_udf_query( 254 | arguments=arguments, query=query, relation=relation 255 | ) 256 | logging.info("Creating the UDF using: {}".format(query)) 257 | 258 | job_config = bigquery.QueryJobConfig(dry_run=self._dry_run) 259 | query_job = self._client.query(query=query, job_config=job_config) 260 | 261 | execute_job_with_error_logging(job=query_job) 262 | 263 | logging.info("UDF: {} has been created".format(relation)) 264 | 265 | def recreate_stored_procedure(self, arguments: List[Dict], query, relation): 266 | if type(arguments) != list: 267 | raise TypeError( 268 | "arguments for Stored Procedure must be a list of entities with `name` and `type`. " 269 | "Please refer to the documentation for an example" 270 | ) 271 | 272 | query = self.render_stored_procedure_query( 273 | arguments=arguments, query=query, relation=relation 274 | ) 275 | logging.info(f"Creating the Stored Procedure using query: {query}") 276 | 277 | job_config = bigquery.QueryJobConfig() 278 | query_job = self._client.query(query=query, job_config=job_config) 279 | execute_job_with_error_logging(job=query_job) 280 | 281 | logging.info("Stored Procedure: {} has been created".format(relation)) 282 | 283 | @staticmethod 284 | def render_udf_query(arguments: List[Dict], query, relation): 285 | arguments_models = [ 286 | UDFArgument(name=argument["name"], type=argument["type"]) 287 | for argument in arguments 288 | ] 289 | parsed_arguments = ",".join( 290 | [f"{argument.name} {argument.type}" for argument in arguments_models] 291 | ) 292 | rendered_query = ( 293 | f""" 294 | CREATE OR REPLACE FUNCTION {relation}({parsed_arguments}) AS 295 | ( 296 | """ 297 | + query 298 | + """ 299 | ) 300 | """ 301 | ) 302 | 303 | return rendered_query 304 | 305 | @staticmethod 306 | def render_stored_procedure_query(arguments: List[Dict], query, relation): 307 | arguments_models = [ 308 | StoredProcedureArgument(name=argument["name"], type=argument["type"]) 309 | for argument in arguments 310 | ] 311 | parsed_arguments = ",".join( 312 | [f"{argument.name} {argument.type}" for argument in arguments_models] 313 | ) 314 | rendered_query = f""" 315 | CREATE OR REPLACE PROCEDURE {relation}({parsed_arguments}) 316 | BEGIN 317 | {query}; 318 | END; 319 | """ 320 | 321 | return rendered_query 322 | 323 | def assertion(self, query): 324 | def compile_assertion_results(rows): 325 | assertion_results = [] 326 | has_failure = False 327 | reserved_keys = ["success", "description"] 328 | for row in rows: 329 | assertion_result = { 330 | 
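                    # One result entry per returned row: "success" and "description"
                    # are reserved columns, every other selected column is reported
                    # under "other_asserted_values".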
"success": None, 331 | "description": None, 332 | "other_asserted_values": {}, 333 | } 334 | if not row["success"]: 335 | has_failure = True 336 | 337 | assertion_result["success"] = row["success"] 338 | assertion_result["description"] = row["description"] 339 | for key, value in row.items(): 340 | if key not in reserved_keys: 341 | assertion_result["other_asserted_values"][key] = value 342 | 343 | assertion_results.append(assertion_result) 344 | 345 | return {"has_failure": has_failure, "assertion_results": assertion_results} 346 | 347 | job_config = bigquery.QueryJobConfig() 348 | query_job = self._client.query(query=query, job_config=job_config) 349 | 350 | logging.info("Running assertion using query: {}".format(query)) 351 | 352 | try: 353 | results = compile_assertion_results(rows=query_job.result()) 354 | 355 | logging.info( 356 | "\n\n#### Assertion Report ####\n\n" 357 | + yaml_parser.dict_to_yaml(results["assertion_results"]) 358 | + "\n\n#### Assertion Report ####\n\n" 359 | ) 360 | 361 | if results["has_failure"]: 362 | raise AssertionError( 363 | 'Assertion failed, check the "ASSERTION RESULTS" section for more details' 364 | ) 365 | 366 | except GoogleCloudError as e: 367 | logging.error(e) 368 | logging.error(query_job.error_result) 369 | logging.error(query_job.errors) 370 | raise e 371 | 372 | def call_stored_procedure(self, query): 373 | job_config = bigquery.QueryJobConfig() 374 | 375 | sp = f""" 376 | BEGIN 377 | {query} 378 | END; 379 | """ 380 | logging.info(f"Calling Stored Procedure(s) :{query}") 381 | 382 | query_job = self._client.query(query=sp, job_config=job_config) 383 | execute_job_with_error_logging(job=query_job) 384 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/adapter/model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional, Dict, Any 3 | 4 | from dop.component.transformation.common.adapter.model import Argument 5 | 6 | 7 | @dataclass 8 | class PartitionConfig: 9 | field: str 10 | data_type: str = "date" 11 | range: Optional[Dict[str, Any]] = None 12 | 13 | def render(self, alias: Optional[str] = None): 14 | column: str = self.field 15 | if alias: 16 | column = f"{alias}.{self.field}" 17 | 18 | if self.data_type in ("timestamp", "datetime"): 19 | return f"date({column})" 20 | else: 21 | return column 22 | 23 | @staticmethod 24 | def create(options): 25 | if not options.get("partition_key") or not options.get("partition_data_type"): 26 | return None 27 | elif options.get("partition_data_type") == "int64": 28 | if ( 29 | not options.get("partition_key") 30 | or not options.get("partition_range") 31 | or not options.get("partition_range").get("start") 32 | or not options.get("partition_range").get("end") 33 | or not options.get("partition_range").get("interval") 34 | ): 35 | raise RuntimeError(f"Invalid partition options: {options}") 36 | elif options.get("partition_data_type") in ["date", "timestamp", "datetime"]: 37 | if not options.get("partition_key"): 38 | raise RuntimeError(f"Invalid partition options: {options}") 39 | else: 40 | raise NotImplementedError( 41 | f'Partition data type: {options.get("partition_data_type")} is not supported' 42 | ) 43 | 44 | return PartitionConfig( 45 | field=options.get("partition_key"), 46 | data_type=options.get("partition_data_type"), 47 | range=options.get("partition_range"), 48 | ) 49 | 50 | 51 | @dataclass 52 | class 
TableOptionsConfig: 53 | options: Optional[Dict[str, Any]] = None 54 | 55 | 56 | @dataclass(frozen=True) 57 | class UDFArgument(Argument): 58 | pass 59 | 60 | 61 | @dataclass(frozen=True) 62 | class StoredProcedureArgument(Argument): 63 | pass 64 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/adapter/relation.py: -------------------------------------------------------------------------------- 1 | from dop.component.transformation.common.adapter.relation import ( 2 | BaseRelation, 3 | RelationValueError, 4 | ) 5 | 6 | from google.cloud import bigquery 7 | from dataclasses import dataclass 8 | 9 | from dop.component.transformation.runner.bigquery.adapter.model import PartitionConfig 10 | 11 | 12 | @dataclass(frozen=True) 13 | class BigQueryRelation(BaseRelation): 14 | database: str 15 | schema: str 16 | identifier: str 17 | 18 | def __post_init__(self): 19 | if ( 20 | not self.database 21 | or not self.schema 22 | or not self.identifier 23 | or any( 24 | [len(self.database) < 1, len(self.schema) < 1, len(self.identifier) < 1] 25 | ) 26 | ): 27 | raise RelationValueError( 28 | f"database: `{self.database}`, schema: `{self.schema}` " 29 | f"and identifier: `{self.identifier}` must not be empty" 30 | ) 31 | 32 | def __repr__(self): 33 | return f"`{self.database}.{self.schema}.{self.identifier}`" 34 | 35 | 36 | class RelationHelper: 37 | def __init__(self, client: bigquery.client.Client): 38 | self._client = client 39 | 40 | def check_relation_exists(self, relation: BaseRelation): 41 | query = f""" 42 | SELECT * FROM {relation.database}.{relation.schema}.INFORMATION_SCHEMA.TABLES 43 | WHERE table_name = '{relation.identifier}'; 44 | """ 45 | 46 | for _ in self._client.query(query=query): 47 | return True 48 | 49 | return False 50 | 51 | def has_same_partition_definition( 52 | self, partition_config: PartitionConfig, relation: BaseRelation 53 | ): 54 | existing_partition_definition = self.partition_definition(relation=relation) 55 | 56 | if not partition_config or not existing_partition_definition: 57 | return True 58 | elif ( 59 | partition_config.field == existing_partition_definition["column_name"] 60 | and partition_config.data_type == existing_partition_definition["data_type"] 61 | ): 62 | return True 63 | else: 64 | return False 65 | 66 | def partition_definition(self, relation: BaseRelation): 67 | query = f""" 68 | SELECT column_name, data_type FROM {relation.database}.{relation.schema}.INFORMATION_SCHEMA.COLUMNS 69 | WHERE table_name = '{relation.identifier}' AND is_partitioning_column='YES' LIMIT 1; 70 | """ 71 | 72 | for row in self._client.query(query=query): 73 | return {"column_name": row["column_name"], "data_type": row["data_type"]} 74 | 75 | return None 76 | 77 | def check_if_schemas_match( 78 | self, tmp_relation: BaseRelation, relation: BaseRelation 79 | ): 80 | query_template = """ 81 | SELECT CONCAT(coalesce(column_name,''),coalesce(is_nullable,''),coalesce(data_type,''), coalesce(is_partitioning_column, '')) as col_schema_idendifer 82 | FROM {r.database}.{r.schema}.INFORMATION_SCHEMA.COLUMNS 83 | WHERE table_name = '{r.identifier}' ORDER BY ordinal_position; 84 | """ 85 | tmp_schema = [ 86 | x["col_schema_idendifer"] 87 | for x in self._client.query(query=query_template.format(r=tmp_relation)) 88 | ] 89 | target_schema = [ 90 | x["col_schema_idendifer"] 91 | for x in self._client.query(query=query_template.format(r=relation)) 92 | ] 93 | 94 | if tmp_schema == target_schema: 95 | return True 
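        # Any difference in column name, nullability, data type, partitioning flag
        # or column order means the two schemas are not compatible.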
96 | return False 97 | 98 | def get_columns_of_relation(self, relation: BaseRelation): 99 | query = f""" 100 | SELECT column_name FROM {relation.database}.{relation.schema}.INFORMATION_SCHEMA.COLUMNS 101 | WHERE table_name = '{relation.identifier}' ORDER BY ordinal_position; 102 | """ 103 | 104 | return [x["column_name"] for x in self._client.query(query=query)] 105 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/template/macro/adapter.sql: -------------------------------------------------------------------------------- 1 | {% macro partition_by(partition_config) -%} 2 | {%- if partition_config is none -%} 3 | {% do return('') %} 4 | {%- elif partition_config.data_type | lower in ('date','timestamp','datetime') -%} 5 | partition by {{ partition_config.render() }} 6 | {%- elif partition_config.data_type | lower in ('int64') -%} 7 | {%- set range = partition_config.range -%} 8 | partition by range_bucket( 9 | {{ partition_config.field }}, 10 | generate_array({{ range.start}}, {{ range.end }}, {{ range.interval }}) 11 | ) 12 | {%- endif -%} 13 | {%- endmacro -%} 14 | 15 | {% macro table_options(table_options_config) %} 16 | OPTIONS({% for opt_key, opt_val in table_options_config.options.items() %} 17 | {{ opt_key }}={{ opt_val }}{{ "," if not loop.last }} 18 | {% endfor %}) 19 | {%- endmacro -%} 20 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/template/macro/materialization/table_create_or_replace.sql: -------------------------------------------------------------------------------- 1 | {% import 'adapter.sql' as adapter %} 2 | 3 | {% macro create_or_replace(query, relation_helper, options) %} 4 | {%- set relation = options['relation'] -%} 5 | {%- set tmp_relation = options['tmp_relation'] -%} 6 | {%- set relation_exists = relation_helper.check_relation_exists(relation) -%} 7 | {%- set table_options_config = options['table_options_config'] -%} 8 | {%- set partition_config = options['partition_config'] -%} 9 | {%- set full_refresh = options['full_refresh'] -%} 10 | 11 | {# -- Always try to drop the tmp table #} 12 | drop table if exists {{ tmp_relation }}; 13 | 14 | {# -- Drop table first if partition definition is different and it's a full refresh #} 15 | {%- if full_refresh and relation_helper.has_same_partition_definition(partition_config, relation) -%} 16 | drop table if exists {{ relation }}; 17 | {%- endif -%} 18 | 19 | {# -- Create / Replace table #} 20 | create or replace table 21 | {%- if relation_exists and not full_refresh -%} 22 | {{ tmp_relation }} 23 | {%- else -%} 24 | {{ relation }} 25 | {%- endif -%} 26 | 27 | {# -- Partition Block #} 28 | {%- if partition_config is not none -%} 29 | {{ space }} {{ adapter.partition_by(partition_config) }} 30 | {%- endif -%} 31 | 32 | {# -- Table Options Block #} 33 | {%- if table_options_config is not none -%} 34 | {{ space }} {{ adapter.table_options(table_options_config) }} 35 | {%- endif -%} 36 | 37 | {# -- Main query block #} 38 | AS ( 39 | {{ query }} 40 | ); 41 | {% endmacro %} 42 | -------------------------------------------------------------------------------- /dags/dop/component/transformation/runner/bigquery/template/macro/materialization/table_upsert.sql: -------------------------------------------------------------------------------- 1 | {% import 'adapter.sql' as adapter %} 2 | 3 | {% macro upsert(query, relation_helper, options) %} 4 | {%- set relation = 
options['relation'] -%} 5 | {%- set tmp_relation = options['tmp_relation'] -%} 6 | {%- set relation_exists = relation_helper.check_relation_exists(relation) -%} 7 | {%- set tmp_relation_exists = relation_helper.check_relation_exists(tmp_relation) -%} 8 | {%- set columns_of_relation = relation_helper.get_columns_of_relation(relation) -%} 9 | {%- set partition_config = options['partition_config'] -%} 10 | {%- set full_refresh = options['full_refresh'] -%} 11 | {%- set incremental_time_partitioned = partition_config.data_type | lower in ('date','timestamp','datetime') -%} 12 | 13 | {# -- Only run the merge query if tmp relation is produced and we already have an existing relation. This is not applicable for full refresh #} 14 | {%- if tmp_relation_exists and relation_exists and not full_refresh -%} 15 | {%- if not tmp_relation_exists -%} 16 | {{ raise('`{{ tmp_relation }}` must exist before an upsert can be done') }} 17 | {%- endif -%} 18 | 19 | {%- if not relation_helper.check_if_schemas_match(tmp_relation, relation) -%} 20 | {{ raise('Schema is backwards incompatible, when making schema changes, a full refresh is required') }} 21 | {%- endif -%} 22 | 23 | {# -- For time based partition only, workout the partitions to be replaced #} 24 | {%- if incremental_time_partitioned -%} 25 | declare dbt_partitions_for_replacement array; 26 | set (dbt_partitions_for_replacement) = ( 27 | select as struct 28 | array_agg(distinct {{ partition_config.render() }}) 29 | from {{ tmp_relation }} 30 | ); 31 | {%- endif -%} 32 | 33 | merge into {{ relation }} as target_relation 34 | using ( 35 | select * from {{ tmp_relation }} 36 | ) as tmp_relation 37 | on FALSE 38 | 39 | when not matched by source 40 | {%- if incremental_time_partitioned -%} 41 | {{ space }} and {{ partition_config.render(alias='target_relation') }} in unnest(dbt_partitions_for_replacement) 42 | {%- endif -%} 43 | {{ space }} then delete 44 | 45 | when not matched then insert 46 | ({% for col in columns_of_relation %}`{{ col }}`{{ "," if not loop.last }}{% endfor %}) 47 | VALUES ({% for col in columns_of_relation %}`{{ col }}`{{ "," if not loop.last }}{% endfor %}); 48 | 49 | {%- endif -%} 50 | {# -- Always try to drop the tmp table #} 51 | drop table if exists {{ tmp_relation }}; 52 | {% endmacro %} 53 | -------------------------------------------------------------------------------- /dags/dop/component/util/auth.py: -------------------------------------------------------------------------------- 1 | from google.auth import impersonated_credentials, default 2 | 3 | 4 | class ServiceAccountImpersonationCredentialManager: 5 | target_scopes = ["https://www.googleapis.com/auth/cloud-platform"] 6 | 7 | def __init__(self, source_sa_name, project_id): 8 | self._source_sa_name = source_sa_name 9 | self._project_id = project_id 10 | 11 | def get_target_credentials(self): 12 | impersonated_sa = ( 13 | f"{self._source_sa_name}@{self._project_id}.iam.gserviceaccount.com" 14 | ) 15 | source_credentials, _ = default() 16 | 17 | target_credentials = impersonated_credentials.Credentials( 18 | source_credentials=source_credentials, 19 | target_principal=impersonated_sa, 20 | target_scopes=self.target_scopes, 21 | delegates=[], 22 | lifetime=500, 23 | ) 24 | 25 | return target_credentials 26 | -------------------------------------------------------------------------------- /dags/dop/component/util/secret_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from google.cloud import 
secretmanager 4 | 5 | 6 | def secret_client(credentials): 7 | return secretmanager.SecretManagerServiceClient(credentials=credentials) 8 | 9 | 10 | def access_secret(project_id, secret_id, credentials, version="latest"): 11 | # Build the resource name of the secret version. 12 | client = secret_client(credentials=credentials) 13 | name = client.secret_version_path(project_id, secret_id, version) 14 | # Access the secret version. 15 | response = client.access_secret_version(name) 16 | 17 | payload = response.payload.data.decode("UTF-8") 18 | logging.info("Accessing secret version of {}".format(response.name)) 19 | 20 | return payload 21 | -------------------------------------------------------------------------------- /dags/dop/definitions.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 4 | SUPPORTED_TRANSFORMATION_RUNNERS = ["bigquery"] 5 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | postgres: 4 | image: postgres:13.2-alpine 5 | container_name: dop_postgres 6 | restart: always 7 | environment: 8 | - POSTGRES_USER=airflow 9 | - POSTGRES_PASSWORD=airflow 10 | - POSTGRES_DB=airflow 11 | logging: 12 | options: 13 | max-size: 10m 14 | max-file: "3" 15 | 16 | webserver: 17 | build: infrastructure/docker/composer_${AIRFLOW_VERSION} 18 | container_name: dop_webserver 19 | restart: always 20 | depends_on: 21 | - postgres 22 | environment: 23 | - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow 24 | - AIRFLOW__CORE__EXECUTOR=LocalExecutor 25 | - AIRFLOW__CORE__LOGGING_LEVEL=INFO 26 | - GOOGLE_APPLICATION_CREDENTIALS=/secret/gcp-credentials/application_default_credentials.json 27 | - DOP_SANDBOX_ENVIRONMENT=true # set to true if running locally on a laptop, this enables certain features such as service account impersonation 28 | - DOP_DEVELOPER_MODE=true # enable developer mode so that WIP code can be used in docker 29 | 30 | # The following environment environment variables need to be set on both the docker local environment as well as the composer environment 31 | - DOP_SERVICE_PROJECT_PATH=/opt/airflow/dags/dop_service_project # The absolute directory of the service project path. Each DBT project in this path should be within their folder and must be valid. I.e. on Docker, this could be /opt/airflow/dags/dop/dbt-projects. 
On Composer this could be anywhere under the `/home/airflow/gcs/dags` or `/home/airflow/gcs/data` directory 32 | - DOP_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP project_id - to be used as the project where data will be consumed & persisted 33 | - DOP_LOCATION=${LOCATION?location_is_undefined} # GCP region - to be used to persist all data 34 | - DOP_INFRA_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP infrastructure project id, for local development this isn't used so leaving it as the same as gcp service project id 35 | logging: 36 | options: 37 | max-size: 10m 38 | max-file: "3" 39 | volumes: 40 | - ./examples/service_project:/opt/airflow/dags/dop_service_project 41 | - ./dags:/opt/airflow/dags 42 | - ./infrastructure/docker/developer_only/.airflowignore:/opt/airflow/dags/.airflowignore 43 | - ~/.config/gcloud/application_default_credentials.json:/secret/gcp-credentials/application_default_credentials.json # mount application default credentials only so no keys as used 44 | ports: 45 | - "8082:8080" 46 | command: webserver 47 | healthcheck: 48 | test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"] 49 | interval: 30s 50 | timeout: 30s 51 | retries: 3 52 | -------------------------------------------------------------------------------- /docs/a_typical_dop_orchestration_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/a_typical_dop_orchestration_flow.png -------------------------------------------------------------------------------- /docs/dop_docker_account_impersonation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/dop_docker_account_impersonation.png -------------------------------------------------------------------------------- /docs/dop_service_project_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/dop_service_project_architecture.png -------------------------------------------------------------------------------- /docs/example_dag_with_dbt_running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/example_dag_with_dbt_running.png -------------------------------------------------------------------------------- /docs/grant_service_account_user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/grant_service_account_user.png -------------------------------------------------------------------------------- /docs/local_airflow_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/local_airflow_ui.png -------------------------------------------------------------------------------- /docs/set-variables-ide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/set-variables-ide.png 
-------------------------------------------------------------------------------- /docs/trigger_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/trigger_dag.png -------------------------------------------------------------------------------- /docs/trigger_full_refresh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/docs/trigger_full_refresh.png -------------------------------------------------------------------------------- /examples/service_project/.gcloudignore: -------------------------------------------------------------------------------- 1 | # Ignore the following when packaging & deploying using rsync 2 | docker-compose-dop.yml 3 | Makefile 4 | README.md 5 | .git 6 | .gitignore 7 | .gcloudignore 8 | **/__pycache__ 9 | embedded_dop/source/.git 10 | embedded_dop/source/docs 11 | embedded_dop/source/examples 12 | embedded_dop/source/tests 13 | embedded_dop/source/.gitignore 14 | embedded_dop/source/README.md 15 | embedded_dop/source/LICENSE.md 16 | dbt_start/logs 17 | dbt_start/target 18 | -------------------------------------------------------------------------------- /examples/service_project/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore the DOP source code, this is pulled in dynamically from the DOP source repository 2 | embedded_dop/source 3 | -------------------------------------------------------------------------------- /examples/service_project/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build down up up-follow logs exec restart 2 | 3 | include .env 4 | export 5 | 6 | DOP_GIT_SSH_REPO_PATH := #{REPLACE WITH THE SSH GIT REPO PATH OF DOP} 7 | DOP_PROJECT_ID := #{REPLACE WITH A GCP PROJECT ID WHERE DOP WILL EXECUTE ALL JOBS} 8 | DOP_LOCATION := #{REPLACE WITH A GCP REGION WHERE DATA WILL BE PERSISTED BY DOP} 9 | DOP_ARTIFACTS_BUCKET := #{REPLACE WITH ARTIFACT BUCKET NAME} 10 | DOP_INFRA_PROJECT_ID := #{REPLACE WITH THE GCP INFRASTRUCTURE PROJECT ID WHERE BUILD ARTIFACTS ARE STORED} 11 | 12 | REPO_BASE_NAME := $(shell basename `git rev-parse --show-toplevel`) 13 | BRANCH := $(shell git rev-parse --abbrev-ref HEAD) 14 | HASH := $(shell git rev-parse HEAD | head -c7) 15 | SERVICE_PROJECT_ABS_PATH := $(shell pwd) 16 | DATETIME := $(shell date '+%Y%m%d-%H%M%S') 17 | 18 | LOCAL_DOP_EMBEDDED_SOURCE_PATH := ./embedded_dop/source 19 | DOP_TAG_NAME := master # Designed to be overwritten if a TAG or a different branch needs to be used for deployment 20 | AIRFLOW_VERSION := 1.10.10 21 | 22 | 23 | ENVS := PROJECT_ID=$(DOP_PROJECT_ID) \ 24 | LOCATION=$(DOP_LOCATION) \ 25 | SERVICE_PROJECT_ABS_PATH=$(SERVICE_PROJECT_ABS_PATH) 26 | 27 | validate: 28 | if [ -z ${DOP_GIT_SSH_REPO_PATH} ]; then \ 29 | echo "DOP_GIT_SSH_REPO_PATH must be defined. Aborting";\ 30 | exit 1; \ 31 | elif [ -z ${DOP_PROJECT_ID} ]; then \ 32 | echo "DOP_PROJECT_ID must be defined. Aborting";\ 33 | exit 1; \ 34 | elif [ -z ${DOP_LOCATION} ]; then \ 35 | echo "DOP_LOCATION must be defined. Aborting";\ 36 | exit 1; \ 37 | elif [ -z ${DOP_ARTIFACTS_BUCKET} ]; then \ 38 | echo "DOP_ARTIFACTS_BUCKET must be defined. Aborting";\ 39 | exit 1; \ 40 | elif [ -z ${DOP_INFRA_PROJECT_ID} ]; then \ 41 | echo "DOP_INFRA_PROJECT_ID must be defined. 
Aborting";\ 42 | exit 1; \ 43 | fi 44 | 45 | validate-deploy: 46 | if [ -z ${DEPLOY_BUCKET_NAME} ]; then \ 47 | echo "DEPLOY_BUCKET_NAME must be defined. Aborting";\ 48 | exit 1; \ 49 | elif [ -z ${DOP_ARTIFACT_ID} ]; then \ 50 | echo "DOP_ARTIFACT_ID must be defined. Aborting";\ 51 | exit 1; \ 52 | fi 53 | 54 | clean: 55 | docker rm -f $(docker ps -a | grep dop_ | awk '{print $1}') 56 | 57 | git-checkout-dop: validate 58 | git clone $(DOP_GIT_SSH_REPO_PATH) $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) 2> /dev/null || git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) clean -fdx && git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) remote update && (git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) reset --hard origin/$(DOP_TAG_NAME) || git -C $(LOCAL_DOP_EMBEDDED_SOURCE_PATH) checkout $(DOP_TAG_NAME)) 59 | 60 | build: git-checkout-dop 61 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml up -d --build webserver 62 | 63 | down: validate 64 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml down 65 | 66 | up: git-checkout-dop 67 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml up -d 68 | 69 | up-follow: validate 70 | $(ENVS) docker-compose -f $(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/docker/docker-compose-dop.yml up 71 | 72 | restart: 73 | make down && make up 74 | 75 | logs: 76 | docker logs dop_webserver -f 77 | 78 | exec: 79 | docker exec -it dop_webserver /bin/bash -c "source ./script/exec_entrypoint.sh; /bin/bash" 80 | 81 | executor-example-dbt-run: validate 82 | docker run --workdir "/home/dbtuser/dbt_start" --env DOP_PROJECT_ID=$(DOP_PROJECT_ID) --env DOP_LOCATION=$(DOP_LOCATION) dop-dbt:latest /bin/bash -c "pipenv run dbt run" 83 | 84 | build-dbt-image: 85 | gcloud builds submit \ 86 | --substitutions SHORT_SHA=$(HASH),_DATETIME=$(DATETIME) \ 87 | --config=$(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/cloudbuild/build-dbt.yaml \ 88 | --project=$(DOP_INFRA_PROJECT_ID) \ 89 | . 90 | 91 | build-artifact: git-checkout-dop build-dbt-image 92 | gcloud builds submit \ 93 | --substitutions SHORT_SHA=$(HASH),BRANCH_NAME=$(BRANCH),REPO_NAME=$(REPO_BASE_NAME),_CLOUDBUILD_ARTIFACTS_BUCKET_NAME=$(DOP_ARTIFACTS_BUCKET),_DATETIME=$(DATETIME) \ 94 | --config=$(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/cloudbuild/build.yaml \ 95 | --project=$(DOP_INFRA_PROJECT_ID) \ 96 | . 97 | 98 | deploy: validate validate-deploy 99 | gcloud builds submit \ 100 | --substitutions REPO_NAME=$(REPO_BASE_NAME),_DOP_ARTIFACT_ID=$(DOP_ARTIFACT_ID),_CLOUDBUILD_ARTIFACTS_BUCKET_NAME=$(DOP_ARTIFACTS_BUCKET),_DEPLOY_BUCKET_NAME=$(DEPLOY_BUCKET_NAME) \ 101 | --config=$(LOCAL_DOP_EMBEDDED_SOURCE_PATH)/infrastructure/cloudbuild/deploy.yaml \ 102 | --project=$(DOP_INFRA_PROJECT_ID) \ 103 | . 
104 | -------------------------------------------------------------------------------- /examples/service_project/README.md: -------------------------------------------------------------------------------- 1 | Table of contents 2 | ================= 3 | * [DOP Service Project Architecture](#dop-service-project-architecture) 4 | * [Boilerplate structure explained](#boilerplate-structure-explained) 5 | * [DBT Projects](#dbt-projects) 6 | * [The embedded_dop directory](#the-embedded_dop-directory) 7 | * [executor_config](#executor_config) 8 | * [orchestration](#orchestration) 9 | * [source](#source) 10 | * [The Makefile](#the-makefile) 11 | * [Use DOP on Docker](#use-dop-on-docker) 12 | * [Deploy to Cloud Composer](#deploy-to-cloud-composer) 13 | * [Build Artifact](#build-artifact) 14 | * [Deploy](#deploy) 15 | * [Important steps to follow for deploying to an existing Composer Cluster](#important-steps-to-follow-for-deploying-to-an-existing-composer-cluster) 16 | * [DOP Orchestration - How To Use](#dop-orchestration---how-to-use) 17 | * [Create a new DOP Orchestration DAG](#create-a-new-dop-orchestration-dag) 18 | * [Task definitions](#task-definitions) 19 | * [Native Transformation - Materialization](#native-transformation---materialization) 20 | * [Native Transformation - Assertion](#native-transformation---assertion) 21 | * [Native Transformation - Invocation](#native-transformation---invocation) 22 | * [DBT Task](#dbt-task) 23 | * [Full Refresh](#full-refresh) 24 | 25 | The `service_project` directory can be used as a boilerplate to setup DOP on an existing GIT repository (the service project repository). 26 | You can copy and paste everything in this directory (including `.gcloudignore` and `.gitignore` as these are required for DOP to function correctly). 27 | If you already have a `Makefile` or any other conflicting files, you may need to move things around by merging those files or moving i.e. the `Makefile` into `embedded_dop` 28 | 29 | **Please note that this boilerplate is optimised for running DBT jobs inside DOP alongside native capabilities. If you don't use DBT, some automation may not work as expected.** 30 | 31 | ## DOP Service Project Architecture 32 | This explains how DOP functions and how it can be integrated into your existing git repositories 33 | ![service_project_architecture](../../docs/dop_service_project_architecture.png) 34 | 35 | ## Boilerplate structure explained 36 | 37 | ### DBT Projects 38 | Currently the setup is optimised to orchestrate and run DBT jobs on Google Cloud and this directory can be used as a template in your service project to quickly setup DOP with multiple DBT projects. 39 | 40 | For this to work, the service project repository must contain one or multiple DBT projects, each of them in their own folder. 41 | For example, 42 | ``` 43 | dbt_project_1/dbt_project.yaml 44 | dbt_project_1/... 45 | 46 | dbt_project_2/dbt_project.yaml 47 | dbt_project_2/... 48 | ``` 49 | 50 | ### The embedded_dop directory 51 | There are three main folders within this directory. 52 | 53 | #### executor_config 54 | This folder contains files required to build docker containers to be used on Cloud Composer and invoked via the Airflow K8 Pod Operator. 
55 | 56 | For example, 57 | - the `dbt/config.yaml` file contains instructions to tell the build process where to locate the DBT projects inside this repository 58 | - `Pipfile` and `Pipfile.lock` are used to maintain and lock Python dependencies so what's installed inside the docker container is always the same after each build 59 | 60 | Currently this is not used on the sandbox environment when running locally with Docker Compose, DBT is still installed on the fly using dynamically created Python Virtual environment but this may change in the future. 61 | 62 | #### orchestration 63 | This folder contains some example orchestration jobs and it shows how DOP can be used to orchestrate the flow between DBT jobs, native transformations and any executors added in the future. 64 | It's probably a good idea to look through these examples which will give you a good idea on how to orchestrate workload in DOP. 65 | 66 | #### source 67 | This is a directory reserved to store the DOP code, you won't see it in this repository because it's ignored by version control but when the build process runs, the DOP source code will be checked out to here. 68 | 69 | ### The Makefile 70 | The `Makefile` contains instructions to automate the whole initialization process for DOP including checking out the DOP repository as well as defining required variables. 71 | For the Makefile to work, placeholders (as defined in `#{}`) must be replaced with real values in the Makefile or overwritten via make command arguments. 72 | 73 | ## Use DOP on Docker 74 | You can now use DOP on your laptop (Linux / Mac only for now, Windows instructions is in the works) by following the instructions as below. 75 | 76 | Running it for the first time (this builds the docker image from scratch and may take a while, you can check where it got to with `make logs`) 77 | ``` 78 | make build 79 | ``` 80 | 81 | Once it's up and running, you can access the UI on 82 | ``` 83 | http://localhost:8082 84 | ``` 85 | 86 | Bring down the docker environment 87 | ``` 88 | make down 89 | ``` 90 | 91 | Subsequent runs to bring up the docker environment 92 | ``` 93 | make up 94 | ``` 95 | 96 | And to get into the docker container itself (useful for debugging), run 97 | ``` 98 | make exec 99 | ``` 100 | 101 | ## Deploy to Cloud Composer 102 | There is a light weight semi-automated deployment process built using Cloud Build, to deploy to an existing Composer Cluster, follow instructions as below. 103 | 104 | ### Build Artifact 105 | ``` 106 | make build-artifact 107 | ``` 108 | This will produce an artifact id pointed to the most recent build. 109 | 110 | By default this will build the artifact using the DOP `master` branch, if a different branch or tag is required, this can be overwritten by using `DOP_TAG_NAME`, i.e. `DOP_TAG_NAME=v0.1.0`. 111 | 112 | ``` 113 | make build-artifact DOP_TAG_NAME={} 114 | ``` 115 | 116 | It is **very important** to consider using a tag made on the DOP source repository for a Production deployment so that the DOP version won't accidentally change when making a service project deployment. 117 | 118 | 119 | ### Deploy 120 | ``` 121 | make deploy DEPLOY_BUCKET_NAME={} DOP_ARTIFACT_ID={} 122 | ``` 123 | `DEPLOY_BUCKET_NAME`: This is the bucket name for Cloud Composer i.e. 
`us-central1-dop-sandbox-us-xxxxxxxx-bucket` 124 | `DOP_ARTIFACT_ID`: Use the most recent artifact id produced by the `Build Artifact` step or any historical artifact ids to rollback 125 | 126 | #### Important steps to follow for deploying to an existing Composer Cluster 127 | If you are deploying DOP to an existing composer cluster which often already has other DAGs running, 128 | it is important to set some exclusions in your existing deployment process. 129 | 130 | A typical deployment to Cloud Composer involves doing a rsync to a GCS bucket, in order to make sure the DOP service project path is not removed in this process add the following exclusions in the rsync. 131 | ``` 132 | export SERVICE_PROJECT_NAME=dop_ && gsutil -m rsync -r -d -x "^$SERVICE_PROJECT_NAME" gs:///dags gs://$BUCKET_NAME/dags 133 | ``` 134 | 135 | ## DOP Orchestration - How To Use 136 | If you prefer to just give it a go without reading documentation, see the example DAG [embedded_dop/orchestration/example_covid19](embedded_dop/orchestration/example_covid19). 137 | Try playing around by changing the `config.yaml` file and see changes reflected in the Airflow GUI on your local environment 138 | 139 | ### Create a new DOP Orchestration DAG 140 | 1. Create a new folder under [embedded_dop/orchestration](embedded_dop/orchestration), the folder name will be the name of the DAG, with a `dop__` prefix. i.e. `dop__`. 141 | 1. Create a configuration file inside the new folder and call it `config.yaml`. The Airflow DAG is automatically generated by parsing the `config.yaml` file. The config file has the following structure 142 | ``` 143 | enabled: 144 | schedule_interval: 145 | timezone: 146 | schema: 147 | tasks: 148 | ``` 149 | ### Task definitions 150 | The following kinds of tasks are currently supported 151 | 152 | #### Native Transformation - Materialization 153 | This task kind is designed to persist structure or data. Supported options are 154 | ``` 155 | - identifier: 156 | kind: 157 | action: materialization 158 | target: 159 | dependencies: 160 | 161 | ``` 162 | Targets 163 | - schema (for BQ, this is creating a dataset): create a schema 164 | - udf (with dynamic arguments): create a UDF from a SQL script 165 | - table: create a table by materializing the output from a SQL script 166 | - view: create a view from a SQL script 167 | - stored_procedure (with dynamic arguments): create a stored procedure from a SQL script 168 | 169 | Features 170 | - Delta management using a date/timestamp partitioned column 171 | - Automatic schema inference by query results with schema backwards compatibility checks and stops the execution when schema is backwards incompatible 172 | - A full refresh can be triggered to do a full rebuild from sources 173 | 174 | For the Materialization task, `identifer` must match to a SQL file located in the `/sql` folder. 175 | To see a live example on how to configure each task, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 176 | 177 | #### Native Transformation - Assertion 178 | This task kind can be used to check data quality. 
177 | #### Native Transformation - Assertion 178 | This task kind can be used to check data quality. Supported options are 179 | ``` 180 | - identifier: 181 | kind: 182 | action: assertion 183 | target: assertion 184 | dependencies: 185 | 186 | ``` 187 | For the Assertion task, `identifier` must match a SQL file located in the `/sql` folder, and the `SELECT` statement of the assertion SQL must return at least two columns, as shown below, 188 | ``` 189 | SELECT 190 | AS success, 191 | AS description 192 | ``` 193 | The Airflow task will fail if `success` evaluates to `false`. 194 | 195 | To see a live example of how to configure each task, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 196 | 197 | #### Native Transformation - Invocation 198 | This task kind can be used to trigger something that has already been created, e.g. a stored procedure. Supported options are 199 | ``` 200 | - identifier: 201 | kind: 202 | action: invocation 203 | target: stored_procedure 204 | dependencies: 205 | 206 | ``` 207 | To see a live example of how to configure each task, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 208 | 209 | #### DBT Task 210 | This task kind can be used to trigger a DBT job. The DBT job runs in a Python virtual environment when executed locally, but runs in containers on Cloud Composer. 211 | You don't have to worry about this as a user because the user experience between the two environments is almost identical. 212 | ``` 213 | - identifier: 214 | kind: 215 | action: dbt 216 | target: 217 | options: 218 | project: 219 | version: 220 | arguments: 221 | - option: 222 | value: 223 | - ... 224 | dependencies: 225 | 226 | ``` 227 | Under `options` you may optionally specify arguments for a DBT job; this can be very useful for breaking a very large DBT job into smaller chunks, making it easier to maintain. 228 | 229 | Some of the ideas are 230 | - Make use of [tags](https://docs.getdbt.com/reference/resource-configs/tags). For example, use tags to split DBT tasks into logical groups and run them in separate steps. 231 | - Create a dedicated DAG for `full refresh` with tags so each DBT logical group can be refreshed separately, saving cost and making refreshes faster 232 | 233 | Keep in mind not to over-engineer the solution: only split the DBT job if it makes sense to do so and solves a real issue (e.g. rebuilding the whole thing costs too much / takes too long, or without tags the job is unmaintainable and it's very hard to identify areas of failure) 234 | 235 | To see a live example of how to configure this, go to [embedded_dop/orchestration/example_covid19/config.yaml](embedded_dop/orchestration/example_covid19/config.yaml). 236 | 237 | ### Full Refresh 238 | To run a full refresh (overwriting existing schema & data), you can pass in a JSON payload using the trigger DAG function in the Airflow GUI. 239 | 240 | Please note that whether tasks are native transformations or DBT jobs (or any other task with full refresh support), using the `{"full_refresh" : true}` flag will force a full refresh on all applicable tasks.
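If you prefer the command line to the GUI, the same payload can also be passed with the Airflow 1.10.x CLI; the DAG id below assumes the `example_covid19` folder, whose generated DAG would be named `dop__example_covid19`.
```
# equivalent to "Trigger DAG" in the GUI with a JSON configuration
airflow trigger_dag -c '{"full_refresh": true}' dop__example_covid19
```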
241 | 242 | ![Trigger DAG](../../docs/trigger_dag.png) 243 | 244 | ![Set DAG configuration options](../../docs/trigger_full_refresh.png) 245 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | venv 6 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/analysis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/analysis/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/data/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: 'dop_test' 5 | version: '1.0.0' 6 | config-version: 2 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_start' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `source-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | source-paths: ["models"] 15 | analysis-paths: ["analysis"] 16 | test-paths: ["tests"] 17 | data-paths: ["data"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | target-path: "target" # directory which will store compiled SQL files 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_modules" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ directory 31 | # as tables. These settings can be overridden in the individual model files 32 | # using the `{{ config(...) }}` macro. 
33 | models: 34 | dop_test: 35 | +materialized: table 36 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/macros/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/aggregation_a/covid19_cases_by_country.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/aggregation_b/covid19_cases_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, subregion1_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name, subregion1_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: bigquery-public-data 6 | schema: covid19_open_data 7 | loader: BigQuery # informational only (free text) 8 | loaded_at_field: CAST(date as timestamp) # configure for all sources 9 | 10 | # meta fields are rendered in auto-generated documentation 11 | meta: 12 | contains_pii: false 13 | owner: "@google" 14 | 15 | # Add tags to this source 16 | tags: 17 | - covid19 18 | 19 | quoting: 20 | database: false 21 | schema: false 22 | identifier: false 23 | 24 | tables: 25 | - name: covid19_open_data 26 | freshness: # make this a little more strict 27 | warn_after: {count: 1, period: day} 28 | error_after: {count: 2, period: day} 29 | 30 | models: 31 | - name: stg_covid19_cases 32 | description: covid19 global open data 33 | - name: covid19_cases_by_country 34 | description: Global cases by country 35 | columns: 36 | - name: country_name 37 | description: Country name 38 | tests: 39 | - not_null 40 | - name: covid19_cases_by_country_and_region 41 | description: Global cases by regions in countries 42 | columns: 43 | - name: 
country_name 44 | description: Country name 45 | tests: 46 | - not_null 47 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/models/staging/stg_covid19_cases.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | select * REPLACE(CAST(new_recovered AS INT64) as new_recovered, CAST(new_tested AS INT64) as new_tested) 12 | from `{{ source('jaffle_shop', 'covid19_open_data') }}` 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" -- Same as DML merge for a range but the first run is always historical load and second run is incremental / don't need historical DAG anymore 15 | {% endif %} 16 | 17 | ) 18 | select * from covid_cases 19 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/snapshots/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start/tests/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | venv 6 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/analysis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/analysis/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/data/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: 'dop_test_two' 5 | version: '1.0.0' 6 | config-version: 2 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_start_two' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `source-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | source-paths: ["models"] 15 | analysis-paths: ["analysis"] 16 | test-paths: ["tests"] 17 | data-paths: ["data"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | target-path: "target" # directory which will store compiled SQL files 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_modules" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ directory 31 | # as tables. These settings can be overridden in the individual model files 32 | # using the `{{ config(...) }}` macro. 
33 | models: 34 | dop_test_two: 35 | +materialized: table 36 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/macros/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/aggregation_a/covid19_cases_by_country.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/aggregation_b/covid19_cases_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | SELECT date, country_name, subregion1_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 12 | FROM {{ ref('stg_covid19_cases') }} 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" 15 | {% endif %} 16 | GROUP BY date, country_name, subregion1_name 17 | 18 | ) 19 | select * from covid_cases 20 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: bigquery-public-data 6 | schema: covid19_open_data 7 | loader: BigQuery # informational only (free text) 8 | loaded_at_field: CAST(date as timestamp) # configure for all sources 9 | 10 | # meta fields are rendered in auto-generated documentation 11 | meta: 12 | contains_pii: false 13 | owner: "@google" 14 | 15 | # Add tags to this source 16 | tags: 17 | - covid19 18 | 19 | quoting: 20 | database: false 21 | schema: false 22 | identifier: false 23 | 24 | tables: 25 | - name: covid19_open_data 26 | freshness: # make this a little more strict 27 | warn_after: {count: 1, period: day} 28 | error_after: {count: 2, period: day} 29 | 30 | models: 31 | - name: stg_covid19_cases 32 | description: covid19 global open data 33 | - name: covid19_cases_by_country 34 | description: Global cases by country 35 | columns: 36 | - name: country_name 37 | description: Country name 38 | tests: 39 | - not_null 40 | - name: covid19_cases_by_country_and_region 41 | description: Global cases by regions in countries 42 | 
columns: 43 | - name: country_name 44 | description: Country name 45 | tests: 46 | - not_null 47 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/models/staging/stg_covid19_cases.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'insert_overwrite', 5 | partition_by = {'field': 'date', 'data_type': 'date'} 6 | ) 7 | }} 8 | 9 | with covid_cases as ( 10 | 11 | select * REPLACE(CAST(new_recovered AS INT64) as new_recovered, CAST(new_tested AS INT64) as new_tested) 12 | from `{{ source('jaffle_shop', 'covid19_open_data') }}` 13 | {% if is_incremental() %} 14 | where date >= "{{ var('ds') }}" -- Same as DML merge for a range but the first run is always historical load and second run is incremental / don't need historical DAG anymore 15 | {% endif %} 16 | 17 | ) 18 | select * from covid_cases 19 | -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/snapshots/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/dbt_start_two/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/dbt_start_two/tests/.gitkeep -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/executor_config/dbt/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | pipenv = "*" 10 | install = "*" 11 | dbt = "0.19.1" 12 | 13 | [requires] 14 | python_version = "3.7" 15 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/executor_config/dbt/config.yaml: -------------------------------------------------------------------------------- 1 | # This config file is required to initialise all DBT projects 2 | 3 | dbt_projects: 4 | - project_path: dbt_start # must match to the directory path of the DBT project under the service project. I.e. 
dbt_start or dbt/dbt_start 5 | - project_path: dbt_start_two 6 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/dummy_upstream_dependency/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 4 * * *" 3 | timezone: "Europe/London" 4 | schema: dop_sandbox_us 5 | 6 | tasks: 7 | - identifier: i_am_the_dummy_dependency 8 | kind: 9 | action: airflow_operator 10 | target: airflow.operators.dummy_operator.DummyOperator 11 | options: 12 | arguments: -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 4 * * *" 3 | timezone: "Europe/London" 4 | schema: sample_dataset 5 | params: 6 | value_a: 7 | - 1 8 | - 2 9 | - 3 10 | tasks: 11 | - identifier: dim_date 12 | kind: 13 | action: materialization 14 | target: table 15 | - identifier: salesforce_marketing_cloud_is_ready 16 | kind: 17 | action: assertion 18 | target: assertion 19 | - identifier: staging_salesforce_marketing_cloud 20 | kind: 21 | action: materialization 22 | target: table 23 | dependencies: 24 | - salesforce_marketing_cloud_is_ready 25 | - dim_date 26 | - identifier: fact_transaction 27 | kind: 28 | action: materialization 29 | target: table 30 | dependencies: 31 | - dim_customer 32 | - staging_salesforce_marketing_cloud 33 | - identifier: dim_product 34 | kind: 35 | action: materialization 36 | target: table 37 | dependencies: 38 | - staging_salesforce_marketing_cloud 39 | - dim_customer 40 | - identifier: dim_voucher 41 | kind: 42 | action: materialization 43 | target: table 44 | dependencies: 45 | - staging_salesforce_marketing_cloud 46 | - identifier: fact_customer_activity 47 | kind: 48 | action: materialization 49 | target: table 50 | dependencies: 51 | - staging_salesforce_marketing_cloud 52 | - dim_customer 53 | - identifier: salesforce_service_cloud_is_ready 54 | kind: 55 | action: assertion 56 | target: assertion 57 | - identifier: staging_salesforce_service_cloud 58 | kind: 59 | action: materialization 60 | target: table 61 | dependencies: 62 | - salesforce_service_cloud_is_ready 63 | - identifier: dim_customer 64 | kind: 65 | action: materialization 66 | target: table 67 | dependencies: 68 | - staging_salesforce_service_cloud 69 | - dim_date 70 | - identifier: dim_customer_assertion 71 | kind: 72 | action: assertion 73 | target: assertion 74 | dependencies: 75 | - dim_customer 76 | - identifier: fact_newly_registered_customer 77 | kind: 78 | action: materialization 79 | target: table 80 | dependencies: 81 | - staging_salesforce_service_cloud 82 | - identifier: dim_customer_subscription 83 | kind: 84 | action: materialization 85 | target: table 86 | dependencies: 87 | - dim_customer 88 | - identifier: dim_customer_subscription_assertion 89 | kind: 90 | action: assertion 91 | target: assertion 92 | dependencies: 93 | - dim_customer_subscription 94 | - identifier: zend_desk_is_ready 95 | kind: 96 | action: assertion 97 | target: assertion 98 | dependencies: 99 | - dim_date 100 | - identifier: staging_zend_desk 101 | kind: 102 | action: materialization 103 | target: table 104 | dependencies: 105 | - zend_desk_is_ready 106 | - identifier: zend_desk_ticket_assignments 107 | kind: 108 | action: materialization 109 | target: 
table 110 | dependencies: 111 | - staging_zend_desk 112 | - identifier: zend_desk_ticket_comments 113 | kind: 114 | action: materialization 115 | target: table 116 | dependencies: 117 | - staging_zend_desk 118 | - identifier: zend_desk_ticket_priority_changes 119 | kind: 120 | action: materialization 121 | target: table 122 | dependencies: 123 | - staging_zend_desk 124 | - identifier: zend_desk_ticket_summary 125 | kind: 126 | action: materialization 127 | target: table 128 | dependencies: 129 | - staging_zend_desk 130 | - identifier: zend_desk_ticket_user_issues 131 | kind: 132 | action: materialization 133 | target: table 134 | dependencies: 135 | - staging_zend_desk 136 | - identifier: list_of_users_require_attention 137 | kind: 138 | action: materialization 139 | target: table 140 | dependencies: 141 | - zend_desk_ticket_user_issues 142 | - zend_desk_ticket_summary 143 | - zend_desk_ticket_priority_changes 144 | - zend_desk_ticket_comments 145 | - zend_desk_ticket_assignments 146 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_assertion.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_assertion.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription_assertion.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_customer_subscription_assertion.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_date.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_date.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_product.sql: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_product.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_voucher.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/dim_voucher.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_customer_activity.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_customer_activity.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_newly_registered_customer.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_newly_registered_customer.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_transaction.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/fact_transaction.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/list_of_users_require_attention.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/list_of_users_require_attention.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_marketing_cloud_is_ready.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_marketing_cloud_is_ready.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_service_cloud_is_ready.sql: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/salesforce_service_cloud_is_ready.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_marketing_cloud.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_marketing_cloud.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_service_cloud.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_salesforce_service_cloud.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_zend_desk.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/staging_zend_desk.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_is_ready.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_is_ready.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_assignments.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_assignments.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_comments.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_comments.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_priority_changes.sql: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_priority_changes.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_summary.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_summary.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_user_issues.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/examples/service_project/embedded_dop/orchestration/example_complex_design/sql/zend_desk_ticket_user_issues.sql -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 4 * * *" 3 | timezone: "Europe/London" 4 | schema: dop_sandbox_us 5 | params: 6 | value_a: 7 | - 1 8 | - 2 9 | - 3 10 | tasks: 11 | - identifier: assert_upstream_data_is_ready 12 | kind: 13 | action: assertion 14 | target: assertion 15 | 16 | - identifier: create_schema_dop_sandbox_us 17 | kind: 18 | action: materialization 19 | target: schema 20 | schema: dop_sandbox_us 21 | dependencies: 22 | - assert_upstream_data_is_ready 23 | 24 | - identifier: udf_empty_str_as_null 25 | kind: 26 | action: materialization 27 | target: udf 28 | options: 29 | arguments: 30 | - name: str 31 | type: STRING 32 | dependencies: 33 | - create_schema_dop_sandbox_us 34 | 35 | - identifier: udf_unpivot 36 | kind: 37 | action: materialization 38 | target: udf 39 | options: 40 | arguments: 41 | - name: x 42 | type: ANY TYPE 43 | - name: col_regex 44 | type: STRING 45 | dependencies: 46 | - create_schema_dop_sandbox_us 47 | 48 | - identifier: stg_covid19 49 | kind: 50 | action: materialization 51 | target: table 52 | partitioning: 53 | field: date 54 | data_type: date 55 | dependencies: 56 | - create_schema_dop_sandbox_us 57 | 58 | - identifier: covid19_by_country 59 | kind: 60 | action: materialization 61 | target: table 62 | partitioning: 63 | field: date 64 | data_type: date 65 | dependencies: 66 | - stg_covid19 67 | 68 | - identifier: covid19_by_country_and_region 69 | kind: 70 | action: materialization 71 | target: table 72 | partitioning: 73 | field: date 74 | data_type: date 75 | dependencies: 76 | - stg_covid19 77 | 78 | - identifier: view_covid19_by_country_and_region 79 | kind: 80 | action: materialization 81 | target: view 82 | dependencies: 83 | - covid19_by_country_and_region 84 | 85 | - identifier: data_quality_checks 86 | kind: 87 | action: assertion 88 | target: assertion 89 | dependencies: 90 | - covid19_by_country 91 | - covid19_by_country_and_region 92 | 93 | - identifier: sp_all_countries_and_regions 94 | kind: 95 | action: materialization 96 | target: stored_procedure 97 | options: 98 | arguments: 99 | - name: execution_date 100 | type: DATE 101 | dependencies: 102 | - 
stg_covid19 103 | 104 | - identifier: invoke_sp_all_countries_and_regions 105 | kind: 106 | action: invocation 107 | target: stored_procedure 108 | dependencies: 109 | - sp_all_countries_and_regions 110 | 111 | - identifier: create_schema_dbt_start 112 | kind: 113 | action: materialization 114 | target: schema 115 | schema: dbt_start 116 | dependencies: 117 | - assert_upstream_data_is_ready 118 | 119 | - identifier: dbt_start_staging 120 | kind: 121 | action: dbt 122 | target: run 123 | options: 124 | project: dbt_start 125 | version: 0.19.1 126 | arguments: 127 | - option: -m 128 | value: staging 129 | dependencies: 130 | - create_schema_dbt_start 131 | 132 | - identifier: dbt_start_aggregation_a 133 | kind: 134 | action: dbt 135 | target: run 136 | options: 137 | project: dbt_start 138 | version: 0.19.1 139 | arguments: 140 | - option: -m 141 | value: aggregation_a 142 | dependencies: 143 | - dbt_start_staging 144 | 145 | - identifier: dbt_start_aggregation_b 146 | kind: 147 | action: dbt 148 | target: run 149 | options: 150 | project: dbt_start 151 | version: 0.19.1 152 | arguments: 153 | - option: -m 154 | value: aggregation_b 155 | dependencies: 156 | - dbt_start_aggregation_a 157 | 158 | - identifier: dbt_start_test 159 | kind: 160 | action: dbt 161 | target: test 162 | options: 163 | project: dbt_start 164 | version: 0.19.1 165 | dependencies: 166 | - dbt_start_aggregation_b 167 | 168 | - identifier: dbt_start_docs 169 | kind: 170 | action: dbt 171 | target: docs generate 172 | options: 173 | project: dbt_start 174 | version: 0.19.1 175 | arguments: 176 | - option: --bucket 177 | value: datatonic-uk-dop-dev-diego 178 | - option: --bucket-path 179 | value: dbt 180 | dependencies: 181 | - dbt_start_test 182 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/assert_upstream_data_is_ready.sql: -------------------------------------------------------------------------------- 1 | {% set source_data = 'bigquery-public-data.covid19_open_data.covid19_open_data' %} 2 | 3 | with covid_cases as ( 4 | 5 | SELECT * FROM `{{ source_data }}` 6 | {% if is_incremental() %} 7 | where date >= DATE("{{ ds }}") 8 | {% endif %} 9 | 10 | ) 11 | SELECT COUNT(*) > 0 AS success, 12 | COUNT(*) AS num_of_records, 13 | 'Do we have data available in `{{ source_data }}`?' 
AS description 14 | FROM covid_cases 15 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/covid19_by_country.sql: -------------------------------------------------------------------------------- 1 | with covid_cases as ( 2 | 3 | SELECT date, country_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 4 | FROM `dop_sandbox_us.stg_covid19` 5 | {% if is_incremental() %} 6 | where date >= DATE("{{ ds }}") 7 | {% endif %} 8 | GROUP BY date, country_name 9 | 10 | ) 11 | select * from covid_cases 12 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/covid19_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | with covid_cases as ( 2 | 3 | SELECT date, country_name, subregion1_name, sum(coalesce(new_confirmed, 0)) as new_confirmed, sum(coalesce(new_deceased,0)) as new_deceased, sum(coalesce(new_recovered,0)) as new_recovered, sum(coalesce(new_tested,0)) as new_tested 4 | FROM `dop_sandbox_us.stg_covid19` 5 | {% if is_incremental() %} 6 | where date >= DATE("{{ ds }}") 7 | {% endif %} 8 | GROUP BY date, country_name, subregion1_name 9 | 10 | ) 11 | select * from covid_cases 12 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/data_quality_checks.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) > 0 AS success, 2 | COUNT(*) AS num_of_records, 3 | 'Do we have data in covid19_by_country for date >= "{{ ds }}"?' 
AS description 4 | FROM `dop_sandbox_us.covid19_by_country` 5 | {% if is_incremental() %} 6 | WHERE date >= "{{ ds }}" 7 | {% endif %} 8 | 9 | UNION ALL 10 | 11 | SELECT COUNT(*) > 0 AS success, 12 | COUNT(*) AS num_of_records, 13 | 'Do we have data in covid19_by_country_and_region for date >= "{{ ds }}"' AS description 14 | FROM `dop_sandbox_us.covid19_by_country_and_region` 15 | {% if is_incremental() %} 16 | WHERE date >= "{{ ds }}" 17 | {% endif %} 18 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/invoke_sp_all_countries_and_regions.sql: -------------------------------------------------------------------------------- 1 | call {{ task['schema'] }}.sp_all_countries_and_regions(DATE('{{ ds }}')); 2 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/sp_all_countries_and_regions.sql: -------------------------------------------------------------------------------- 1 | SELECT distinct country_name, subregion1_name 2 | FROM `bigquery-public-data.covid19_open_data.covid19_open_data` 3 | WHERE date >= execution_date 4 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/stg_covid19.sql: -------------------------------------------------------------------------------- 1 | with covid_cases as ( 2 | 3 | SELECT * REPLACE(CAST(new_recovered AS INT64) as new_recovered, CAST(new_tested AS INT64) as new_tested) 4 | FROM `bigquery-public-data.covid19_open_data.covid19_open_data` 5 | {% if is_incremental() %} 6 | where date >= DATE("{{ ds }}") 7 | {% endif %} 8 | 9 | ) 10 | select * from covid_cases 11 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/udf_empty_str_as_null.sql: -------------------------------------------------------------------------------- 1 | IF(TRIM(str) = '', NULL, str) 2 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/udf_unpivot.sql: -------------------------------------------------------------------------------- 1 | ( 2 | SELECT 3 | ARRAY_AGG(STRUCT( 4 | REGEXP_EXTRACT(y, '[^"]*') AS key 5 | , REGEXP_EXTRACT(y, r':([^"]*)\"?[,}\]]') AS value 6 | )) 7 | FROM UNNEST(( 8 | SELECT REGEXP_EXTRACT_ALL(json,col_regex||r'[^:]+:\"?[^"]+\"?') arr 9 | FROM (SELECT TO_JSON_STRING(x) json))) y 10 | ) 11 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_covid19/sql/view_covid19_by_country_and_region.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM `dop_sandbox_us.covid19_by_country_and_region` 2 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_dataflow_template/config.yaml: -------------------------------------------------------------------------------- 1 | # To run this example: 2 | # - Set enabled to true 3 | # - Add Dataflow Admin role to Composer service account 4 | # - Replace PROJECT_ID, REGION, TEMP_BUCKET and OUTPUT_BUCKET placeholders 5 | enabled: false 6 | schedule_interval: "0 * * * *" 7 | timezone: "Europe/London" 8 | 
schema: sample_dataset 9 | 10 | tasks: 11 | - identifier: dummy_start_operator 12 | kind: 13 | action: airflow_operator 14 | target: airflow.operators.dummy_operator.DummyOperator 15 | 16 | - identifier: dataflow_template 17 | kind: 18 | action: airflow_operator 19 | target: airflow.contrib.operators.dataflow_operator.DataflowTemplateOperator 20 | options: 21 | arguments: 22 | template: gs://dataflow-templates/latest/Word_Count 23 | job_name: word_count 24 | dataflow_default_options: 25 | project: PROJECT_ID 26 | region: REGION 27 | tempLocation: gs://TEMP_BUCKET/dataflow/staging/ 28 | parameters: 29 | inputFile: gs://dataflow-samples/shakespeare/kinglear.txt 30 | output: gs://OUTPUT_BUCKET/word_count/output 31 | dependencies: 32 | - dummy_start_operator 33 | 34 | - identifier: dummy_end_operator 35 | kind: 36 | action: airflow_operator 37 | target: airflow.operators.dummy_operator.DummyOperator 38 | dependencies: 39 | - dataflow_template 40 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_external_task_sensor/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "0 * * * *" 3 | timezone: "Europe/London" 4 | schema: sample_dataset 5 | 6 | tasks: 7 | - identifier: external_task_sensor 8 | kind: 9 | action: airflow_operator 10 | target: airflow.sensors.external_task_sensor.ExternalTaskSensor 11 | options: 12 | arguments: 13 | external_dag_id: dop__example_upstream_dependency 14 | external_task_id: upstream_dependency 15 | execution_delta: !!python/object/apply:datetime.timedelta [0, 300] 16 | 17 | - identifier: dummy_operator 18 | kind: 19 | action: airflow_operator 20 | target: airflow.operators.dummy_operator.DummyOperator 21 | dependencies: 22 | - external_task_sensor 23 | -------------------------------------------------------------------------------- /examples/service_project/embedded_dop/orchestration/example_upstream_dependency/config.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | schedule_interval: "55 * * * *" 3 | timezone: "Europe/London" 4 | schema: sample_dataset 5 | 6 | tasks: 7 | - identifier: upstream_dependency 8 | kind: 9 | action: airflow_operator 10 | target: airflow.operators.dummy_operator.DummyOperator 11 | -------------------------------------------------------------------------------- /infrastructure/cloudbuild/build-dbt.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | entrypoint: 'bash' 4 | args: ['-c', 'docker pull eu.gcr.io/${PROJECT_ID}/dop-dbt:latest || exit 0'] 5 | - name: 'gcr.io/cloud-builders/docker' 6 | entrypoint: 'bash' 7 | args: [ 8 | '-c', 9 | 'docker build . 
-f embedded_dop/source/infrastructure/executor/dbt/Dockerfile -t eu.gcr.io/${PROJECT_ID}/dop-dbt:${SHORT_SHA}-${_DATETIME} -t eu.gcr.io/${PROJECT_ID}/dop-dbt:latest --cache-from eu.gcr.io/${PROJECT_ID}/dop-dbt:latest' 10 | ] 11 | images: ['eu.gcr.io/${PROJECT_ID}/dop-dbt:${SHORT_SHA}-${_DATETIME}', 'eu.gcr.io/${PROJECT_ID}/dop-dbt:latest'] 12 | -------------------------------------------------------------------------------- /infrastructure/cloudbuild/build.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | id: 'Generate Artifact ID' 4 | entrypoint: '/bin/bash' 5 | args: [ 6 | "-c", 7 | 'echo -n ${REPO_NAME}_${BRANCH_NAME}_${SHORT_SHA}_${BUILD_ID} > .artifact_id' 8 | ] 9 | - name: 'gcr.io/cloud-builders/docker' 10 | id: 'Generate Commit Hash. This is used by other services, do not modify.' 11 | entrypoint: '/bin/bash' 12 | args: [ 13 | "-c", 14 | 'echo -n ${SHORT_SHA}-${_DATETIME} > .commit-hash' 15 | ] 16 | - name: 'gcr.io/cloud-builders/gsutil' 17 | id: 'Store Artifact, `dop_` is added to the REPO_NAME as a prefix to avoid naming conflict' 18 | entrypoint: '/bin/bash' 19 | args: [ 20 | "-c", 21 | 'gsutil -m rsync -r -d . gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/$(cat .artifact_id)/dags/dop_${REPO_NAME}' 22 | ] 23 | - name: 'gcr.io/cloud-builders/docker' 24 | id: 'Display Artifact ID - This can be used to deploy' 25 | entrypoint: '/bin/bash' 26 | args: [ 27 | "-c", 28 | 'cat .artifact_id' 29 | ] 30 | -------------------------------------------------------------------------------- /infrastructure/cloudbuild/deploy.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | id: 'Display Artifact ID' 4 | entrypoint: '/bin/bash' 5 | args: [ 6 | "-c", 7 | 'echo ${_DOP_ARTIFACT_ID}' 8 | ] 9 | - name: 'gcr.io/cloud-builders/gsutil' 10 | id: 'List files in the dags/ folder' 11 | entrypoint: '/bin/bash' 12 | args: [ 13 | "-c", 14 | 'gsutil ls gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/${_DOP_ARTIFACT_ID}/dags' 15 | ] 16 | - name: 'gcr.io/cloud-builders/gsutil' 17 | id: 'Deploy' 18 | entrypoint: '/bin/bash' 19 | args: [ 20 | "-c", 21 | 'gsutil -m rsync -r -d gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/${_DOP_ARTIFACT_ID}/dags/dop_${REPO_NAME} gs://${_DEPLOY_BUCKET_NAME}/dags/dop_${REPO_NAME}' 22 | ] 23 | - name: 'gcr.io/cloud-builders/gsutil' 24 | id: 'Log Deployment' 25 | entrypoint: '/bin/bash' 26 | args: [ 27 | "-c", 28 | 'touch ${_DOP_ARTIFACT_ID} && gsutil -m cp ${_DOP_ARTIFACT_ID} gs://${_CLOUDBUILD_ARTIFACTS_BUCKET_NAME}/deploys/$(date -u "+%Y-%m-%d_%H-%M-%S")/' 29 | ] 30 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/README.md: -------------------------------------------------------------------------------- 1 | # DBT docs 2 | 3 | DBT documentation is generated using `dbt docs generate`. This command generates in the `target` folder of the project HTML documentation that can be served with any web server. 4 | 5 | DBT also provides a command to serve documentation `dbt docs serve`. It starts a web server in the port 8080 (the port number can be changes using the `--port` param). 6 | 7 | More information about these commands can be found at 8 | 9 | https://docs.getdbt.com/reference/commands/cmd-docs 10 | 11 | 12 | ## Generating DBT docs 13 | 14 | DOP provides the option to generate DBT docs using a task in the `config.yaml` file. 
15 | This task generates the files and copies them to the bucket provided in the task arguments 16 | 17 | - identifier: dbt_start_docs 18 | kind: 19 | action: dbt 20 | target: docs generate 21 | options: 22 | project: PROJECT_NAME 23 | version: DBT_VERSION 24 | arguments: 25 | - option: --bucket 26 | value: DBT_DOCS_BUCKET 27 | - option: --bucket-path 28 | value: DBT_DOCS_PATH 29 | 30 | The `--bucket-path` option is optional. If not present, files will be copied to the root folder of the bucket. 31 | 32 | 33 | ## Serving a static website in Google Cloud 34 | 35 | Once the files have been generated, Google Cloud provides several options to serve them as a static website: 36 | 37 | https://cloud.google.com/architecture/web-serving-overview 38 | 39 | 40 | ### GCS static site 41 | 42 | The simplest option is to mark the bucket as public; files can then be accessed using the URL 43 | 44 | https://storage.googleapis.com/DBT_DOCS_BUCKET/DBT_DOCS_PATH/index.html 45 | 46 | This is the simplest configuration option, but the website will be public to anyone who knows the URL. 47 | 48 | To use an HTTPS custom domain, a load balancer is required: 49 | 50 | https://cloud.google.com/storage/docs/hosting-static-website 51 | 52 | GCS doesn't provide the option to have a private website requiring authentication. 53 | There is a [feature request](https://issuetracker.google.com/issues/114133245?pli=1) to implement this functionality, but it hasn't been implemented yet. 54 | 55 | To allow access only to authenticated users, an App Engine application is provided, described in the next section. 56 | 57 | 58 | ## AppEngine 59 | 60 | App Engine allows authentication, but it doesn't allow writing to the filesystem. 61 | 62 | The DBT task described above creates files in GCS, but these files cannot be copied to the local disk to be served as static resources. 63 | 64 | The chosen approach has been to create a Flask application that reads the docs files from GCS and serves them directly. 65 | This application is in the `app-engine` folder. 66 | 67 | To avoid reading the files on every request, a [cache](https://docs.python.org/3/library/functools.html#functools.lru_cache) 68 | mechanism has been implemented that is cleared periodically. 69 | 70 | The cache duration, bucket and path can be configured as environment variables. 71 | If not set in `app.yaml`, the following default values are used: 72 | 73 | - DBT_BUCKET_NAME: the default App Engine bucket (PROJECT_NAME.appspot.com) 74 | - DBT_BUCKET_PATH: empty. Files are read from the root folder of the bucket 75 | - CACHE_MAX_AGE_IN_SECONDS: 300 seconds. If the previous request was received more than 5 minutes ago, the cache is cleared 76 | 77 | 78 | This app can be deployed by running `gcloud app deploy` from the `app-engine` folder. 79 | More information about deploying to App Engine can be found at 80 | 81 | https://cloud.google.com/appengine/docs/standard/python3/testing-and-deploying-your-app 82 | 83 | 84 | By default, App Engine allows public access, but it can easily be configured to require authentication using Identity-Aware Proxy. 85 | 86 | To configure it, follow the instructions at 87 | 88 | https://cloud.google.com/iap/docs/app-engine-quickstart 89 | 90 | To give access to all users of the organisation, add `allAuthenticatedUsers` as a member with the role `IAP-secured Web App User`; to restrict access, apply the role only to the groups or users that should have access.
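As a sketch, pointing the app at a specific docs bucket can be done with environment variables in `app.yaml`; the bucket name and path below are hypothetical placeholders, and `env_variables` is standard App Engine configuration.
```
runtime: python39

env_variables:
  DBT_BUCKET_NAME: "my-dbt-docs-bucket"  # hypothetical bucket where the dbt docs were copied
  DBT_BUCKET_PATH: "dbt"                 # optional sub-path inside the bucket
```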
91 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/.gcloudignore: -------------------------------------------------------------------------------- 1 | # This file specifies files that are *not* uploaded to Google Cloud Platform 2 | # using gcloud. It follows the same syntax as .gitignore, with the addition of 3 | # "#!include" directives (which insert the entries of the given .gitignore-style 4 | # file at that point). 5 | # 6 | # For more information, run: 7 | # $ gcloud topic gcloudignore 8 | # 9 | .gcloudignore 10 | # If you would like to upload your .git directory, .gitignore file or files 11 | # from your .gitignore file, remove the corresponding line 12 | # below: 13 | .git 14 | .gitignore 15 | 16 | # Python pycache: 17 | __pycache__/ 18 | # Ignored by the build system 19 | /setup.cfg 20 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/app.yaml: -------------------------------------------------------------------------------- 1 | runtime: python39 2 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from functools import lru_cache 4 | 5 | from flask import Flask 6 | from google.cloud import storage 7 | 8 | app = Flask(__name__) 9 | 10 | # Bucket where dbt docs are stored 11 | DBT_BUCKET_NAME = os.getenv("DBT_BUCKET_NAME") 12 | if not DBT_BUCKET_NAME: 13 | project_id = os.getenv("GOOGLE_CLOUD_PROJECT") 14 | if not project_id: 15 | raise ValueError( 16 | "'GOOGLE_CLOUD_PROJECT' or 'BUCKET_NAME' env variable must be set" 17 | ) 18 | DBT_BUCKET_NAME = f"{project_id}.appspot.com" 19 | 20 | # Path in the bucket where dbt docs are stored 21 | DBT_BUCKET_PATH = os.getenv("DBT_BUCKET_PATH", "") 22 | 23 | # Bucket files will be cached during this period 24 | CACHE_MAX_AGE_IN_SECONDS = os.getenv("CACHE_MAX_AGE_IN_SECONDS", 300) 25 | 26 | storage_client = storage.Client() 27 | bucket = storage_client.bucket(DBT_BUCKET_NAME) 28 | # Used for cache expiration 29 | last_cache_reload = time.time() 30 | 31 | 32 | @app.route("/") 33 | def index(): 34 | """ 35 | Read index.html file from GCS bucket 36 | """ 37 | return read_gcs_blob("index.html") 38 | 39 | 40 | @app.route("/catalog.json") 41 | def catalog(): 42 | """ 43 | Read catalog.json file from GCS bucket 44 | """ 45 | return read_gcs_blob("catalog.json") 46 | 47 | 48 | @app.route("/manifest.json") 49 | def manifest(): 50 | """ 51 | Read manifest.json file from GCS bucket 52 | """ 53 | return read_gcs_blob("manifest.json") 54 | 55 | 56 | @lru_cache(maxsize=3) 57 | def read_gcs_blob(name): 58 | """ 59 | Read a blob from GCS 60 | 61 | :param name: blob to be read from GCS 62 | :return: blob content 63 | """ 64 | path = f"{DBT_BUCKET_PATH}/{name}" if DBT_BUCKET_PATH else name 65 | blob = bucket.blob(path) 66 | return blob.download_as_bytes().decode("utf-8") 67 | 68 | 69 | @app.before_request 70 | def before_request(): 71 | """ 72 | function to run before each request. 
Clears the cache if it has expired. 73 | """ 74 | global last_cache_reload 75 | if time.time() - last_cache_reload > CACHE_MAX_AGE_IN_SECONDS: 76 | print("Clearing cache") 77 | read_gcs_blob.cache_clear() 78 | last_cache_reload = time.time() 79 | 80 | 81 | if __name__ == "__main__": 82 | app.run(host="127.0.0.1", port=8080, debug=True) 83 | -------------------------------------------------------------------------------- /infrastructure/dbt-docs/app-engine/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.0.1 2 | google-cloud-storage==1.38.0 3 | -------------------------------------------------------------------------------- /infrastructure/docker/README.md: -------------------------------------------------------------------------------- 1 | # Composer versions 2 | 3 | Not every deployment uses the same Composer/Airflow version. To be able to test and run 4 | those environments there are several `composer_{AIRFLOW_VERSION}` folders. 5 | 6 | Each of them contains the necessary elements to build its docker image and 7 | to be run without an entry point or a command. This way they are interchangeable 8 | when they get loaded by the `docker-compose-dop.yml`, used by the Data 9 | Engineers, or when loaded by the `docker-compose.yml` used by the core DOP 10 | developers. 11 | 12 | To use a certain version, it must be defined in one of the ways below: 13 | 14 | 1. Declared as a make variable inline in the CLI: 15 | ``` 16 | make build AIRFLOW_VERSION=1.10.15 17 | ``` 18 | 2. Exported as an environment variable: 19 | ``` 20 | export AIRFLOW_VERSION=1.10.15 21 | make build 22 | ``` 23 | 3. Defined in a `.env` file in the same folder as the Makefile to persist its 24 | value between terminal sessions and make `make` easier to call: 25 | ``` 26 | echo 'AIRFLOW_VERSION=1.10.15' >> .env 27 | make build 28 | ``` 29 | 30 | It could also be declared in the Makefile itself, but it's better to keep 31 | configuration separate from functionality. 32 | 33 | # Requirements files 34 | ## constrains-composer.txt 35 | 36 | This matches a specific Cloud Composer version, documented at https://cloud.google.com/composer/docs/concepts/versioning/composer-versions 37 | It is also used inside Docker to keep pip packages aligned with Composer as closely as possible. 38 | 39 | ## requirements.txt 40 | This contains extra pip dependencies required by the local Airflow environment in Docker.
It depends on `requirements.composer.txt` 41 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:1.10.10-2-python3.6 2 | LABEL maintainer="Datatonic" 3 | 4 | ARG AIRFLOW_HOME=/opt/airflow 5 | ENV AIRFLOW_HOME=${AIRFLOW_HOME} 6 | 7 | USER root 8 | # Install dos2unix used to resolve windows line ending issues 9 | # And gcc used in dbt packages compilation 10 | RUN apt-get update && apt-get install dos2unix gcc -y 11 | 12 | USER airflow 13 | 14 | # Install composer dependencies & additional required dependencies not included in Composer 15 | COPY composer_1.10.10/requirements.composer.txt /requirements.composer.txt 16 | COPY requirements.txt /pre-installed-requirements.txt 17 | RUN set -ex \ 18 | && pip install --user -r /pre-installed-requirements.txt 19 | 20 | COPY --chown=airflow:airflow script/entrypoint.sh ${AIRFLOW_HOME}/script/entrypoint.sh 21 | COPY --chown=airflow:airflow script/exec_entrypoint.sh ${AIRFLOW_HOME}/script/exec_entrypoint.sh 22 | 23 | # Resolve windows line ending issues 24 | RUN dos2unix -n ${AIRFLOW_HOME}/script/entrypoint.sh ${AIRFLOW_HOME}/script/entrypoint.sh 25 | RUN dos2unix -n ${AIRFLOW_HOME}/script/exec_entrypoint.sh ${AIRFLOW_HOME}/script/exec_entrypoint.sh 26 | 27 | # allow execution of entrypoint script 28 | RUN chmod +x ${AIRFLOW_HOME}/script/entrypoint.sh 29 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/requirements.composer.txt: -------------------------------------------------------------------------------- 1 | # Composer default versions 2 | # mirroring version: composer-1.12.1-airflow-1.10.10 3 | 4 | absl-py==0.9.0 5 | alembic==1.4.2 6 | amqp==2.6.0 7 | apache-airflow-backport-providers-google==2020.6.24 8 | apache-beam==2.23.0 9 | apispec==1.3.3 10 | argcomplete==1.11.1 11 | astunparse==1.6.3 12 | attrs==19.3.0 13 | avro-python3==1.9.1 14 | Babel==2.8.0 15 | bcrypt==3.1.6 16 | billiard==3.6.3.0 17 | cached-property==1.5.1 18 | cachetools==3.1.1 19 | cattrs==0.9.0 20 | celery==4.4.5 21 | certifi==2019.11.28 22 | cffi==1.14.0 23 | chardet==3.0.4 24 | click==6.7 25 | colorama==0.4.3 26 | colorlog==4.0.2 27 | configparser==3.5.3 28 | crcmod==1.7 29 | croniter==0.3.31 30 | cryptography==2.8 31 | defusedxml==0.6.0 32 | dill==0.3.1.1 33 | docutils==0.16 34 | fastavro==0.21.24 35 | fasteners==0.15 36 | Flask==1.1.1 37 | Flask-Admin==1.5.4 38 | Flask-AppBuilder==2.3.0 39 | Flask-Babel==1.0.0 40 | Flask-Bcrypt==0.7.1 41 | Flask-Caching==1.3.3 42 | Flask-JWT-Extended==3.24.1 43 | Flask-Login==0.4.1 44 | Flask-OpenID==1.2.5 45 | Flask-SQLAlchemy==2.4.1 46 | flask-swagger==0.2.13 47 | Flask-WTF==0.14.3 48 | flower==0.9.4 49 | funcsigs==1.0.2 50 | future==0.18.2 51 | gast==0.3.3 52 | google-ads==4.0.0 53 | google-api-core==1.16.0 54 | google-api-python-client==1.8.0 55 | google-apitools==0.5.31 56 | google-auth-httplib2==0.0.3 57 | google-auth-oauthlib==0.4.1 58 | google-auth==1.11.3 59 | google-cloud-automl==1.0.1 60 | google-cloud-automl==1.0.1 61 | google-cloud-bigquery-datatransfer==1.1.0 62 | google-cloud-bigquery==1.24.0 63 | google-cloud-bigtable==1.0.0 64 | google-cloud-container==0.4.0 65 | google-cloud-core==1.3.0 66 | google-cloud-datacatalog==0.7.0 67 | google-cloud-dataproc==0.5.0 68 | google-cloud-datastore==1.7.4 69 | google-cloud-dlp==0.13.0 70 | google-cloud-kms==1.4.0 71 | 
google-cloud-language==1.3.0 72 | google-cloud-logging==1.15.0 73 | google-cloud-monitoring==1.0.0 74 | google-cloud-pubsub==1.0.2 75 | google-cloud-redis==1.0.0 76 | google-cloud-redis==1.0.0 77 | google-cloud-secret-manager==0.2.0 78 | google-cloud-spanner==1.13.0 79 | google-cloud-speech==1.3.2 80 | google-cloud-storage==1.26.0 81 | google-cloud-tasks==1.5.0 82 | google-cloud-texttospeech==1.0.1 83 | google-cloud-translate==2.0.1 84 | google-cloud-videointelligence==1.13.0 85 | google-cloud-vision==1.0.0 86 | google-pasta==0.2.0 87 | google-resumable-media==0.5.0 88 | googleapis-common-protos==1.51.0 89 | graphviz==0.13.2 90 | grpc-google-iam-v1==0.12.3 91 | grpcio==1.29.0 92 | grpcio-gcp==0.2.2 93 | gunicorn==19.10.0 94 | h5py==2.10.0 95 | httplib2==0.17.0 96 | humanize==0.5.1 97 | idna==2.9 98 | importlib-metadata==1.5.0 99 | iso8601==0.1.12 100 | itsdangerous==1.1.0 101 | Jinja2==2.10.3 102 | json-merge-patch==0.2 103 | jsonschema==3.2.0 104 | Keras-Preprocessing==1.1.2 105 | kombu==4.6.10 106 | kubernetes==11.0.0 107 | lazy-object-proxy==1.4.3 108 | lockfile==0.12.2 109 | Mako==1.1.2 110 | Markdown==2.6.11 111 | MarkupSafe==1.1.1 112 | marshmallow==2.19.5 113 | marshmallow-enum==1.5.1 114 | marshmallow-sqlalchemy==0.18.0 115 | mock==2.0.0 116 | monotonic==1.5 117 | mysqlclient==1.3.14 118 | numpy==1.18.2 119 | oauthlib==3.1.0 120 | opt-einsum==3.2.1 121 | pandas==0.25.3 122 | pandas-gbq==0.13.1 123 | pendulum==1.4.4 124 | # pip==19.0.2 not applicable for docker 125 | pipdeptree==1.0.0 126 | prison==0.1.3 127 | protobuf==3.11.3 128 | psutil==5.7.0 129 | pyasn1==0.4.8 130 | psycopg2-binary==2.8.4 131 | pyarrow==0.15.1 132 | pyasn1-modules==0.2.8 133 | pycparser==2.20 134 | pydata-google-auth==0.3.0 135 | pydot==1.4.1 136 | Pygments==2.6.1 137 | PyJWT==1.7.1 138 | pymongo==3.9.0 139 | pyOpenSSL==19.1.0 140 | pyrsistent==0.15.7 141 | python-daemon==2.1.2 142 | python-dateutil==2.8.1 143 | python-editor==1.0.4 144 | python-http-client==3.2.7 145 | python3-openid==3.1.0 146 | pytz==2019.3 147 | pytzdata==2019.3 148 | PyYAML==5.3.1 149 | redis==3.5.3 150 | requests==2.23.0 151 | requests-oauthlib==1.3.0 152 | rsa==4.0 153 | scipy==1.4.1 154 | sendgrid==5.6.0 155 | setproctitle==1.1.10 156 | setuptools==47.3.1 157 | six==1.14.0 158 | SQLAlchemy==1.3.15 159 | SQLAlchemy-JSONField==0.9.0 160 | SQLAlchemy-Utils==0.36.3 161 | statsd==3.3.0 162 | tabulate==0.8.6 163 | tenacity==4.12.0 164 | tensorboard==2.2.2 165 | tensorboard-plugin-wit==1.6.0.post3 166 | tensorflow==2.2.0 167 | tensorflow-estimator==2.2.0 168 | termcolor==1.1.0 169 | text-unidecode==1.2 170 | thrift==0.13.0 171 | tornado==6.0.4 172 | typing==3.7.4.1 173 | typing-extensions==3.7.4.1 174 | tzlocal==1.5.1 175 | unicodecsv==0.14.1 176 | uritemplate==3.0.1 177 | urllib3==1.25.8 178 | vine==1.3.0 179 | virtualenv==16.2.0 180 | websocket-client==0.57.0 181 | Werkzeug==0.16.1 182 | wheel==0.34.2 183 | wrapt==1.12.1 184 | WTForms==2.2.1 185 | zipp==1.2.0 186 | zope.deprecation==4.4.0 187 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.composer.txt 2 | # Additional / modified versions 3 | dataclasses==0.7 4 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/script/entrypoint.sh: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env bash 2 | airflow initdb 3 | airflow scheduler & 4 | airflow webserver 5 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.10/script/exec_entrypoint.sh: -------------------------------------------------------------------------------- 1 | export POSTGRES_HOST="postgres" 2 | export POSTGRES_PORT="5432" 3 | export POSTGRES_USER="airflow" 4 | export POSTGRES_PASSWORD="airflow" 5 | export POSTGRES_DB="airflow" 6 | export POSTGRES_EXTRAS="" 7 | 8 | AIRFLOW__CORE__SQL_ALCHEMY_CONN="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 9 | AIRFLOW__CELERY__RESULT_BACKEND="db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 10 | 11 | export AIRFLOW__CORE__SQL_ALCHEMY_CONN 12 | export AIRFLOW__CELERY__RESULT_BACKEND 13 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.14/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:1.10.14-python3.6 2 | 3 | # Install composer dependencies & additional required dependencies not included in Composer 4 | COPY constrains-composer.txt requirements.txt ./ 5 | RUN set -ex && pip install --user -r requirements.txt 6 | 7 | ENTRYPOINT airflow initdb; airflow scheduler & airflow webserver 8 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.14/constrains-composer.txt: -------------------------------------------------------------------------------- 1 | # Composer default versions 2 | # mirroring version: composer-1.15.2-airflow-1.10.14 3 | 4 | absl-py==0.11.0 5 | alembic==1.4.3 6 | amqp==2.6.1 7 | apache-airflow-backport-providers-apache-beam==2021.2.5 8 | apache-airflow-backport-providers-cncf-kubernetes==2021.2.5 9 | apache-airflow-backport-providers-google==2021.2.5 10 | apache-airflow-upgrade-check==1.0.0 11 | apache-beam==2.27.0 12 | apispec==1.3.3 13 | appdirs==1.4.4 14 | argcomplete==1.12.2 15 | astunparse==1.6.3 16 | attrs==20.3.0 17 | avro-python3==1.9.2.1 18 | Babel==2.9.0 19 | bcrypt==3.2.0 20 | billiard==3.6.3.0 21 | cached-property==1.5.2 22 | cachetools==4.1.1 23 | cattrs==1.0.0 24 | celery==4.4.7 25 | certifi==2020.11.8 26 | cffi==1.14.4 27 | chardet==3.0.4 28 | click==6.7 29 | colorama==0.4.4 30 | colorlog==4.0.2 31 | configparser==3.5.3 32 | crcmod==1.7 33 | croniter==0.3.36 34 | cryptography==3.2.1 35 | dataclasses==0.8 36 | defusedxml==0.6.0 37 | dill==0.3.1.1 38 | distlib==0.3.1 39 | dnspython==1.16.0 40 | docopt==0.6.2 41 | docutils==0.15.2 42 | email-validator==1.1.2 43 | fastavro==1.2.0 44 | fasteners==0.15 45 | filelock==3.0.12 46 | Flask==1.1.2 47 | Flask-Admin==1.5.4 48 | Flask-AppBuilder==2.3.4 49 | Flask-Babel==1.0.0 50 | Flask-Bcrypt==0.7.1 51 | Flask-Caching==1.3.3 52 | Flask-JWT-Extended==3.25.0 53 | Flask-Login==0.4.1 54 | Flask-OpenID==1.2.5 55 | Flask-SQLAlchemy==2.4.4 56 | flask-swagger==0.2.14 57 | Flask-WTF==0.14.3 58 | flower==0.9.5 59 | funcsigs==1.0.2 60 | future==0.18.2 61 | gast==0.3.3 62 | google-ads==4.0.0 63 | google-api-core==1.26.0 64 | google-api-python-client==1.12.8 65 | google-apitools==0.5.31 66 | google-auth==1.24.0 67 | google-auth-httplib2==0.0.4 68 | google-auth-oauthlib==0.4.2 69 | google-cloud-automl==2.1.0 70 | google-cloud-bigquery==2.4.0 71 | google-cloud-bigquery-datatransfer==3.0.0 72 
| google-cloud-bigquery-storage==2.1.0 73 | google-cloud-bigtable==1.6.0 74 | google-cloud-build==2.0.0 75 | google-cloud-container==1.0.1 76 | google-cloud-core==1.4.3 77 | google-cloud-datacatalog==3.0.0 78 | google-cloud-dataproc==2.2.0 79 | google-cloud-datastore==1.15.3 80 | google-cloud-dlp==1.0.0 81 | google-cloud-kms==2.2.0 82 | google-cloud-language==1.3.0 83 | google-cloud-logging==2.2.0 84 | google-cloud-memcache==0.2.0 85 | google-cloud-monitoring==2.0.0 86 | google-cloud-os-login==2.1.0 87 | google-cloud-pubsub==2.1.0 88 | google-cloud-pubsublite==0.1.0 89 | google-cloud-redis==2.0.0 90 | google-cloud-secret-manager==1.0.0 91 | google-cloud-spanner==1.19.1 92 | google-cloud-speech==1.3.2 93 | google-cloud-storage==1.33.0 94 | google-cloud-tasks==2.1.0 95 | google-cloud-texttospeech==1.0.1 96 | google-cloud-translate==1.7.0 97 | google-cloud-videointelligence==1.16.1 98 | google-cloud-vision==1.0.0 99 | google-cloud-workflows==0.2.0 100 | google-crc32c==1.0.0 101 | google-pasta==0.2.0 102 | google-resumable-media==1.1.0 103 | googleapis-common-protos==1.52.0 104 | graphviz==0.15 105 | grpc-google-iam-v1==0.12.3 106 | grpcio==1.33.2 107 | grpcio-gcp==0.2.2 108 | gunicorn==20.0.4 109 | h5py==2.10.0 110 | hdfs==2.5.8 111 | httplib2==0.17.4 112 | humanize==3.1.0 113 | idna==2.8 114 | importlib-metadata==2.1.0 115 | importlib-resources==1.5.0 116 | iso8601==0.1.13 117 | itsdangerous==1.1.0 118 | Jinja2==2.11.2 119 | json-merge-patch==0.2 120 | jsonschema==3.2.0 121 | Keras-Preprocessing==1.1.2 122 | kombu==4.6.11 123 | kubernetes==11.0.0 124 | lazy-object-proxy==1.4.3 125 | libcst==0.3.14 126 | lockfile==0.12.2 127 | Mako==1.1.3 128 | Markdown==2.6.11 129 | MarkupSafe==1.1.1 130 | marshmallow==2.21.0 131 | marshmallow-enum==1.5.1 132 | marshmallow-sqlalchemy==0.23.1 133 | mock==2.0.0 134 | monotonic==1.5 135 | mypy-extensions==0.4.3 136 | mysqlclient==1.3.14 137 | natsort==7.1.0 138 | numpy==1.19.4 139 | oauth2client==4.1.3 140 | oauthlib==3.1.0 141 | opt-einsum==3.3.0 142 | overrides==3.1.0 143 | packaging==20.7 144 | pandas==1.1.4 145 | pandas-gbq==0.14.1 146 | pbr==5.5.1 147 | pendulum==1.4.4 148 | pep562==1.0 149 | pip==20.1.1 150 | pipdeptree==1.0.0 151 | prison==0.1.3 152 | prometheus-client==0.8.0 153 | proto-plus==1.11.0 154 | protobuf==3.14.0 155 | psutil==5.7.3 156 | psycopg2-binary==2.8.6 157 | pyarrow==2.0.0 158 | pyasn1==0.4.8 159 | pyasn1-modules==0.2.8 160 | pycparser==2.20 161 | pydata-google-auth==1.1.0 162 | pydot==1.4.1 163 | Pygments==2.7.2 164 | PyJWT==1.7.1 165 | pymongo==3.10.1 166 | pyOpenSSL==20.0.0 167 | pyparsing==2.4.7 168 | pyrsistent==0.17.3 169 | python-daemon==2.2.4 170 | python-dateutil==2.8.1 171 | python-editor==1.0.4 172 | python-http-client==3.3.1 173 | python-nvd3==0.15.0 174 | python-slugify==4.0.1 175 | python3-openid==3.2.0 176 | pytz==2020.4 177 | pytzdata==2020.1 178 | PyYAML==5.3.1 179 | redis==3.5.3 180 | requests==2.25.0 181 | requests-oauthlib==1.3.0 182 | rsa==4.6 183 | scipy==1.4.1 184 | sendgrid==5.6.0 185 | setproctitle==1.2 186 | setuptools==51.0.0 187 | six==1.15.0 188 | SQLAlchemy==1.3.20 189 | SQLAlchemy-JSONField==0.9.0 190 | SQLAlchemy-Utils==0.36.8 191 | statsd==3.3.0 192 | tabulate==0.8.7 193 | tenacity==4.12.0 194 | tensorboard==2.2.2 195 | tensorboard-plugin-wit==1.7.0 196 | tensorflow==2.2.0 197 | tensorflow-estimator==2.2.0 198 | termcolor==1.1.0 199 | text-unidecode==1.3 200 | thrift==0.13.0 201 | tornado==5.1.1 202 | typing==3.7.4.3 203 | typing-extensions==3.7.4.3 204 | typing-inspect==0.6.0 205 | tzlocal==1.5.1 206 | 
unicodecsv==0.14.1 207 | uritemplate==3.0.1 208 | urllib3==1.25.11 209 | vine==1.3.0 210 | virtualenv==20.2.1 211 | websocket-client==0.54.0 212 | Werkzeug==0.16.1 213 | wheel==0.36.1 214 | wrapt==1.12.1 215 | WTForms==2.3.3 216 | zipp==3.4.0 217 | zope.deprecation==4.4.0 218 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.14/requirements.txt: -------------------------------------------------------------------------------- 1 | -r constrains-composer.txt 2 | # Additional / modified versions 3 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.15/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:1.10.15-python3.6 2 | 3 | # Install composer dependencies & additional required dependencies not included in Composer 4 | COPY constrains-composer.txt requirements.txt ./ 5 | RUN set -ex && pip install --user -r requirements.txt 6 | 7 | ENTRYPOINT airflow db init; airflow scheduler & airflow webserver 8 | -------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.15/constrains-composer.txt: -------------------------------------------------------------------------------- 1 | # Composer default versions 2 | # mirroring version: composer-1.16.1-airflow-1.10.15 3 | 4 | absl-py==0.12.0 5 | alembic==1.5.7 6 | amqp==2.6.1 7 | apache-airflow-backport-providers-apache-beam==2021.3.13 8 | apache-airflow-backport-providers-cncf-kubernetes==2021.3.3 9 | apache-airflow-backport-providers-google==2021.3.3 10 | apache-beam==2.27.0 11 | apispec==1.3.3 12 | appdirs==1.4.4 13 | argcomplete==1.12.2 14 | astunparse==1.6.3 15 | attrs==20.3.0 16 | avro-python3==1.9.2.1 17 | Babel==2.9.0 18 | bcrypt==3.2.0 19 | billiard==3.6.3.0 20 | cached-property==1.5.2 21 | cachetools==4.2.1 22 | cattrs==1.0.0 23 | celery==4.4.7 24 | certifi==2020.12.5 25 | cffi==1.14.5 26 | chardet==4.0.0 27 | click==6.7 28 | colorama==0.4.4 29 | colorlog==4.0.2 30 | configparser==3.5.3 31 | crcmod==1.7 32 | croniter==0.3.37 33 | cryptography==3.4.6 34 | dataclasses==0.8 35 | defusedxml==0.7.1 36 | dill==0.3.1.1 37 | distlib==0.3.1 38 | dnspython==2.1.0 39 | docopt==0.6.2 40 | docutils==0.16 41 | email-validator==1.1.2 42 | fastavro==1.3.4 43 | fasteners==0.16 44 | filelock==3.0.12 45 | Flask==1.1.2 46 | Flask-Admin==1.5.4 47 | Flask-AppBuilder==2.3.4 48 | Flask-Babel==1.0.0 49 | Flask-Bcrypt==0.7.1 50 | Flask-Caching==1.3.3 51 | Flask-JWT-Extended==3.25.1 52 | Flask-Login==0.4.1 53 | Flask-OpenID==1.2.5 54 | Flask-SQLAlchemy==2.5.1 55 | flask-swagger==0.2.14 56 | Flask-WTF==0.14.3 57 | flower==0.9.7 58 | funcsigs==1.0.2 59 | future==0.18.2 60 | gast==0.3.3 61 | google-ads==4.0.0 62 | google-api-core==1.26.1 63 | google-api-python-client==1.12.8 64 | google-apitools==0.5.31 65 | google-auth==1.28.0 66 | google-auth-httplib2==0.1.0 67 | google-auth-oauthlib==0.4.3 68 | google-cloud-automl==2.2.0 69 | google-cloud-bigquery==2.13.0 70 | google-cloud-bigquery-datatransfer==3.1.0 71 | google-cloud-bigquery-storage==2.1.0 72 | google-cloud-bigtable==1.7.0 73 | google-cloud-build==2.0.0 74 | google-cloud-container==1.0.1 75 | google-cloud-core==1.6.0 76 | google-cloud-datacatalog==3.1.0 77 | google-cloud-dataproc==2.3.0 78 | google-cloud-datastore==1.15.3 79 | google-cloud-dlp==1.0.0 80 | google-cloud-kms==2.2.0 81 | google-cloud-language==1.3.0 82 | google-cloud-logging==2.2.0 83 | 
google-cloud-memcache==0.3.0 84 | google-cloud-monitoring==2.0.0 85 | google-cloud-os-login==2.1.0 86 | google-cloud-pubsub==2.3.0 87 | google-cloud-pubsublite==0.3.0 88 | google-cloud-redis==2.1.0 89 | google-cloud-secret-manager==1.0.0 90 | google-cloud-spanner==1.19.1 91 | google-cloud-speech==1.3.2 92 | google-cloud-storage==1.36.2 93 | google-cloud-tasks==2.2.0 94 | google-cloud-texttospeech==1.0.1 95 | google-cloud-translate==1.7.0 96 | google-cloud-videointelligence==1.16.1 97 | google-cloud-vision==1.0.0 98 | google-cloud-workflows==0.2.0 99 | google-crc32c==1.1.2 100 | google-pasta==0.2.0 101 | google-resumable-media==1.2.0 102 | googleapis-common-protos==1.53.0 103 | graphviz==0.16 104 | greenlet==1.0.0 105 | grpc-google-iam-v1==0.12.3 106 | grpcio==1.36.1 107 | grpcio-gcp==0.2.2 108 | gunicorn==20.0.4 109 | h5py==2.10.0 110 | hdfs==2.6.0 111 | httplib2==0.17.4 112 | humanize==3.3.0 113 | idna==2.8 114 | importlib-metadata==2.1.1 115 | importlib-resources==1.5.0 116 | iso8601==0.1.14 117 | itsdangerous==1.1.0 118 | Jinja2==2.11.3 119 | json-merge-patch==0.2 120 | jsonschema==3.2.0 121 | Keras-Preprocessing==1.1.2 122 | kombu==4.6.11 123 | kubernetes==11.0.0 124 | lazy-object-proxy==1.4.3 125 | libcst==0.3.17 126 | lockfile==0.12.2 127 | Mako==1.1.4 128 | Markdown==2.6.11 129 | MarkupSafe==1.1.1 130 | marshmallow==2.21.0 131 | marshmallow-enum==1.5.1 132 | marshmallow-sqlalchemy==0.23.1 133 | mock==2.0.0 134 | monotonic==1.5 135 | mypy-extensions==0.4.3 136 | mysqlclient==1.3.14 137 | natsort==7.1.1 138 | numpy==1.19.5 139 | oauth2client==4.1.3 140 | oauthlib==3.1.0 141 | opt-einsum==3.3.0 142 | overrides==3.1.0 143 | packaging==20.9 144 | pandas==1.1.5 145 | pandas-gbq==0.14.1 146 | pbr==5.5.1 147 | pendulum==1.4.4 148 | pep562==1.0 149 | pip==20.1.1 150 | pipdeptree==1.0.0 151 | prison==0.1.3 152 | prometheus-client==0.8.0 153 | proto-plus==1.18.1 154 | protobuf==3.15.6 155 | psutil==5.8.0 156 | psycopg2-binary==2.8.6 157 | pyarrow==2.0.0 158 | pyasn1==0.4.8 159 | pyasn1-modules==0.2.8 160 | pycparser==2.20 161 | pydata-google-auth==1.1.0 162 | pydot==1.4.2 163 | Pygments==2.8.1 164 | PyJWT==1.7.1 165 | pymongo==3.11.3 166 | pyOpenSSL==20.0.1 167 | pyparsing==2.4.7 168 | pyrsistent==0.17.3 169 | python-daemon==2.3.0 170 | python-dateutil==2.8.1 171 | python-editor==1.0.4 172 | python-http-client==3.3.2 173 | python-nvd3==0.15.0 174 | python-slugify==4.0.1 175 | python3-openid==3.2.0 176 | pytz==2021.1 177 | pytzdata==2020.1 178 | PyYAML==5.4.1 179 | redis==3.5.3 180 | requests==2.25.1 181 | requests-oauthlib==1.3.0 182 | rsa==4.7.2 183 | scipy==1.4.1 184 | sendgrid==5.6.0 185 | setproctitle==1.2.2 186 | setuptools==54.2.0 187 | six==1.15.0 188 | SQLAlchemy==1.3.20 189 | SQLAlchemy-JSONField==0.9.0 190 | SQLAlchemy-Utils==0.36.8 191 | starkbank-ecdsa==1.1.0 192 | statsd==3.3.0 193 | tabulate==0.8.9 194 | tenacity==4.12.0 195 | tensorboard==2.2.2 196 | tensorboard-plugin-wit==1.8.0 197 | tensorflow==2.2.0 198 | tensorflow-estimator==2.2.0 199 | termcolor==1.1.0 200 | text-unidecode==1.3 201 | thrift==0.13.0 202 | tornado==5.1.1 203 | typing==3.7.4.3 204 | typing-extensions==3.7.4.3 205 | typing-inspect==0.6.0 206 | tzlocal==1.5.1 207 | unicodecsv==0.14.1 208 | uritemplate==3.0.1 209 | urllib3==1.26.4 210 | vine==1.3.0 211 | virtualenv==20.4.3 212 | websocket-client==0.58.0 213 | Werkzeug==0.16.1 214 | wheel==0.36.2 215 | wrapt==1.12.1 216 | WTForms==2.3.3 217 | zipp==3.4.1 218 | zope.deprecation==4.4.0 219 | 
-------------------------------------------------------------------------------- /infrastructure/docker/composer_1.10.15/requirements.txt: -------------------------------------------------------------------------------- 1 | -r constrains-composer.txt 2 | # Additional / modified versions 3 | -------------------------------------------------------------------------------- /infrastructure/docker/developer_only/.airflowignore: -------------------------------------------------------------------------------- 1 | dop_service_project 2 | -------------------------------------------------------------------------------- /infrastructure/docker/docker-compose-dop.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | postgres: 4 | image: postgres:13.2-alpine 5 | container_name: dop_postgres 6 | restart: always 7 | environment: 8 | - POSTGRES_USER=airflow 9 | - POSTGRES_PASSWORD=airflow 10 | - POSTGRES_DB=airflow 11 | logging: 12 | options: 13 | max-size: 10m 14 | max-file: "3" 15 | 16 | webserver: 17 | build: composer_${AIRFLOW_VERSION} 18 | container_name: dop_webserver 19 | restart: always 20 | entrypoint: ./script/entrypoint.sh 21 | depends_on: 22 | - postgres 23 | environment: 24 | - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow 25 | - AIRFLOW__CORE__EXECUTOR=LocalExecutor 26 | - AIRFLOW__CORE__LOGGING_LEVEL=INFO 27 | - GOOGLE_APPLICATION_CREDENTIALS=/secret/gcp-credentials/application_default_credentials.json 28 | - DOP_SANDBOX_ENVIRONMENT=true # set to true if running locally on a laptop; this enables certain features such as service account impersonation 29 | 30 | # The following environment variables need to be set on both the local Docker environment and the Composer environment 31 | - DOP_SERVICE_PROJECT_PATH=/opt/airflow/dags/dop_service_project # The absolute path of the service project. Each DBT project in this path must be in its own folder and must be valid, e.g. on Docker this could be /opt/airflow/dags/dop/dbt-projects.
On Composer, it could be anywhere under the `/home/airflow/gcs/dags` or `/home/airflow/gcs/data` directory 32 | - DOP_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP project_id - to be used as the project where data will be consumed & persisted 33 | - DOP_LOCATION=${LOCATION?location_is_undefined} # GCP region - to be used to persist all data 34 | - DOP_INFRA_PROJECT_ID=${PROJECT_ID?project_id_is_undefined} # GCP infrastructure project id; not used for local development, so it is left the same as the GCP service project id 35 | logging: 36 | options: 37 | max-size: 10m 38 | max-file: "3" 39 | volumes: 40 | - ${SERVICE_PROJECT_ABS_PATH}:/opt/airflow/dags/dop_service_project 41 | - ~/.config/gcloud/application_default_credentials.json:/secret/gcp-credentials/application_default_credentials.json # mount application default credentials only, so no keys are used 42 | # - ${SERVICE_PROJECT_ABS_PATH}/plugins:/usr/local/airflow/plugins 43 | ports: 44 | - "8082:8080" 45 | command: webserver 46 | healthcheck: 47 | test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"] 48 | interval: 30s 49 | timeout: 30s 50 | retries: 3 51 | -------------------------------------------------------------------------------- /infrastructure/executor/dbt/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk:slim 2 | 3 | ENV LANG C.UTF-8 4 | ENV LC_ALL C.UTF-8 5 | 6 | ARG DBT_HOME=/home/dbtuser 7 | ARG BUILD_DIR=/tmp/dbt_build_tmp 8 | 9 | RUN apt-get update && apt-get install -y git 10 | 11 | RUN set -ex \ 12 | && pip3 install PyYAML \ 13 | && pip3 install pipenv 14 | 15 | RUN groupadd -g 999 dbtuser && useradd -r -u 999 -g dbtuser dbtuser 16 | WORKDIR ${DBT_HOME} 17 | 18 | RUN chown -R dbtuser:dbtuser ${DBT_HOME} 19 | 20 | USER dbtuser 21 | RUN mkdir ${DBT_HOME}/.dbt 22 | 23 | RUN mkdir ${BUILD_DIR} 24 | 25 | # Install the pinned pip dependencies from the Pipfile / Pipfile.lock 26 | COPY --chown=dbtuser:dbtuser ./embedded_dop/executor_config/dbt/Pipfile ./embedded_dop/executor_config/dbt/Pipfile.lock ./ 27 | RUN pipenv sync 28 | 29 | # Store the whole service project repository in the temporary build dir, build what's required and then delete everything else 30 | COPY --chown=dbtuser:dbtuser ./ ${BUILD_DIR} 31 | RUN ls -al ${BUILD_DIR} 32 | 33 | # initialise dbt 34 | RUN DBT_HOME=${DBT_HOME} BUILD_DIR=${BUILD_DIR} python3 ${BUILD_DIR}/embedded_dop/source/infrastructure/executor/dbt/init.py 35 | 36 | # remove the build dir 37 | RUN rm -rf ${BUILD_DIR} 38 | -------------------------------------------------------------------------------- /infrastructure/executor/dbt/README.md: -------------------------------------------------------------------------------- 1 | # Building the DBT executor docker image 2 | This builds the production-ready Docker image to run with the Kubernetes Pod Operator. 3 | 4 | ## How to start the build process 5 | See the Makefile in the `examples/service_project` folder for more details.
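For local experimentation, a hedged sketch of a manual build is shown below. It only assumes what the Dockerfile above implies - the build context must be the service project root, with the DOP source embedded under `embedded_dop/source` - and the image tag is a made-up example; the supported path remains the Makefile / Cloud Build setup mentioned above:

```
# Run from the service project root (the Docker build context), so that
# embedded_dop/executor_config and embedded_dop/source are available to COPY
docker build \
  -f embedded_dop/source/infrastructure/executor/dbt/Dockerfile \
  -t dbt-executor:local .
```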
6 | 7 | TODO: this container image is only used for production / cloud composer, but it would be better to bring this more inline with the local docker environment without compromising usability 8 | -------------------------------------------------------------------------------- /infrastructure/executor/dbt/init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import json 4 | import shutil 5 | import subprocess 6 | 7 | DOP_DBT_USER = "dop-dbt-user" 8 | 9 | try: 10 | from yaml import CLoader as Loader, CDumper as Dumper 11 | except ImportError: 12 | from yaml import Loader, Dumper # noqa: F401 13 | 14 | 15 | def copy_and_overwrite(from_path, to_path): 16 | if os.path.exists(to_path): 17 | shutil.rmtree(to_path) 18 | shutil.copytree(from_path, to_path) 19 | 20 | 21 | def yaml_to_dict(y): 22 | yml = yaml.load(y, Loader=Loader) 23 | return yml 24 | 25 | 26 | def dict_to_yaml(d): 27 | yml = yaml.load(json.dumps(d), Loader=Loader) 28 | return yaml.dump(yml) 29 | 30 | 31 | def build_profile_file_content(profile_ids): 32 | # the profile is generated dynamically at runtime, therefore multiple target profiles are not required 33 | target = "all" 34 | target_type = "bigquery" 35 | 36 | bq_profile = {} 37 | 38 | for profile_id in profile_ids: 39 | bq_profile[profile_id] = { 40 | "target": target, 41 | "outputs": { 42 | target: { 43 | "type": target_type, 44 | "method": "oauth", 45 | "project": '{{ env_var("DOP_PROJECT_ID") }}', 46 | "schema": '{{ env_var("DOP_DBT_SCHEMA", "' 47 | + str(profile_id).replace("-", "_") 48 | + '") }}', 49 | "threads": 1, 50 | "timeout_seconds": 300, 51 | "location": '{{ env_var("DOP_LOCATION") }}', 52 | "priority": "interactive", 53 | "impersonate_service_account": f"{DOP_DBT_USER}" 54 | + '@{{ env_var("DOP_PROJECT_ID") }}.iam.gserviceaccount.com', 55 | } 56 | }, 57 | } 58 | 59 | profile = dict_to_yaml(bq_profile) 60 | 61 | return profile 62 | 63 | 64 | def save_profile_yml(dbt_home, file_content): 65 | with open(os.path.sep.join([dbt_home, ".dbt", "profiles.yml"]), "w+") as fp: 66 | fp.write(file_content) 67 | 68 | 69 | dbt_home = os.environ["DBT_HOME"] 70 | build_dir = os.environ["BUILD_DIR"] 71 | 72 | print(f"DBT_HOME: {dbt_home}") 73 | print(f"BUILD_DIR: {build_dir}") 74 | 75 | dop_config_path = os.path.sep.join( 76 | [build_dir, "embedded_dop", "executor_config", "dbt", "config.yaml"] 77 | ) 78 | 79 | with open(dop_config_path) as fp_config: 80 | dop_config = yaml_to_dict(fp_config.read()) 81 | # validation 82 | if not dop_config.get("dbt_projects"): 83 | raise RuntimeError("The `dbt_projects` section must be defined") 84 | 85 | dbt_configs = dop_config.get("dbt_projects") 86 | profile_ids = [] 87 | dbt_projects_path = [] 88 | for dbt_config in dbt_configs: 89 | # validation 90 | if not dbt_config.get("project_path"): 91 | raise RuntimeError("`project_path` must be defined for DBT") 92 | 93 | project_path = dbt_config.get("project_path") 94 | 95 | project_yml_path = os.path.sep.join( 96 | [build_dir, project_path, "dbt_project.yml"] 97 | ) 98 | with open(project_yml_path) as fp_yml: 99 | profile_ids.append(yaml_to_dict(fp_yml.read()).get("profile")) 100 | 101 | # copy dbt projects to the dbt home location 102 | to_path = os.path.sep.join([dbt_home, project_path]) 103 | copy_and_overwrite( 104 | from_path=os.path.sep.join([build_dir, project_path]), to_path=to_path 105 | ) 106 | 107 | dbt_projects_path.append(to_path) 108 | 109 | # create the profiles yml file for all dbt projects 110 | if 
profile_ids: 111 | file_content = build_profile_file_content(profile_ids=profile_ids) 112 | 113 | print(f"profiles.yml: \n{file_content}") 114 | 115 | save_profile_yml(dbt_home=dbt_home, file_content=file_content) 116 | 117 | for dbt_project_path in dbt_projects_path: 118 | for dbt_cmd in ["clean", "deps"]: 119 | proc = subprocess.Popen( 120 | ["pipenv", "run", "dbt", dbt_cmd], 121 | stdout=subprocess.PIPE, 122 | cwd=dbt_project_path, 123 | ) 124 | while True: 125 | line = proc.stdout.readline() 126 | if not line: 127 | break 128 | print(line.rstrip()) 129 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration_tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teamdatatonic/dop/08b64972e9899971d5c4f892480aa0c067b53c3b/tests/integration_tests/.gitkeep -------------------------------------------------------------------------------- /tests/unit_tests/component/transformation/common/adapter/test_schema.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dags.dop.component.transformation.common.adapter import ( 3 | schema as transformation_schema, 4 | ) 5 | from dags.dop.component.transformation.common.adapter.schema import InvalidDagConfig 6 | 7 | 8 | def test_dag_config_overall_validation(): 9 | invalid_payload = {} 10 | schema = transformation_schema.DagConfigSchema() 11 | errors = schema.load(invalid_payload).errors 12 | 13 | assert errors.get("timezone") 14 | assert errors.get("tasks") 15 | 16 | 17 | def test_dag_config_cron_validation(): 18 | invalid_payload = {"schedule_interval": "0 *"} 19 | schema = transformation_schema.DagConfigSchema() 20 | errors = schema.load(invalid_payload).errors 21 | 22 | assert errors.get("schedule_interval") 23 | 24 | 25 | def test_task_overall_config_validation(): 26 | invalid_payload = { 27 | "schedule_interval": "0 1 * * *", 28 | "timezone": "Europe/London", 29 | "tasks": [{}], 30 | } 31 | schema = transformation_schema.DagConfigSchema() 32 | errors = schema.load(invalid_payload).errors 33 | 34 | assert errors["tasks"][0].get("identifier") 35 | assert errors["tasks"][0].get("kind") 36 | 37 | 38 | def test_task_kind_config_validation(): 39 | invalid_payload = { 40 | "schedule_interval": "0 1 * * *", 41 | "timezone": "Europe/London", 42 | "tasks": [ 43 | { 44 | "identifier": "stg_covid19", 45 | "kind": { 46 | "action": "materialization_invalid", 47 | "target": "table_invalid", 48 | }, 49 | } 50 | ], 51 | } 52 | schema = transformation_schema.DagConfigSchema() 53 | errors = schema.load(invalid_payload).errors 54 | 55 | assert errors["tasks"][0].get("kind").get("action") == ["Not a valid choice."] 56 | assert errors["tasks"][0].get("kind").get("target") is None 57 | 58 | 59 | def test_partitioning_validation_with_invalid_field(): 60 | payload = generate_valid_schema() 61 | payload["tasks"] = [ 62 | {"partitioning": {"field": "this_is_wrong", "data_type": "date"}} 63 | ] 64 | 65 | with pytest.raises(InvalidDagConfig): 66 | transformation_schema.load_dag_schema(payload) 67 | 68 | 69 | def test_partitioning_validation_with_invalid_data_type(): 70 | payload = generate_valid_schema() 71 | 
payload["tasks"] = [ 72 | {"partitioning": {"field": "my_field", "data_type": "an_invalid_type"}} 73 | ] 74 | 75 | with pytest.raises(InvalidDagConfig): 76 | transformation_schema.load_dag_schema(payload) 77 | 78 | 79 | def test_partitioning_validation_for_timestamp_type(): 80 | valid_payload = generate_valid_schema() 81 | valid_payload["tasks"] = [ 82 | { 83 | "partitioning": {"field": "date", "data_type": "timestamp"}, 84 | "identifier": "stg_covid19", 85 | "schema": "dop_sandbox_us", 86 | "kind": {"action": "materialization", "target": "table"}, 87 | "dependencies": ["a", "b", "c"], 88 | } 89 | ] 90 | 91 | dag_config = transformation_schema.load_dag_schema(valid_payload) 92 | 93 | assert isinstance( 94 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 95 | ) 96 | assert dag_config.tasks[0].partitioning.data_type == "timestamp" 97 | 98 | 99 | def test_partitioning_validation_for_datetime_type(): 100 | valid_payload = generate_valid_schema() 101 | valid_payload["tasks"] = [ 102 | { 103 | "partitioning": {"field": "date", "data_type": "datetime"}, 104 | "identifier": "stg_covid19", 105 | "schema": "dop_sandbox_us", 106 | "kind": {"action": "materialization", "target": "table"}, 107 | "dependencies": ["a", "b", "c"], 108 | } 109 | ] 110 | 111 | dag_config = transformation_schema.load_dag_schema(valid_payload) 112 | 113 | assert isinstance( 114 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 115 | ) 116 | assert dag_config.tasks[0].partitioning.data_type == "datetime" 117 | 118 | 119 | def test_partitioning_validation_for_date_type(): 120 | valid_payload = generate_valid_schema() 121 | valid_payload["tasks"] = [ 122 | { 123 | "partitioning": {"field": "date", "data_type": "date"}, 124 | "identifier": "stg_covid19", 125 | "schema": "dop_sandbox_us", 126 | "kind": {"action": "materialization", "target": "table"}, 127 | "dependencies": ["a", "b", "c"], 128 | } 129 | ] 130 | 131 | dag_config = transformation_schema.load_dag_schema(valid_payload) 132 | 133 | assert isinstance( 134 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 135 | ) 136 | assert dag_config.tasks[0].partitioning.data_type == "date" 137 | 138 | 139 | def test_schema_deserialization(): 140 | payload = generate_valid_schema() 141 | 142 | try: 143 | transformation_schema.load_dag_schema(payload) 144 | except transformation_schema.InvalidDagConfig as e: 145 | pytest.fail(f"Should not raise exception InvalidDagConfig, error: {e}") 146 | 147 | dag_config = transformation_schema.load_dag_schema(payload) 148 | assert isinstance(dag_config, transformation_schema.DagConfig) 149 | assert isinstance(dag_config.tasks[0], transformation_schema.Task) 150 | assert isinstance(dag_config.tasks[0].kind, transformation_schema.Kind) 151 | assert isinstance( 152 | dag_config.tasks[0].partitioning, transformation_schema.Partitioning 153 | ) 154 | assert dag_config.tasks[0].dependencies == ["a", "b", "c"] 155 | 156 | 157 | def generate_valid_schema(): 158 | return { 159 | "schedule_interval": "0 1 * * *", 160 | "timezone": "Europe/London", 161 | "params": {"value_a": [1, 2, 3]}, 162 | "database": "sandbox", 163 | "schema": "dop_sandbox_us", 164 | "tasks": [ 165 | { 166 | "partitioning": {"field": "date", "data_type": "date"}, 167 | "identifier": "stg_covid19", 168 | "schema": "dop_sandbox_us", 169 | "kind": {"action": "materialization", "target": "table"}, 170 | "dependencies": ["a", "b", "c"], 171 | } 172 | ], 173 | } 174 | 
-------------------------------------------------------------------------------- /tests/unit_tests/requirements.txt: -------------------------------------------------------------------------------- 1 | # Test libraries 2 | pytest==6.2.4 3 | 4 | # Dependencies extracted from infrastructure/docker/requirements.txt 5 | croniter==0.3.31 6 | marshmallow==2.19.5 7 | --------------------------------------------------------------------------------
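As a closing note, a minimal sketch for running the unit tests locally might look like the commands below, assuming they are executed from the repository root and that these pinned test dependencies are installed into the active Python environment:

```
pip install -r tests/unit_tests/requirements.txt
python -m pytest tests/unit_tests
```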