├── packages.txt ├── .airflowignore ├── dbt ├── seeds │ └── .gitkeep ├── tests │ └── .gitkeep ├── analyses │ └── .gitkeep ├── macros │ ├── .gitkeep │ ├── macro_docs.yml │ ├── abstract_cte.sql │ └── macro_docs.md ├── snapshots │ ├── .gitkeep │ └── fakestore_products_history.sql ├── models │ ├── marts │ │ ├── .gitkeep │ │ ├── top_performing_products.sql │ │ ├── user_portfolio.sql │ │ └── mart_docs.yml │ └── staging │ │ ├── stg_users.sql │ │ └── src_fakestoredata.yml ├── .gitignore ├── README.md └── dbt_project.yml ├── dags ├── .airflowignore ├── utils │ ├── utils.py │ └── ddl_scripts.sql ├── airflow_dbt_dag_2.py ├── example_dag_basic.py ├── airflow_dbt_dag_1.py └── example_dag_advanced.py ├── Dockerfile ├── requirements.txt ├── .astro ├── config.yaml └── test_dag_integrity_default.py ├── .dockerignore ├── img └── workflow.png ├── start.sh ├── .gitignore ├── .env.example ├── docker-compose.override.yml ├── airflow_settings.example.yaml ├── tests └── dags │ └── test_dag_integrity.py ├── include ├── helper_scripts.py └── transformers.py └── README.md /packages.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.airflowignore: -------------------------------------------------------------------------------- 1 | dbt/ -------------------------------------------------------------------------------- /dbt/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/.airflowignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/models/marts/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/astronomer/astro-runtime:9.1.0 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-dbt-cloud==3.2.0 -------------------------------------------------------------------------------- /.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-dbt-magic 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | astro 2 | .git 
3 | .env 4 | airflow_settings.yaml 5 | logs/ 6 | -------------------------------------------------------------------------------- /img/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VICIWUOHA/airflow-dbt-magic/HEAD/img/workflow.png -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | # installl astro cli and start astro 2 | 3 | curl -sSL install.astronomer.io | sudo bash -s 4 | astro dev start -------------------------------------------------------------------------------- /dags/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | dag_owner = os.getenv("DATAFEST_23_USER") 4 | DBT_JOB_SCHEMA = "dbt_" + dag_owner 5 | -------------------------------------------------------------------------------- /dbt/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default .gitignore content added by dbt Cloud 3 | target/ 4 | dbt_packages/ 5 | logs/ 6 | # end dbt Cloud content 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | .DS_Store # macOS specific ignore 4 | airflow_settings.yaml 5 | __pycache__/ 6 | astro 7 | data_lake/* 8 | .vscode 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DATAFEST_23_USER=ANY_USER_NAME_in_lower_case 2 | DATAFEST_23_DB=DB_NAME_OF_CHOICE 3 | ENV_PG_DB_URI=postgresql://DB_USER:DB_PASSWORD@host -------------------------------------------------------------------------------- /docker-compose.override.yml: -------------------------------------------------------------------------------- 1 | version: "3.1" 2 | services: 3 | scheduler: 4 | user: root 5 | volumes: 6 | - ./data_lake:/usr/local/airflow/data_lake:rw -------------------------------------------------------------------------------- /dbt/models/marts/top_performing_products.sql: -------------------------------------------------------------------------------- 1 | -- Top 10 products Model 2 | SELECT 3 | id, 4 | title as product_name, 5 | rating_rate as product_rating 6 | FROM {{ source('fakestoreapi','products')}} 7 | ORDER BY 3 DESC 8 | LIMIT 10 -------------------------------------------------------------------------------- /dbt/macros/macro_docs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: abstract_cte 5 | description: A macro to abstract selection of all columns with a cte 6 | arguments: 7 | - name: tuple_list 8 | type: array 9 | description: '{{ doc("abstract_cte")}}' -------------------------------------------------------------------------------- /dbt/models/staging/stg_users.sql: -------------------------------------------------------------------------------- 1 | -- Staging Model for Users with only appropiate data 2 | 3 | SELECT 4 | id, 5 | INITCAP(firstname) ||' '|| INITCAP(lastname) as full_name, 6 | email, 7 | phone, 8 | address_zipcode 9 | FROM {{ source('fakestoreapi','users')}} -------------------------------------------------------------------------------- /dbt/macros/abstract_cte.sql: 
-------------------------------------------------------------------------------- 1 | {% macro abstract_cte(tuple_list) %} 2 | 3 | WITH{% for cte_ref in tuple_list %} {{cte_ref[0]}} AS ( 4 | 5 | SELECT * 6 | FROM {{ ref(cte_ref[1]) }} 7 | 8 | ) 9 | {%- if not loop.last -%} 10 | , 11 | {%- endif -%} 12 | 13 | {%- endfor -%} 14 | 15 | {%- endmacro %} 16 | -------------------------------------------------------------------------------- /dbt/snapshots/fakestore_products_history.sql: -------------------------------------------------------------------------------- 1 | {% snapshot fakestore_products_history%} 2 | 3 | {{ 4 | config( 5 | target_schema=env_var("DBT_DATAFEST_23_SCHEMA"), 6 | unique_key='id', 7 | strategy='timestamp', 8 | updated_at='updated_at', 9 | invalidate_hard_deletes=True, 10 | ) 11 | }} 12 | 13 | SELECT * FROM {{source('fakestoreapi','products')}} 14 | --This would start tracking the changes on inventory items from the moment this model was created. 15 | {% endsnapshot%} -------------------------------------------------------------------------------- /dbt/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [dbt community](http://community.getbdt.com/) to learn from other analytics engineers 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /airflow_settings.example.yaml: -------------------------------------------------------------------------------- 1 | airflow: 2 | connections: ## conn_id and conn_type are required 3 | - conn_id: postgres_default 4 | conn_type: postgres 5 | conn_host: YourPostgresDBHost eg; abcd123.efg456.hij.com OR localhost OR IP 6 | conn_schema: DB_NAME 7 | conn_login: DB_USERNAME 8 | conn_password: DB_PASSWORD 9 | conn_port: 5432 #or any specific port (the default postgres shipped with airflow uses this port) 10 | - conn_id: dbt_cloud_default 11 | conn_type: dbt_cloud 12 | conn_host: cloud.getdbt.com 13 | conn_login: YOUR_DBT_ACCOUNT_ID 14 | conn_password: YOUR_DBT_API_TOKEN 15 | variables: ## variable_name and variable_value are required 16 | - variable_name: datafest_meetup_job 17 | variable_value: YOUR_DBT_CLOUD_JOB_ID 18 | -------------------------------------------------------------------------------- /dbt/models/staging/src_fakestoredata.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: fakestoreapi 5 | description: Source schema for all fakestoreapi data 6 | database: "{{ env_var('DBT_DATAFEST_23_DB') }}" 7 | schema: "{{ env_var('DBT_DATAFEST_23_SCHEMA') }}" 8 | tables: 9 | - name: carts 10 | description: table containing cart info placed by users at the store 11 | - name: products 12 | description: table containing the products available on the store site 13 | - name: users 14 | description: table that holds user information 15 | tags: ["contains pii"] 16 | columns: 17 | - name: id 18 | description: unique identifier of the user 19 | tests: 20 | - 
unique 21 | - not_null 22 | models: 23 | - name: stg_users 24 | description: Staging Model for Important User Info 25 | -------------------------------------------------------------------------------- /dbt/macros/macro_docs.md: -------------------------------------------------------------------------------- 1 | {% docs abstract_cte %} 2 | 3 | ### abstract_cte 4 | 5 | This macro abstracts the need to write all your **select * ctes** explicitly and saves lines of code to be generated at compile time. 6 | 7 | - #### tuple_list: An array containing one or more tuples of two elements as follows; 8 | 9 | - 1. the name to give the cte 10 | - 2. the name of the model to be referenced. 11 | - #### Example: 12 | 13 | {% raw %} 14 | ``` 15 | {{ abstract_cte([ 16 | ('system_a','system_a_daily_sales'), 17 | ('system_b','system_b_daily_sales') 18 | ]) 19 | }} 20 | would be compiled as ; 21 | 22 | WITH system_a AS ( 23 | SELECT * 24 | FROM {{ref('system_a_daily_sales')}} 25 | ), 26 | system_b AS ( 27 | SELECT * 28 | FROM {{ref('system_b_daily_sales')}} 29 | ), 30 | ``` 31 | {% endraw %} 32 | 33 | - The Rest of the logic can then be implemented downstream. 34 | 35 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'airflow_dbt_magic' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `source-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 
34 | models: 35 | airflow_dbt_magic: 36 | # Applies to all files under models/example/ 37 | marts: 38 | materialized: view 39 | staging: 40 | materialized: view 41 | -------------------------------------------------------------------------------- /dags/airflow_dbt_dag_2.py: -------------------------------------------------------------------------------- 1 | # when dataset is updated in db, this dag should trigger the dbt cloud run via the API 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG, Dataset 5 | from airflow.models import Variable 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator 8 | 9 | from utils.utils import dag_owner, DBT_JOB_SCHEMA 10 | 11 | # define arguments to be used in dag 12 | dbt_job_id: int = Variable.get("datafest_meetup_job") 13 | default_args = { 14 | "owner": dag_owner, 15 | "depends_on_past": False, 16 | "retries": 1, 17 | "retry_delay": timedelta(minutes=2), 18 | } 19 | 20 | with DAG( 21 | dag_id="fakestore_dbt_job_pipeline", 22 | default_args=default_args, 23 | description="Simple Dag to Trigger Dbt Production Job", 24 | start_date=datetime(2023, 6, 11), 25 | schedule=[Dataset("//fakestore_dwh/tables")], 26 | catchup=False, 27 | tags=["airflow-dbt-magic"], 28 | ) as dag: 29 | start = EmptyOperator(task_id="start_pipeline") 30 | 31 | trigger_dbt_cloud_job = DbtCloudRunJobOperator( 32 | task_id="trigger_dbt_cloud_job", 33 | job_id=dbt_job_id, 34 | trigger_reason=f"Triggered From Airflow Dataset Update configured by {dag_owner}", 35 | steps_override=None, 36 | schema_override=DBT_JOB_SCHEMA, 37 | wait_for_termination=True, 38 | timeout=400, 39 | check_interval=60, 40 | additional_run_config=None, 41 | ) 42 | 43 | start >> trigger_dbt_cloud_job 44 | -------------------------------------------------------------------------------- /dbt/models/marts/user_portfolio.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | abstract_cte([ 3 | ('users','stg_users') 4 | ]) 5 | }}, 6 | 7 | 8 | user_cart_summary AS ( 9 | 10 | SELECT 11 | user_id, 12 | COUNT(DISTINCT product_id) AS unique_products_tried_out, 13 | SUM(quantity) AS lifetime_product_volume, 14 | COUNT(DISTINCT date) AS no_of_days_in_store 15 | 16 | FROM {{ source('fakestoreapi','carts')}} 17 | GROUP BY 1 18 | -- HAVING ((COUNT(DISTINCT date)) > 1 OR (COUNT(DISTINCT product_id))>1) -- have visisted the fakestore more than once 19 | 20 | ), 21 | 22 | 23 | user_top_products AS ( 24 | 25 | SELECT 26 | a.user_id, 27 | c.full_name, 28 | c.email, 29 | b.title as top_product, 30 | a.quantity as top_product_quantity 31 | FROM( 32 | SELECT 33 | user_id, 34 | product_id, 35 | quantity, 36 | ROW_NUMBER() OVER (PARTITION BY user_id ORDER by quantity DESC ) as rn 37 | FROM {{ source('fakestoreapi','carts')}} 38 | ) AS a 39 | LEFT JOIN {{ source('fakestoreapi','products')}} AS b 40 | ON a.product_id = b.id 41 | LEFT JOIN {{ ref('stg_users')}} c 42 | ON a.user_id = c.id 43 | WHERE a.rn = 1 44 | ) 45 | 46 | 47 | SELECT 48 | a.user_id, 49 | a.full_name, 50 | a.email, 51 | b.lifetime_product_volume, 52 | b.unique_products_tried_out, 53 | b.no_of_days_in_store, 54 | a.top_product, 55 | a.top_product_quantity 56 | FROM user_top_products a 57 | LEFT JOIN user_cart_summary b 58 | ON a.user_id = b.user_id -------------------------------------------------------------------------------- /dags/utils/ddl_scripts.sql: 
-------------------------------------------------------------------------------- 1 | --Syntax: PostgreSql 2 | -- Execute against Data Warehouse 3 | -- {{params.schema}} would be passed at runtime 4 | -- Drop Statements can be commented out if not needed.(but subsequent runs would fail on pkey constraints) 5 | 6 | CREATE SCHEMA IF NOT EXISTS {{params.schema}}; 7 | DROP TABLE IF EXISTS {{params.schema}}.products CASCADE; 8 | CREATE TABLE IF NOT EXISTS {{params.schema}}.products( 9 | id numeric, 10 | title varchar, 11 | price numeric, 12 | description varchar, 13 | category varchar, 14 | image varchar, 15 | rating_rate decimal, 16 | rating_count numeric, 17 | updated_at timestamp, 18 | _datafest_meetup_user varchar, 19 | uuid varchar primary key 20 | 21 | ); 22 | 23 | DROP TABLE IF EXISTS {{params.schema}}.users CASCADE; 24 | CREATE TABLE IF NOT EXISTS {{params.schema}}.users( 25 | id numeric, 26 | email varchar, 27 | username varchar, 28 | phone varchar, 29 | address_geolocation_lat numeric, 30 | address_geolocation_long numeric, 31 | address_city varchar, 32 | address_street varchar, 33 | address_number numeric, 34 | address_zipcode varchar, 35 | firstname varchar, 36 | lastname varchar, 37 | updated_at timestamp, 38 | _datafest_meetup_user varchar, 39 | uuid varchar primary key 40 | 41 | ); 42 | 43 | DROP TABLE IF EXISTS {{params.schema}}.carts CASCADE; 44 | CREATE TABLE IF NOT EXISTS {{params.schema}}.carts( 45 | cart_id numeric, 46 | id varchar, 47 | date date, 48 | user_id numeric, 49 | product_id numeric, 50 | quantity numeric, 51 | _datafest_meetup_user varchar, 52 | uuid varchar primary key 53 | 54 | ); -------------------------------------------------------------------------------- /dbt/models/marts/mart_docs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: top_performing_products 5 | description: Shows 10 most rated products on the Fakestore API 6 | columns: 7 | - name: id 8 | description: Product Unique Identifier 9 | tests: 10 | - not_null 11 | - name: product_name 12 | description: The Name of the product as it appears on the title bar of the fakestore 13 | - name: product_rating 14 | description: The rating of this product as starred by customers 15 | 16 | - name: user_portfolio 17 | description: Table showing a Users Portfolio based on purchase history from the fakestore. 18 | columns: 19 | - name: user_id 20 | description: Unique id of a user. 21 | - name: full_name 22 | description: User's First and Last names. 23 | - name: email 24 | description: Email of the User 25 | tests: 26 | - unique 27 | - name: lifetime_product_volume, 28 | description: Total quantity of Products a user has bought over time 29 | - name: unique_products_tried_out 30 | description: The Unique number of products the customer has tried out irrespective of their volume 31 | - name: no_of_days_in_store, 32 | description: Number of days on which the customer has visited the fake store 33 | - name: top_product 34 | description: Most purchased product by this customer based on 35 | - name: top_product_quantity 36 | description: The Volume of the top product that the customer has bought 37 | 38 | exposures: 39 | - name: customer_portfolio_app 40 | description: Application Containing Summary Matrica of a Customer's value in the FakeStore business. 
41 | type: application 42 | depends_on: 43 | - ref('user_portfolio') 44 | owner: 45 | email: dbtlagosmeetup@email.com -------------------------------------------------------------------------------- /tests/dags/test_dag_integrity.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. This test ensures that all Dags have tags, retries set to two, and no import errors. Feel free to add and remove tests.""" 2 | 3 | import os 4 | import logging 5 | from contextlib import contextmanager 6 | import pytest 7 | from airflow.models import DagBag 8 | 9 | 10 | @contextmanager 11 | def suppress_logging(namespace): 12 | logger = logging.getLogger(namespace) 13 | old_value = logger.disabled 14 | logger.disabled = True 15 | try: 16 | yield 17 | finally: 18 | logger.disabled = old_value 19 | 20 | 21 | def get_import_errors(): 22 | """ 23 | Generate a tuple for import errors in the dag bag 24 | """ 25 | with suppress_logging("airflow"): 26 | dag_bag = DagBag(include_examples=False) 27 | 28 | def strip_path_prefix(path): 29 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 30 | 31 | # we prepend "(None,None)" to ensure that a test object is always created even if its a no op. 32 | return [(None, None)] + [ 33 | (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items() 34 | ] 35 | 36 | 37 | def get_dags(): 38 | """ 39 | Generate a tuple of dag_id, in the DagBag 40 | """ 41 | with suppress_logging("airflow"): 42 | dag_bag = DagBag(include_examples=False) 43 | 44 | def strip_path_prefix(path): 45 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 46 | 47 | return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()] 48 | 49 | 50 | @pytest.mark.parametrize( 51 | "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 52 | ) 53 | def test_file_imports(rel_path, rv): 54 | """Test for import errors on a file""" 55 | if rel_path and rv: 56 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 57 | 58 | 59 | APPROVED_TAGS = {} 60 | 61 | 62 | @pytest.mark.parametrize( 63 | "dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()] 64 | ) 65 | def test_dag_tags(dag_id, dag, fileloc): 66 | """ 67 | test if a DAG is tagged and if those TAGs are in the approved list 68 | """ 69 | assert dag.tags, f"{dag_id} in {fileloc} has no tags" 70 | if APPROVED_TAGS: 71 | assert not set(dag.tags) - APPROVED_TAGS 72 | 73 | 74 | @pytest.mark.parametrize( 75 | "dag_id,dag, fileloc", get_dags(), ids=[x[2] for x in get_dags()] 76 | ) 77 | def test_dag_retries(dag_id, dag, fileloc): 78 | """ 79 | test if a DAG has retries set 80 | """ 81 | assert ( 82 | dag.default_args.get("retries", None) >= 2 83 | ), f"{dag_id} in {fileloc} does not have retries not set to 2." 84 | -------------------------------------------------------------------------------- /dags/example_dag_basic.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | 4 | from airflow.decorators import ( 5 | dag, 6 | task, 7 | ) # DAG and task decorators for interfacing with the TaskFlow API 8 | 9 | 10 | @dag( 11 | # This defines how often your DAG will run, or the schedule by which your DAG runs. In this case, this DAG 12 | # will run daily 13 | schedule_interval="@daily", 14 | # This DAG is set to run for the first time on January 1, 2021. Best practice is to use a static 15 | # start_date. 
Subsequent DAG runs are instantiated based on scheduler_interval 16 | start_date=datetime(2021, 1, 1), 17 | # When catchup=False, your DAG will only run for the latest schedule_interval. In this case, this means 18 | # that tasks will not be run between January 1, 2021 and 30 mins ago. When turned on, this DAG's first 19 | # run will be for the next 30 mins, per the schedule_interval 20 | catchup=False, 21 | default_args={ 22 | "retries": 2, # If a task fails, it will retry 2 times. 23 | }, 24 | tags=["example"], 25 | ) # If set, this tag is shown in the DAG view of the Airflow UI 26 | def example_dag_basic(): 27 | """ 28 | ### Basic ETL Dag 29 | This is a simple ETL data pipeline example that demonstrates the use of 30 | the TaskFlow API using three simple tasks for extract, transform, and load. 31 | For more information on Airflow's TaskFlow API, reference documentation here: 32 | https://airflow.apache.org/docs/apache-airflow/stable/tutorial_taskflow_api.html 33 | """ 34 | 35 | @task() 36 | def extract(): 37 | """ 38 | #### Extract task 39 | A simple "extract" task to get data ready for the rest of the 40 | pipeline. In this case, getting data is simulated by reading from a 41 | hardcoded JSON string. 42 | """ 43 | data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}' 44 | 45 | order_data_dict = json.loads(data_string) 46 | return order_data_dict 47 | 48 | @task( 49 | multiple_outputs=True 50 | ) # multiple_outputs=True unrolls dictionaries into separate XCom values 51 | def transform(order_data_dict: dict): 52 | """ 53 | #### Transform task 54 | A simple "transform" task which takes in the collection of order data and 55 | computes the total order value. 56 | """ 57 | total_order_value = 0 58 | 59 | for value in order_data_dict.values(): 60 | total_order_value += value 61 | 62 | return {"total_order_value": total_order_value} 63 | 64 | @task() 65 | def load(total_order_value: float): 66 | """ 67 | #### Load task 68 | A simple "load" task that takes in the result of the "transform" task and prints it out, 69 | instead of saving it to end user review 70 | """ 71 | 72 | print(f"Total order value is: {total_order_value:.2f}") 73 | 74 | order_data = extract() 75 | order_summary = transform(order_data) 76 | load(order_summary["total_order_value"]) 77 | 78 | 79 | example_dag_basic = example_dag_basic() 80 | -------------------------------------------------------------------------------- /include/helper_scripts.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import os 4 | from typing import List, Literal 5 | from time import sleep 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def transform_col_names(dataset: pd.DataFrame) -> List[str]: 10 | """Transformer function to change CamelCases to sql friendly snake_case and handle irregularities in field names. 11 | Parameters 12 | ---------- 13 | dataset : pd.DataFrame 14 | DataFrame for which a column name transformation is needed 15 | 16 | Returns 17 | ------- 18 | List[str] 19 | List of transformed column names to be mapped to the original dataframe. 
20 | """ 21 | 22 | old_col_list = dataset.columns.to_list() 23 | new_col_list = [] 24 | for col in old_col_list: 25 | # change from CamelCase to snake_case 26 | col_name = re.sub("([a-z])([A-Z0-9])", r"\1_\2", col) 27 | # replace dots with underscores 28 | new_name = col_name.replace("name.", "").replace(".", "_") 29 | new_col_list.append(new_name.lower()) 30 | 31 | return new_col_list 32 | 33 | 34 | def load_to_db( 35 | table_name: str, 36 | dataset: pd.DataFrame, 37 | db_name: str = os.getenv("DATAFEST_23_DB"), 38 | schema: str = "dbt_" + str(os.getenv("DATAFEST_23_USER")), 39 | if_exists="append", 40 | ) -> Literal[True]: 41 | """Connects to a Database and Loads a supplied Dataframe to a specific schema and table in that Database 42 | 43 | Parameters 44 | ---------- 45 | db_name : str 46 | The database to connect to 47 | table_name : str 48 | The table to be loaded with data 49 | dataset : pd.DataFrame 50 | data to be loaded to db table 51 | schema : str, optional 52 | schema of the database to be operated upon ->default: dev 53 | if_exists : str, optional 54 | Logic to apply if the table already exists -> default: appends to table. "append" 55 | 56 | Returns 57 | ------- 58 | boolean|None 59 | True if load action was successful otherwise Nothing is returned. 60 | 61 | Raises 62 | ------ 63 | Exception 64 | Any SqlAlchemy Engine Connection Error encountered. 65 | Exception 66 | Any Exception after connection that occurs during database load operation. 67 | """ 68 | print("=> Connecting to Database......") 69 | try: 70 | DEV_ENV = os.getenv("ENV_PG_DB_URI") 71 | engine = create_engine(f"{DEV_ENV}/{db_name}") 72 | conx = engine.connect() 73 | message = "=>Successfully Established Connection to Database" 74 | print(message) 75 | except Exception as e: 76 | error_log = "Pipeline Broken,Connection to Database Failed : {}".format(e) 77 | print(error_log) 78 | raise e 79 | print("=> Writing Data to Database.....") 80 | sleep(2) 81 | try: 82 | dataset.to_sql( 83 | f"{table_name}", con=conx, schema=schema, if_exists=if_exists, index=False 84 | ) 85 | message = ( 86 | "***==> Successfully Written `{}` Rows of Data to db: `{}.{}.{}` .".format( 87 | len(dataset), db_name, schema, table_name 88 | ) 89 | ) 90 | conx.close() 91 | print(message) 92 | return True 93 | except Exception as e: 94 | error_log = "Pipeline Broken,Failed to Write data to Database : {}".format(e) 95 | print(error_log) 96 | raise e 97 | -------------------------------------------------------------------------------- /.astro/test_dag_integrity_default.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. 
**USED BY DEV PARSE COMMAND DO NOT EDIT**""" 2 | from contextlib import contextmanager 3 | import logging 4 | import os 5 | 6 | import pytest 7 | 8 | from airflow.models import DagBag, Variable, Connection 9 | from airflow.hooks.base import BaseHook 10 | 11 | 12 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables 13 | 14 | # =========== MONKEYPATCH BaseHook.get_connection() =========== 15 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs): 16 | print( 17 | f"Attempted to fetch connection during parse returning an empty Connection object for {key}" 18 | ) 19 | return Connection(key) 20 | 21 | 22 | BaseHook.get_connection = basehook_get_connection_monkeypatch 23 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() =========== 24 | 25 | # =========== MONKEYPATCH OS.GETENV() =========== 26 | def os_getenv_monkeypatch(key: str, *args, default=None, **kwargs): 27 | print( 28 | f"Attempted to fetch os environment variable during parse, returning a mocked value for {key}" 29 | ) 30 | if ( 31 | key == "JENKINS_HOME" and default is None 32 | ): # fix https://github.com/astronomer/astro-cli/issues/601 33 | return None 34 | if default: 35 | return default 36 | return "NON_DEFAULT_OS_ENV_VALUE" 37 | 38 | 39 | os.getenv = os_getenv_monkeypatch 40 | # # =========== /MONKEYPATCH OS.GETENV() =========== 41 | 42 | # =========== MONKEYPATCH VARIABLE.GET() =========== 43 | 44 | 45 | class magic_dict(dict): 46 | def __init__(self, *args, **kwargs): 47 | self.update(*args, **kwargs) 48 | 49 | def __getitem__(self, key): 50 | return {}.get(key, "MOCKED_KEY_VALUE") 51 | 52 | 53 | def variable_get_monkeypatch(key: str, default_var=None, deserialize_json=False): 54 | print( 55 | f"Attempted to get Variable value during parse, returning a mocked value for {key}" 56 | ) 57 | 58 | if default_var: 59 | return default_var 60 | if deserialize_json: 61 | return magic_dict() 62 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE" 63 | 64 | 65 | Variable.get = variable_get_monkeypatch 66 | # # =========== /MONKEYPATCH VARIABLE.GET() =========== 67 | 68 | 69 | @contextmanager 70 | def suppress_logging(namespace): 71 | """ 72 | Suppress logging within a specific namespace to keep tests "clean" during build 73 | """ 74 | logger = logging.getLogger(namespace) 75 | old_value = logger.disabled 76 | logger.disabled = True 77 | try: 78 | yield 79 | finally: 80 | logger.disabled = old_value 81 | 82 | 83 | def get_import_errors(): 84 | """ 85 | Generate a tuple for import errors in the dag bag 86 | """ 87 | with suppress_logging("airflow"): 88 | dag_bag = DagBag(include_examples=False) 89 | 90 | def strip_path_prefix(path): 91 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 92 | 93 | # we prepend "(None,None)" to ensure that a test object is always created even if its a no op. 
94 | return [(None, None)] + [ 95 | (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items() 96 | ] 97 | 98 | 99 | @pytest.mark.parametrize( 100 | "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 101 | ) 102 | def test_file_imports(rel_path, rv): 103 | """Test for import errors on a file""" 104 | if rel_path and rv: # Make sure our no op test doesn't raise an error 105 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 106 | -------------------------------------------------------------------------------- /dags/airflow_dbt_dag_1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import Dataset 3 | from airflow.decorators import ( 4 | dag, 5 | task, 6 | ) 7 | 8 | # import astro.sql as aql 9 | # from astro.sql.table import Table 10 | from airflow.operators.empty import EmptyOperator 11 | from airflow.providers.postgres.operators.postgres import PostgresOperator 12 | from include.transformers import FakeStoreApiTransformer, FAKE_STORE_ARTIFACTS 13 | from include.helper_scripts import load_to_db 14 | from utils.utils import dag_owner, DBT_JOB_SCHEMA 15 | 16 | dag_docs = """ This DAG gets and transforms fictitious retail data from http://fakestoreapi.com into a data warehouse. 17 | It also implements airflow concepts such as; 18 | 19 | - `task.expand()` for dynamic mapping 20 | - airflow `Datasets` for Data aware scheduling used to trigger a DBT DAG run in fakestore_dbt_dag 21 | 22 | """ 23 | 24 | 25 | @dag( 26 | dag_id="fakestore_elt_pipeline", 27 | start_date=datetime(2023, 6, 11), 28 | # This defines how many instantiations of this DAG (DAG Runs) can execute concurrently. In this case, 29 | # we're only allowing 1 DAG run at any given time, as opposed to allowing multiple overlapping DAG runs. 30 | max_active_runs=1, 31 | schedule_interval="@daily", 32 | # Default settings applied to all tasks within the DAG; can be overwritten at the task level. 33 | default_args={ 34 | "owner": f"{dag_owner}", # This defines the value of the "owner" column in the DAG view of the Airflow UI 35 | "retries": 1, # If a task fails, it will retry 2 times. 36 | "retry_delay": timedelta( 37 | seconds=30 38 | ), # A task that fails will wait 30 seconds to retry. 
39 | }, 40 | catchup=False, 41 | tags=["airflow-dbt-magic"], 42 | doc_md=dag_docs, 43 | ) 44 | def data_elt_process(): 45 | create_destination_schema_and_tables = PostgresOperator( 46 | task_id="create_destination_tables_if_not_exists", 47 | postgres_conn_id="postgres_default", 48 | sql="utils/ddl_scripts.sql", 49 | params={"schema": DBT_JOB_SCHEMA}, 50 | ) 51 | 52 | transformer = FakeStoreApiTransformer() 53 | 54 | @task(max_active_tis_per_dag=1) 55 | def get_and_load_data(artifact: str, ti=None): 56 | file_path = transformer.get_fakestore_data(artifact) 57 | ti.xcom_push(key="uploaded_file_paths", value=file_path) 58 | 59 | upload_files = get_and_load_data.expand(artifact=FAKE_STORE_ARTIFACTS) 60 | 61 | transform_tasks = [] 62 | # run transform tasks in parallel 63 | for artifact in FAKE_STORE_ARTIFACTS: 64 | 65 | @task(task_id=f"transform_and_load_{artifact}") 66 | def transform_and_load(artifact: str, ti=None): 67 | print("=> Transforming data.........") 68 | 69 | file_paths = list(ti.xcom_pull(key="uploaded_file_paths")) 70 | file_to_transform = [file for file in list(file_paths) if artifact in file][ 71 | 0 72 | ] 73 | transformed_data = transformer.transform_fakestore_data( 74 | artifact=f"{artifact}", json_file_path=file_to_transform 75 | ) 76 | # load to db 77 | load = load_to_db(table_name=artifact, dataset=transformed_data) 78 | if load: 79 | return load 80 | 81 | transform_and_load_task = transform_and_load(artifact=artifact) 82 | transform_tasks.append(transform_and_load_task) 83 | 84 | end_pipeline = EmptyOperator( 85 | task_id="end_pipeline", 86 | trigger_rule="none_failed", 87 | outlets=[Dataset("//fakestore_dwh/tables")], 88 | ) 89 | 90 | # enforce dependencies 91 | ( 92 | create_destination_schema_and_tables 93 | >> upload_files 94 | >> transform_tasks 95 | >> end_pipeline 96 | ) 97 | 98 | 99 | dag_run = data_elt_process() 100 | -------------------------------------------------------------------------------- /include/transformers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | import json 4 | import os 5 | from hashlib import md5 6 | from datetime import datetime, timedelta 7 | from pathlib import Path 8 | from include.helper_scripts import transform_col_names 9 | 10 | FAKE_STORE_ARTIFACTS = ["users", "products", "carts"] 11 | pipeline_user = os.getenv("DATAFEST_23_USER") 12 | 13 | 14 | class FakeStoreApiTransformer: 15 | """Transfromer Class for manipulating data from the FakeStoreApi at https://fakestoreapi.com/ 16 | to deploy into a test datawarehouse. (This is not an exhaustive Class for all endpoints but PR's are welcome.) 17 | """ 18 | 19 | def __init__(self) -> None: 20 | self.base_url = "https://fakestoreapi.com/" 21 | 22 | def get_fakestore_data(self, artifact: str, query_params: str = "") -> str: 23 | """Makes a call to https://fakestoreapi.com/ for a specified artifact/entity and simply writes to file storage 24 | 25 | Parameters 26 | ---------- 27 | artifact : str 28 | The api endpoint/ db entity for which data should be gotten for. 29 | 30 | Returns 31 | ------- 32 | str 33 | File Path where extracted Json data was written to. 34 | 35 | Raises 36 | ------ 37 | Exception 38 | Any Exception encountered during REST API call or while writing to storage.. 
39 | """ 40 | try: 41 | # make API call once and get json response 42 | with requests.Session() as s: 43 | print(f"=> Now Getting data for {artifact}") 44 | artifact_data = s.get( 45 | self.base_url + f"{artifact}" + f"{query_params}" 46 | ).json() 47 | # write raw json to file storage: 48 | time_info = datetime.strftime( 49 | datetime.now() + timedelta(hours=1), "%Y_%m_%d_%H%M%S" 50 | ) 51 | file_path = f"data_lake/{artifact}/" 52 | file_name = f"{artifact}_{time_info}.json" 53 | # check if file path exists and create if needed 54 | path_exists = os.path.exists(file_path) 55 | if not path_exists: 56 | f_path = Path(file_path) 57 | f_path.mkdir(parents=True) 58 | 59 | with open(file_path + file_name, "w", encoding="utf-8") as file: 60 | raw_data = json.dumps({f"{artifact}": artifact_data}, indent=4) 61 | file.write(raw_data) 62 | 63 | print( 64 | f"=> ``{artifact}`` data written successfully to ``{file_path+file_name}``." 65 | ) 66 | return file_path + file_name 67 | 68 | except Exception as e: 69 | print("** Error while Calling API or writing to Data Lake.") 70 | raise e 71 | 72 | def transform_fakestore_data( 73 | self, artifact: str, json_file_path: str 74 | ) -> pd.DataFrame: 75 | """This Method Transforms defined artifacts from the FakeStoreApi as at 2023-06-15 , It supports 76 | (Users, Products & Carts) but can be extended. 77 | 78 | Parameters 79 | ---------- 80 | artifact : str 81 | The defined artifact on of (Users, Products & Carts) 82 | json_file_path : str 83 | json file path on file storage which can be accessed from the airflow. 84 | Returns 85 | ------- 86 | pd.DataFrame 87 | Normalized dataframe containing returned data from the Api in tabular format. 88 | """ 89 | 90 | with open(json_file_path) as file: 91 | data = json.load(file)[artifact] 92 | print("=> Normalizing Dataset..") 93 | if artifact != "carts": 94 | artifact_data_trans = pd.json_normalize(data) 95 | artifact_data_trans["updated_at"] = datetime.strftime( 96 | datetime.now() + timedelta(hours=1), "%Y-%m-%d %H:%M:%S" 97 | ) 98 | # artifact_data_trans 99 | else: 100 | artifact_data_trans = pd.json_normalize( 101 | data, record_path=["products"], meta=["id", "userId", "date"] 102 | ) 103 | artifact_data_trans = artifact_data_trans.rename(columns={"id": "cart_id"}) 104 | # create hash for item in cart purchased by user on a given date 105 | artifact_data_trans["id"] = ( 106 | artifact_data_trans["date"] 107 | + artifact_data_trans["userId"].astype(str) 108 | + artifact_data_trans["productId"].astype(str) 109 | ).apply(lambda val: md5(val.encode()).hexdigest()) 110 | artifact_data_trans = artifact_data_trans.reindex( 111 | columns=["cart_id", "id", "date", "userId", "productId", "quantity"] 112 | ) 113 | 114 | # standardize column names 115 | artifact_data_trans.columns = transform_col_names(artifact_data_trans) 116 | # validate that users have added appropiate env vars during live demo and add their id's to all tables before writes 117 | print("=> Applying Identifiers..") 118 | assert ( 119 | pipeline_user is not None 120 | ), "You must Include an env variable named DATAFEST_23_USER" 121 | artifact_data_trans["_datafest_meetup_user"] = pipeline_user 122 | artifact_data_trans["uuid"] = ( 123 | artifact_data_trans["id"].astype(str) 124 | + artifact_data_trans["_datafest_meetup_user"] 125 | ).apply(lambda val: md5(val.encode()).hexdigest()) 126 | artifact_data_clean = artifact_data_trans.drop( 127 | columns=["__v", "password"], errors="ignore" 128 | ) 129 | return artifact_data_clean 130 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AIRFLOW & DBT CLOUD INTEGRATION PROJECT FOR DATA & ANALYTICS ENGINEERS 2 | ======== 3 | 4 | *Author: [Victor Iwuoha](https://linkedin.com/in/viciwuoha)* 5 | 6 | *Date: 13th & 14th October 2023* 7 | 8 | *Event: DataFest Africa Workshop* 9 | 10 |   11 | Project Contents 12 | ================ 13 | This Project is built using the astro cli provisioned by [Astronomer](https://docs.astronomer.io/) 14 | To Run this project a linux environment is highly recommended. 15 | 16 | 17 | Workflow 18 | ================ 19 | ![workflow diagram](img/workflow.png) 20 | 21 | ### Key Takeaways: 22 | - Airflow Core Concepts such as (Operators, taskflow API, dynamic task mapping, Data-Aware Scheduling, Variables & Connections ) 23 | - DBT UI Familiarization Up to Job Creation with basic concepts of Macros, Documentation, Snapshots for SCD's, and Exposures. 24 | 25 | ### Prerequisites: 26 | 27 | - Linux Environment/ github codespaces/Ubuntu distribution on Windows 28 | - Docker Compose 29 | - A DBT Cloud Account (With an API Key) 30 | - A .env file at the root of this directory with environment variables exactly as those in .env.example but with actual values. (Do this edit In Codespaces / with a **machine of 4 Cores, 8GB RAM & 32GB Storage**) 31 | - An accessible Postgres database with a valid connection URL. (Spin Up a free one on [ElephantSql.com](https://elephantsql.com)). _In the Url, replace *postgres with **postgresql**_ 32 | - Basic Understanding of Python & SQL 33 | 34 | Deployment & Execution 35 | ====================== 36 | 37 | ### Steps for deployment: 38 | 39 | - Fork This Project to your git profile, create a branch named dev, then connect the repository to your dbt account. 40 | - Give DBT adequate access to connect to this repository on your git provider (github/gitlab) -> [see steps](https://docs.getdbt.com/docs/cloud/git/connect-github) 41 | - **Create a dbt project** with the name airflow_dbt_magic or any name of choice and point it to the dbt subdirectory of this repository. 42 | - **Create two DBT environment Variables** as follows; 43 | - Key: DBT_DATAFEST_23_DB Value: As used above within airflow .env 44 | - Key: DBT_DATAFEST_23_SCHEMA, Value: dbt_DATAFEST_23_USER (where DATAFEST_23_USER has the same value as used in .env above). This can basically be aby schema or database. 45 | - Create a Production environment and link it to the main branch, then create a simple DBT JOB in the Production Environment called AIRFLOW DBT JOB and add the commands (`dbt build` & `dbt snapshot`) Also select the generate docs on run checkbox. Note the **Job Id** as well as the **Account id** as they would be needed in Airflow. 46 | 47 | 48 | ### Execution: 49 | 50 | 1. Configuration, Connections & Airflow Variables setup 51 | - a. After adding the environment variables in [**Prerequisites** above](#prerequisites) to your .env file, (optionally) rename the `airflow_settings.example.yaml` file as `airflow_settings.yaml` and supply the adequate values, doing so would let astro automatically load these to your airflow instance (_otherwise, follow step 2 below_). 52 | - b. Run the start.sh script using the command `bash start.sh` This should start your project, export all environment variables and create a **data_lake/** dir. To restart your airflow container after any environment/config changes, simply run the command `astro dev restart`. 
53 | 2. Create 2 airflow Connections and one Airflow Variable by using the airflow UI via Admin>Variables 54 | - a. **DBT Cloud connection with the following;** 55 | - Connection Id: dbt_cloud_default 56 | - Account Id: YOUR_DBT_ACCOUNT_ID 57 | - Api Token: YOUR_DBT_API_TOKEN 58 |   59 | 60 | - b. **Postgres DB Connection as follows;** 61 | - Connection Id: postgres_default 62 | - Host: rajje.db.elephantsql.com (same as supplied in .env) or any other hosting platform including localhost. 63 | - Schema: As supplied during meetup or any other database on your host 64 | - Login: User name for schema 65 | - Password: Password of User to DB 66 | - Port: 5432 67 |   68 | 69 | - c. **DBT JOB ID Variable as follows;** 70 | - Key: datafest_meetup_job 71 | - Value: YOUR_CREATED_DBT_JOB_ID 72 | - Description: DATAFEST meetup Job ID 73 | 74 | 3. Turn on the two **fakestore_** dags and Trigger the Dag Named _**fakestore_elt_pipeline**_. If this Runs SuccessFully , the _**fakestore_dbt_job_pipeline**_ would automagically get triggered based on the dataset schedule. See more on [Airflow Datasets](https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/datasets.html). 75 | 76 | 77 | 4. Wait for the dbt dag to complete running and navigate to the dbt cloud UI to see that the dag was triggered via the API. For more notes on the operation of this dag, see [DbtCloudOperator](https://airflow.apache.org/docs/apache-airflow-providers-dbt-cloud/stable/operators.html). In More complex Setups, there are packages that can be used with dbt core to convert your entire dbt project into airflow tasks for easier management. An example is [Astronomer Cosmos](https://github.com/astronomer/astronomer-cosmos). 78 | 79 | Credits & Resources: 80 | =========================== 81 | 82 | The Structure of this project was adapted from the astronomer provided astro cli and created using astro dev init 83 | Docs are available at the following Links 84 | 85 | - [Apache Airflow](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/index.html) 86 | - [Astronomer](https://docs.astronomer.io/) 87 | - [DBT Cloud](https://docs.getdbt.com/) and [DBT-Cloud-Airflow Example](https://docs.getdbt.com/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud) 88 | 89 | LEARN AIRFLOW 90 | ================ 91 | - [Astronomer Academy](https://academy.astronomer.io/) 92 | 93 | LEARN DBT 94 | ============ 95 | - [DBT Learn Official Website](https://courses.getdbt.com/collections) 96 | - [DBT Crash Course - Radovan Bacovic](https://gitlab.com/rbacovic/dbt_tutorial/) 97 | 98 | 99 | The compilation of this project was inspired with ❤️ by the **dbt-lagos-community** 📦 . 100 | 101 | 102 | =========================== -------------------------------------------------------------------------------- /dags/example_dag_advanced.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from typing import Dict 3 | 4 | # Airflow operators are templates for tasks and encompass the logic that your DAG will actually execute. 5 | # To use an operator in your DAG, you first have to import it. 6 | # To learn more about operators, see: https://registry.astronomer.io/. 7 | 8 | from airflow.decorators import ( 9 | dag, 10 | task, 11 | ) # DAG and task decorators for interfacing with the TaskFlow API 12 | from airflow.models.baseoperator import ( 13 | chain, 14 | ) # A function that sets sequential dependencies between tasks including lists of tasks. 
15 | from airflow.operators.bash import BashOperator 16 | from airflow.operators.dummy import DummyOperator 17 | from airflow.operators.email import EmailOperator 18 | from airflow.operators.python import BranchPythonOperator 19 | from airflow.operators.weekday import BranchDayOfWeekOperator 20 | from airflow.utils.edgemodifier import ( 21 | Label, 22 | ) # Used to label node edges in the Airflow UI 23 | from airflow.utils.task_group import ( 24 | TaskGroup, 25 | ) # Used to group tasks together in the Graph view of the Airflow UI 26 | from airflow.utils.trigger_rule import ( 27 | TriggerRule, 28 | ) # Used to change how an Operator is triggered 29 | from airflow.utils.weekday import ( 30 | WeekDay, 31 | ) # Used to determine what day of the week it is 32 | 33 | 34 | """ 35 | This DAG is intended to demonstrate a number of core Apache Airflow concepts that are central to the pipeline 36 | authoring experience, including the TaskFlow API, Edge Labels, Jinja templating, branching, 37 | dynamic task generation, Task Groups, and Trigger Rules. 38 | 39 | First, this DAG checks if the current day is a weekday or weekend. Next, the DAG checks which day of the week 40 | it is. Lastly, the DAG prints out a bash statement based on which day it is. On Tuesday, for example, the DAG 41 | prints "It's Tuesday and I'm busy with studying". 42 | 43 | This DAG uses the following operators: 44 | 45 | BashOperator - 46 | Executes a bash script or bash command. 47 | 48 | See more info about this operator here: 49 | https://registry.astronomer.io/providers/apache-airflow/modules/bashoperator 50 | 51 | DummyOperator - 52 | Does nothing but can be used to group tasks in a DAG 53 | 54 | See more info about this operator here: 55 | https://registry.astronomer.io/providers/apache-airflow/modules/dummyoperator 56 | 57 | EmailOperator - 58 | Used to send emails 59 | 60 | See more info about this operator here: 61 | https://registry.astronomer.io/providers/apache-airflow/modules/emailoperator 62 | 63 | BranchPythonOperator - 64 | Allows a workflow to “branch” after a task based on the result of a Python function 65 | 66 | See more info about this operator here: 67 | https://registry.astronomer.io/providers/apache-airflow/modules/branchpythonoperator 68 | 69 | BranchDayOfWeekOperator - 70 | Branches into one of two lists of tasks depending on the current day 71 | 72 | See more info about this operator here: 73 | https://registry.astronomer.io/providers/apache-airflow/modules/branchdayofweekoperator 74 | """ 75 | 76 | # Reference data that defines "weekday" as well as the activity assigned to each day of the week. 77 | DAY_ACTIVITY_MAPPING = { 78 | "monday": {"is_weekday": True, "activity": "guitar lessons"}, 79 | "tuesday": {"is_weekday": True, "activity": "studying"}, 80 | "wednesday": {"is_weekday": True, "activity": "soccer practice"}, 81 | "thursday": {"is_weekday": True, "activity": "contributing to Airflow"}, 82 | "friday": {"is_weekday": True, "activity": "family dinner"}, 83 | "saturday": {"is_weekday": False, "activity": "going to the beach"}, 84 | "sunday": {"is_weekday": False, "activity": "sleeping in"}, 85 | } 86 | 87 | 88 | @task( 89 | multiple_outputs=True 90 | ) # multiple_outputs=True unrolls dictionaries into separate XCom values 91 | def _going_to_the_beach() -> Dict: 92 | return { 93 | "subject": "Beach day!", 94 | "body": "It's Saturday and I'm heading to the beach.
<br><br>Come join me!<br>
", 95 | } 96 | 97 | 98 | # This functions gets the activity from the "DAY_ACTIVITY_MAPPING" dictionary 99 | def _get_activity(day_name) -> str: 100 | activity_id = DAY_ACTIVITY_MAPPING[day_name]["activity"].replace(" ", "_") 101 | 102 | if DAY_ACTIVITY_MAPPING[day_name]["is_weekday"]: 103 | return f"weekday_activities.{activity_id}" 104 | 105 | return f"weekend_activities.{activity_id}" 106 | 107 | 108 | # When using the DAG decorator, the "dag" argument doesn't need to be specified for each task. 109 | # The "dag_id" value defaults to the name of the function it is decorating if not explicitly set. 110 | # In this example, the "dag_id" value would be "example_dag_advanced". 111 | @dag( 112 | # This DAG is set to run for the first time on June 11, 2021. Best practice is to use a static start_date. 113 | # Subsequent DAG runs are instantiated based on scheduler_interval below. 114 | start_date=datetime(2021, 6, 11), 115 | # This defines how many instantiations of this DAG (DAG Runs) can execute concurrently. In this case, 116 | # we're only allowing 1 DAG run at any given time, as opposed to allowing multiple overlapping DAG runs. 117 | max_active_runs=1, 118 | # This defines how often your DAG will run, or the schedule by which DAG runs are created. It can be 119 | # defined as a cron expression or custom timetable. This DAG will run daily. 120 | schedule_interval="@daily", 121 | # Default settings applied to all tasks within the DAG; can be overwritten at the task level. 122 | default_args={ 123 | "owner": "community", # This defines the value of the "owner" column in the DAG view of the Airflow UI 124 | "retries": 2, # If a task fails, it will retry 2 times. 125 | "retry_delay": timedelta( 126 | minutes=3 127 | ), # A task that fails will wait 3 minutes to retry. 128 | }, 129 | default_view="graph", # This defines the default view for this DAG in the Airflow UI 130 | # When catchup=False, your DAG will only run for the latest schedule interval. In this case, this means 131 | # that tasks will not be run between June 11, 2021 and 1 day ago. When turned on, this DAG's first run 132 | # will be for today, per the @daily schedule interval 133 | catchup=False, 134 | tags=["example"], # If set, this tag is shown in the DAG view of the Airflow UI 135 | ) 136 | def example_dag_advanced(): 137 | # DummyOperator placeholder for first task 138 | begin = DummyOperator(task_id="begin") 139 | # Last task will only trigger if no previous task failed 140 | end = DummyOperator(task_id="end", trigger_rule=TriggerRule.NONE_FAILED) 141 | 142 | # This task checks which day of the week it is 143 | check_day_of_week = BranchDayOfWeekOperator( 144 | task_id="check_day_of_week", 145 | week_day={WeekDay.SATURDAY, WeekDay.SUNDAY}, # This checks day of week 146 | follow_task_ids_if_true="weekend", # Next task if criteria is met 147 | follow_task_ids_if_false="weekday", # Next task if criteria is not met 148 | use_task_execution_day=True, # If True, uses task’s execution day to compare with is_today 149 | ) 150 | 151 | weekend = DummyOperator(task_id="weekend") # "weekend" placeholder task 152 | weekday = DummyOperator(task_id="weekday") # "weekday" placeholder task 153 | 154 | # Templated value for determining the name of the day of week based on the start date of the DAG Run 155 | day_name = "{{ dag_run.start_date.strftime('%A').lower() }}" 156 | 157 | # Begin weekday tasks. 
158 | # Tasks within this TaskGroup (weekday tasks) will be grouped together in the Airflow UI 159 | with TaskGroup("weekday_activities") as weekday_activities: 160 | which_weekday_activity_day = BranchPythonOperator( 161 | task_id="which_weekday_activity_day", 162 | python_callable=_get_activity, # Python function called when task executes 163 | op_args=[day_name], 164 | ) 165 | 166 | for day, day_info in DAY_ACTIVITY_MAPPING.items(): 167 | if day_info["is_weekday"]: 168 | day_of_week = Label(label=day) 169 | activity = day_info["activity"] 170 | 171 | # This task prints the weekday activity to bash 172 | do_activity = BashOperator( 173 | task_id=activity.replace(" ", "_"), 174 | bash_command=f"echo It's {day.capitalize()} and I'm busy with {activity}.", # This is the bash command to run 175 | ) 176 | 177 | # Declaring task dependencies within the "TaskGroup" via the classic bitshift operator. 178 | which_weekday_activity_day >> day_of_week >> do_activity 179 | 180 | # Begin weekend tasks 181 | # Tasks within this TaskGroup will be grouped together in the UI 182 | with TaskGroup("weekend_activities") as weekend_activities: 183 | which_weekend_activity_day = BranchPythonOperator( 184 | task_id="which_weekend_activity_day", 185 | python_callable=_get_activity, # Python function called when task executes 186 | op_args=[day_name], 187 | ) 188 | 189 | # Labels that will appear in the Graph view of the Airflow UI 190 | saturday = Label(label="saturday") 191 | sunday = Label(label="sunday") 192 | 193 | # This task prints the Sunday activity to bash 194 | sleeping_in = BashOperator( 195 | task_id="sleeping_in", bash_command="sleep $[ ( $RANDOM % 30 ) + 1 ]s" 196 | ) 197 | 198 | going_to_the_beach = _going_to_the_beach() # Calling the taskflow function 199 | 200 | # Because the "_going_to_the_beach()" function has "multiple_outputs" enabled, each dict key is 201 | # accessible as their own "XCom" key. 202 | inviting_friends = EmailOperator( 203 | task_id="inviting_friends", 204 | to="friends@community.com", # Email to send email to 205 | subject=going_to_the_beach["subject"], # Email subject 206 | html_content=going_to_the_beach["body"], # Eamil body content 207 | ) 208 | 209 | # Using "chain()" here for list-to-list dependencies which are not supported by the bitshift 210 | # operator and to simplify the notation for the desired dependency structure. 211 | chain( 212 | which_weekend_activity_day, 213 | [saturday, sunday], 214 | [going_to_the_beach, sleeping_in], 215 | ) 216 | 217 | # High-level dependencies between tasks 218 | chain( 219 | begin, 220 | check_day_of_week, 221 | [weekday, weekend], 222 | [weekday_activities, weekend_activities], 223 | end, 224 | ) 225 | 226 | # Task dependency created by XComArgs: 227 | # going_to_the_beach >> inviting_friends 228 | 229 | 230 | dag = example_dag_advanced() 231 | --------------------------------------------------------------------------------
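
For reference, a stripped-down sketch of the Dataset hand-off that connects `fakestore_elt_pipeline` to `fakestore_dbt_job_pipeline` above: the producer lists the Dataset as a task `outlet`, and the consumer uses the same Dataset URI as its `schedule`. The dag_ids below are placeholders; only the URI `//fakestore_dwh/tables` is taken from the real DAGs, and the sketch assumes an Airflow version with Dataset scheduling (2.4+), as already used in this project.

```python
# Stripped-down sketch of the data-aware scheduling used by the two fakestore DAGs.
# The dag_ids here are placeholders; the URI "//fakestore_dwh/tables" matches
# airflow_dbt_dag_1.py (task outlet) and airflow_dbt_dag_2.py (DAG schedule).
from datetime import datetime

from airflow import DAG, Dataset
from airflow.operators.empty import EmptyOperator

FAKESTORE_TABLES = Dataset("//fakestore_dwh/tables")

with DAG("elt_producer", start_date=datetime(2023, 6, 11), schedule="@daily", catchup=False):
    # completing this task marks the Dataset as updated
    EmptyOperator(task_id="end_pipeline", outlets=[FAKESTORE_TABLES])

with DAG("dbt_consumer", start_date=datetime(2023, 6, 11), schedule=[FAKESTORE_TABLES], catchup=False):
    # this DAG is triggered whenever the Dataset above is updated
    EmptyOperator(task_id="start_pipeline")
```
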