├── packages.txt ├── .airflowignore ├── dbt ├── seeds │ └── .gitkeep ├── tests │ └── .gitkeep ├── analyses │ └── .gitkeep ├── macros │ ├── .gitkeep │ ├── macro_docs.yml │ ├── abstract_cte.sql │ └── macro_docs.md ├── snapshots │ ├── .gitkeep │ └── fakestore_products_history.sql ├── models │ ├── marts │ │ ├── .gitkeep │ │ ├── top_performing_products.sql │ │ ├── user_portfolio.sql │ │ └── mart_docs.yml │ └── staging │ │ ├── stg_users.sql │ │ └── src_fakestoredata.yml ├── .gitignore ├── README.md └── dbt_project.yml ├── dags ├── .airflowignore ├── utils │ ├── utils.py │ └── ddl_scripts.sql ├── airflow_dbt_dag_2.py ├── example_dag_basic.py ├── airflow_dbt_dag_1.py └── example_dag_advanced.py ├── Dockerfile ├── requirements.txt ├── .astro ├── config.yaml └── test_dag_integrity_default.py ├── .dockerignore ├── img └── workflow.png ├── start.sh ├── .gitignore ├── .env.example ├── docker-compose.override.yml ├── airflow_settings.example.yaml ├── tests └── dags │ └── test_dag_integrity.py ├── include ├── helper_scripts.py └── transformers.py └── README.md /packages.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.airflowignore: -------------------------------------------------------------------------------- 1 | dbt/ -------------------------------------------------------------------------------- /dbt/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dags/.airflowignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/models/marts/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/astronomer/astro-runtime:9.1.0 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-dbt-cloud==3.2.0 -------------------------------------------------------------------------------- /.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-dbt-magic 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | astro 2 | .git 
3 | .env 4 | airflow_settings.yaml 5 | logs/ 6 | -------------------------------------------------------------------------------- /img/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VICIWUOHA/airflow-dbt-magic/HEAD/img/workflow.png -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | # installl astro cli and start astro 2 | 3 | curl -sSL install.astronomer.io | sudo bash -s 4 | astro dev start -------------------------------------------------------------------------------- /dags/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | dag_owner = os.getenv("DATAFEST_23_USER") 4 | DBT_JOB_SCHEMA = "dbt_" + dag_owner 5 | -------------------------------------------------------------------------------- /dbt/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default .gitignore content added by dbt Cloud 3 | target/ 4 | dbt_packages/ 5 | logs/ 6 | # end dbt Cloud content 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | .DS_Store # macOS specific ignore 4 | airflow_settings.yaml 5 | __pycache__/ 6 | astro 7 | data_lake/* 8 | .vscode 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DATAFEST_23_USER=ANY_USER_NAME_in_lower_case 2 | DATAFEST_23_DB=DB_NAME_OF_CHOICE 3 | ENV_PG_DB_URI=postgresql://DB_USER:DB_PASSWORD@host -------------------------------------------------------------------------------- /docker-compose.override.yml: -------------------------------------------------------------------------------- 1 | version: "3.1" 2 | services: 3 | scheduler: 4 | user: root 5 | volumes: 6 | - ./data_lake:/usr/local/airflow/data_lake:rw -------------------------------------------------------------------------------- /dbt/models/marts/top_performing_products.sql: -------------------------------------------------------------------------------- 1 | -- Top 10 products Model 2 | SELECT 3 | id, 4 | title as product_name, 5 | rating_rate as product_rating 6 | FROM {{ source('fakestoreapi','products')}} 7 | ORDER BY 3 DESC 8 | LIMIT 10 -------------------------------------------------------------------------------- /dbt/macros/macro_docs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: abstract_cte 5 | description: A macro to abstract selection of all columns with a cte 6 | arguments: 7 | - name: tuple_list 8 | type: array 9 | description: '{{ doc("abstract_cte")}}' -------------------------------------------------------------------------------- /dbt/models/staging/stg_users.sql: -------------------------------------------------------------------------------- 1 | -- Staging Model for Users with only appropiate data 2 | 3 | SELECT 4 | id, 5 | INITCAP(firstname) ||' '|| INITCAP(lastname) as full_name, 6 | email, 7 | phone, 8 | address_zipcode 9 | FROM {{ source('fakestoreapi','users')}} -------------------------------------------------------------------------------- /dbt/macros/abstract_cte.sql: 
-------------------------------------------------------------------------------- 1 | {% macro abstract_cte(tuple_list) %} 2 | 3 | WITH{% for cte_ref in tuple_list %} {{cte_ref[0]}} AS ( 4 | 5 | SELECT * 6 | FROM {{ ref(cte_ref[1]) }} 7 | 8 | ) 9 | {%- if not loop.last -%} 10 | , 11 | {%- endif -%} 12 | 13 | {%- endfor -%} 14 | 15 | {%- endmacro %} 16 | -------------------------------------------------------------------------------- /dbt/snapshots/fakestore_products_history.sql: -------------------------------------------------------------------------------- 1 | {% snapshot fakestore_products_history%} 2 | 3 | {{ 4 | config( 5 | target_schema=env_var("DBT_DATAFEST_23_SCHEMA"), 6 | unique_key='id', 7 | strategy='timestamp', 8 | updated_at='updated_at', 9 | invalidate_hard_deletes=True, 10 | ) 11 | }} 12 | 13 | SELECT * FROM {{source('fakestoreapi','products')}} 14 | --This would start tracking the changes on inventory items from the moment this model was created. 15 | {% endsnapshot%} -------------------------------------------------------------------------------- /dbt/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [dbt community](http://community.getbdt.com/) to learn from other analytics engineers 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /airflow_settings.example.yaml: -------------------------------------------------------------------------------- 1 | airflow: 2 | connections: ## conn_id and conn_type are required 3 | - conn_id: postgres_default 4 | conn_type: postgres 5 | conn_host: YourPostgresDBHost eg; abcd123.efg456.hij.com OR localhost OR IP 6 | conn_schema: DB_NAME 7 | conn_login: DB_USERNAME 8 | conn_password: DB_PASSWORD 9 | conn_port: 5432 #or any specific port (the default postgres shipped with airflow uses this port) 10 | - conn_id: dbt_cloud_default 11 | conn_type: dbt_cloud 12 | conn_host: cloud.getdbt.com 13 | conn_login: YOUR_DBT_ACCOUNT_ID 14 | conn_password: YOUR_DBT_API_TOKEN 15 | variables: ## variable_name and variable_value are required 16 | - variable_name: datafest_meetup_job 17 | variable_value: YOUR_DBT_CLOUD_JOB_ID 18 | -------------------------------------------------------------------------------- /dbt/models/staging/src_fakestoredata.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: fakestoreapi 5 | description: Source schema for all fakestoreapi data 6 | database: "{{ env_var('DBT_DATAFEST_23_DB') }}" 7 | schema: "{{ env_var('DBT_DATAFEST_23_SCHEMA') }}" 8 | tables: 9 | - name: carts 10 | description: table containing cart info placed by users at the store 11 | - name: products 12 | description: table containing the products available on the store site 13 | - name: users 14 | description: table that holds user information 15 | tags: ["contains pii"] 16 | columns: 17 | - name: id 18 | description: unique identifier of the user 19 | tests: 20 | - 
unique 21 | - not_null 22 | models: 23 | - name: stg_users 24 | description: Staging Model for Important User Info 25 | -------------------------------------------------------------------------------- /dbt/macros/macro_docs.md: -------------------------------------------------------------------------------- 1 | {% docs abstract_cte %} 2 | 3 | ### abstract_cte 4 | 5 | This macro abstracts the need to write all your **select * ctes** explicitly and saves lines of code to be generated at compile time. 6 | 7 | - #### tuple_list: An array containing one or more tuples of two elements as follows; 8 | 9 | - 1. the name to give the cte 10 | - 2. the name of the model to be referenced. 11 | - #### Example: 12 | 13 | {% raw %} 14 | ``` 15 | {{ abstract_cte([ 16 | ('system_a','system_a_daily_sales'), 17 | ('system_b','system_b_daily_sales') 18 | ]) 19 | }} 20 | would be compiled as ; 21 | 22 | WITH system_a AS ( 23 | SELECT * 24 | FROM {{ref('system_a_daily_sales')}} 25 | ), 26 | system_b AS ( 27 | SELECT * 28 | FROM {{ref('system_b_daily_sales')}} 29 | ), 30 | ``` 31 | {% endraw %} 32 | 33 | - The Rest of the logic can then be implemented downstream. 34 | 35 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'airflow_dbt_magic' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `source-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 
34 | models: 35 | airflow_dbt_magic: 36 | # Applies to all files under models/example/ 37 | marts: 38 | materialized: view 39 | staging: 40 | materialized: view 41 | -------------------------------------------------------------------------------- /dags/airflow_dbt_dag_2.py: -------------------------------------------------------------------------------- 1 | # when dataset is updated in db, this dag should trigger the dbt cloud run via the API 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG, Dataset 5 | from airflow.models import Variable 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator 8 | 9 | from utils.utils import dag_owner, DBT_JOB_SCHEMA 10 | 11 | # define arguments to be used in dag 12 | dbt_job_id: int = Variable.get("datafest_meetup_job") 13 | default_args = { 14 | "owner": dag_owner, 15 | "depends_on_past": False, 16 | "retries": 1, 17 | "retry_delay": timedelta(minutes=2), 18 | } 19 | 20 | with DAG( 21 | dag_id="fakestore_dbt_job_pipeline", 22 | default_args=default_args, 23 | description="Simple Dag to Trigger Dbt Production Job", 24 | start_date=datetime(2023, 6, 11), 25 | schedule=[Dataset("//fakestore_dwh/tables")], 26 | catchup=False, 27 | tags=["airflow-dbt-magic"], 28 | ) as dag: 29 | start = EmptyOperator(task_id="start_pipeline") 30 | 31 | trigger_dbt_cloud_job = DbtCloudRunJobOperator( 32 | task_id="trigger_dbt_cloud_job", 33 | job_id=dbt_job_id, 34 | trigger_reason=f"Triggered From Airflow Dataset Update configured by {dag_owner}", 35 | steps_override=None, 36 | schema_override=DBT_JOB_SCHEMA, 37 | wait_for_termination=True, 38 | timeout=400, 39 | check_interval=60, 40 | additional_run_config=None, 41 | ) 42 | 43 | start >> trigger_dbt_cloud_job 44 | -------------------------------------------------------------------------------- /dbt/models/marts/user_portfolio.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | abstract_cte([ 3 | ('users','stg_users') 4 | ]) 5 | }}, 6 | 7 | 8 | user_cart_summary AS ( 9 | 10 | SELECT 11 | user_id, 12 | COUNT(DISTINCT product_id) AS unique_products_tried_out, 13 | SUM(quantity) AS lifetime_product_volume, 14 | COUNT(DISTINCT date) AS no_of_days_in_store 15 | 16 | FROM {{ source('fakestoreapi','carts')}} 17 | GROUP BY 1 18 | -- HAVING ((COUNT(DISTINCT date)) > 1 OR (COUNT(DISTINCT product_id))>1) -- have visisted the fakestore more than once 19 | 20 | ), 21 | 22 | 23 | user_top_products AS ( 24 | 25 | SELECT 26 | a.user_id, 27 | c.full_name, 28 | c.email, 29 | b.title as top_product, 30 | a.quantity as top_product_quantity 31 | FROM( 32 | SELECT 33 | user_id, 34 | product_id, 35 | quantity, 36 | ROW_NUMBER() OVER (PARTITION BY user_id ORDER by quantity DESC ) as rn 37 | FROM {{ source('fakestoreapi','carts')}} 38 | ) AS a 39 | LEFT JOIN {{ source('fakestoreapi','products')}} AS b 40 | ON a.product_id = b.id 41 | LEFT JOIN {{ ref('stg_users')}} c 42 | ON a.user_id = c.id 43 | WHERE a.rn = 1 44 | ) 45 | 46 | 47 | SELECT 48 | a.user_id, 49 | a.full_name, 50 | a.email, 51 | b.lifetime_product_volume, 52 | b.unique_products_tried_out, 53 | b.no_of_days_in_store, 54 | a.top_product, 55 | a.top_product_quantity 56 | FROM user_top_products a 57 | LEFT JOIN user_cart_summary b 58 | ON a.user_id = b.user_id -------------------------------------------------------------------------------- /dags/utils/ddl_scripts.sql: 
-------------------------------------------------------------------------------- 1 | --Syntax: PostgreSql 2 | -- Execute against Data Warehouse 3 | -- {{params.schema}} would be passed at runtime 4 | -- Drop Statements can be commented out if not needed.(but subsequent runs would fail on pkey constraints) 5 | 6 | CREATE SCHEMA IF NOT EXISTS {{params.schema}}; 7 | DROP TABLE IF EXISTS {{params.schema}}.products CASCADE; 8 | CREATE TABLE IF NOT EXISTS {{params.schema}}.products( 9 | id numeric, 10 | title varchar, 11 | price numeric, 12 | description varchar, 13 | category varchar, 14 | image varchar, 15 | rating_rate decimal, 16 | rating_count numeric, 17 | updated_at timestamp, 18 | _datafest_meetup_user varchar, 19 | uuid varchar primary key 20 | 21 | ); 22 | 23 | DROP TABLE IF EXISTS {{params.schema}}.users CASCADE; 24 | CREATE TABLE IF NOT EXISTS {{params.schema}}.users( 25 | id numeric, 26 | email varchar, 27 | username varchar, 28 | phone varchar, 29 | address_geolocation_lat numeric, 30 | address_geolocation_long numeric, 31 | address_city varchar, 32 | address_street varchar, 33 | address_number numeric, 34 | address_zipcode varchar, 35 | firstname varchar, 36 | lastname varchar, 37 | updated_at timestamp, 38 | _datafest_meetup_user varchar, 39 | uuid varchar primary key 40 | 41 | ); 42 | 43 | DROP TABLE IF EXISTS {{params.schema}}.carts CASCADE; 44 | CREATE TABLE IF NOT EXISTS {{params.schema}}.carts( 45 | cart_id numeric, 46 | id varchar, 47 | date date, 48 | user_id numeric, 49 | product_id numeric, 50 | quantity numeric, 51 | _datafest_meetup_user varchar, 52 | uuid varchar primary key 53 | 54 | ); -------------------------------------------------------------------------------- /dbt/models/marts/mart_docs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: top_performing_products 5 | description: Shows 10 most rated products on the Fakestore API 6 | columns: 7 | - name: id 8 | description: Product Unique Identifier 9 | tests: 10 | - not_null 11 | - name: product_name 12 | description: The Name of the product as it appears on the title bar of the fakestore 13 | - name: product_rating 14 | description: The rating of this product as starred by customers 15 | 16 | - name: user_portfolio 17 | description: Table showing a Users Portfolio based on purchase history from the fakestore. 18 | columns: 19 | - name: user_id 20 | description: Unique id of a user. 21 | - name: full_name 22 | description: User's First and Last names. 23 | - name: email 24 | description: Email of the User 25 | tests: 26 | - unique 27 | - name: lifetime_product_volume, 28 | description: Total quantity of Products a user has bought over time 29 | - name: unique_products_tried_out 30 | description: The Unique number of products the customer has tried out irrespective of their volume 31 | - name: no_of_days_in_store, 32 | description: Number of days on which the customer has visited the fake store 33 | - name: top_product 34 | description: Most purchased product by this customer based on 35 | - name: top_product_quantity 36 | description: The Volume of the top product that the customer has bought 37 | 38 | exposures: 39 | - name: customer_portfolio_app 40 | description: Application Containing Summary Matrica of a Customer's value in the FakeStore business. 
41 | type: application 42 | depends_on: 43 | - ref('user_portfolio') 44 | owner: 45 | email: dbtlagosmeetup@email.com -------------------------------------------------------------------------------- /tests/dags/test_dag_integrity.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. This test ensures that all Dags have tags, retries set to two, and no import errors. Feel free to add and remove tests.""" 2 | 3 | import os 4 | import logging 5 | from contextlib import contextmanager 6 | import pytest 7 | from airflow.models import DagBag 8 | 9 | 10 | @contextmanager 11 | def suppress_logging(namespace): 12 | logger = logging.getLogger(namespace) 13 | old_value = logger.disabled 14 | logger.disabled = True 15 | try: 16 | yield 17 | finally: 18 | logger.disabled = old_value 19 | 20 | 21 | def get_import_errors(): 22 | """ 23 | Generate a tuple for import errors in the dag bag 24 | """ 25 | with suppress_logging("airflow"): 26 | dag_bag = DagBag(include_examples=False) 27 | 28 | def strip_path_prefix(path): 29 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 30 | 31 | # we prepend "(None,None)" to ensure that a test object is always created even if its a no op. 32 | return [(None, None)] + [ 33 | (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items() 34 | ] 35 | 36 | 37 | def get_dags(): 38 | """ 39 | Generate a tuple of dag_id, in the DagBag 40 | """ 41 | with suppress_logging("airflow"): 42 | dag_bag = DagBag(include_examples=False) 43 | 44 | def strip_path_prefix(path): 45 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 46 | 47 | return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()] 48 | 49 | 50 | @pytest.mark.parametrize( 51 | "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 52 | ) 53 | def test_file_imports(rel_path, rv): 54 | """Test for import errors on a file""" 55 | if rel_path and rv: 56 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 57 | 58 | 59 | APPROVED_TAGS = {} 60 | 61 | 62 | @pytest.mark.parametrize( 63 | "dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()] 64 | ) 65 | def test_dag_tags(dag_id, dag, fileloc): 66 | """ 67 | test if a DAG is tagged and if those TAGs are in the approved list 68 | """ 69 | assert dag.tags, f"{dag_id} in {fileloc} has no tags" 70 | if APPROVED_TAGS: 71 | assert not set(dag.tags) - APPROVED_TAGS 72 | 73 | 74 | @pytest.mark.parametrize( 75 | "dag_id,dag, fileloc", get_dags(), ids=[x[2] for x in get_dags()] 76 | ) 77 | def test_dag_retries(dag_id, dag, fileloc): 78 | """ 79 | test if a DAG has retries set 80 | """ 81 | assert ( 82 | dag.default_args.get("retries", None) >= 2 83 | ), f"{dag_id} in {fileloc} does not have retries not set to 2." 84 | -------------------------------------------------------------------------------- /dags/example_dag_basic.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | 4 | from airflow.decorators import ( 5 | dag, 6 | task, 7 | ) # DAG and task decorators for interfacing with the TaskFlow API 8 | 9 | 10 | @dag( 11 | # This defines how often your DAG will run, or the schedule by which your DAG runs. In this case, this DAG 12 | # will run daily 13 | schedule_interval="@daily", 14 | # This DAG is set to run for the first time on January 1, 2021. Best practice is to use a static 15 | # start_date. 
Subsequent DAG runs are instantiated based on scheduler_interval 16 | start_date=datetime(2021, 1, 1), 17 | # When catchup=False, your DAG will only run for the latest schedule_interval. In this case, this means 18 | # that tasks will not be run between January 1, 2021 and 30 mins ago. When turned on, this DAG's first 19 | # run will be for the next 30 mins, per the schedule_interval 20 | catchup=False, 21 | default_args={ 22 | "retries": 2, # If a task fails, it will retry 2 times. 23 | }, 24 | tags=["example"], 25 | ) # If set, this tag is shown in the DAG view of the Airflow UI 26 | def example_dag_basic(): 27 | """ 28 | ### Basic ETL Dag 29 | This is a simple ETL data pipeline example that demonstrates the use of 30 | the TaskFlow API using three simple tasks for extract, transform, and load. 31 | For more information on Airflow's TaskFlow API, reference documentation here: 32 | https://airflow.apache.org/docs/apache-airflow/stable/tutorial_taskflow_api.html 33 | """ 34 | 35 | @task() 36 | def extract(): 37 | """ 38 | #### Extract task 39 | A simple "extract" task to get data ready for the rest of the 40 | pipeline. In this case, getting data is simulated by reading from a 41 | hardcoded JSON string. 42 | """ 43 | data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}' 44 | 45 | order_data_dict = json.loads(data_string) 46 | return order_data_dict 47 | 48 | @task( 49 | multiple_outputs=True 50 | ) # multiple_outputs=True unrolls dictionaries into separate XCom values 51 | def transform(order_data_dict: dict): 52 | """ 53 | #### Transform task 54 | A simple "transform" task which takes in the collection of order data and 55 | computes the total order value. 56 | """ 57 | total_order_value = 0 58 | 59 | for value in order_data_dict.values(): 60 | total_order_value += value 61 | 62 | return {"total_order_value": total_order_value} 63 | 64 | @task() 65 | def load(total_order_value: float): 66 | """ 67 | #### Load task 68 | A simple "load" task that takes in the result of the "transform" task and prints it out, 69 | instead of saving it to end user review 70 | """ 71 | 72 | print(f"Total order value is: {total_order_value:.2f}") 73 | 74 | order_data = extract() 75 | order_summary = transform(order_data) 76 | load(order_summary["total_order_value"]) 77 | 78 | 79 | example_dag_basic = example_dag_basic() 80 | -------------------------------------------------------------------------------- /include/helper_scripts.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import os 4 | from typing import List, Literal 5 | from time import sleep 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def transform_col_names(dataset: pd.DataFrame) -> List[str]: 10 | """Transformer function to change CamelCases to sql friendly snake_case and handle irregularities in field names. 11 | Parameters 12 | ---------- 13 | dataset : pd.DataFrame 14 | DataFrame for which a column name transformation is needed 15 | 16 | Returns 17 | ------- 18 | List[str] 19 | List of transformed column names to be mapped to the original dataframe. 
20 | """ 21 | 22 | old_col_list = dataset.columns.to_list() 23 | new_col_list = [] 24 | for col in old_col_list: 25 | # change from CamelCase to snake_case 26 | col_name = re.sub("([a-z])([A-Z0-9])", r"\1_\2", col) 27 | # replace dots with underscores 28 | new_name = col_name.replace("name.", "").replace(".", "_") 29 | new_col_list.append(new_name.lower()) 30 | 31 | return new_col_list 32 | 33 | 34 | def load_to_db( 35 | table_name: str, 36 | dataset: pd.DataFrame, 37 | db_name: str = os.getenv("DATAFEST_23_DB"), 38 | schema: str = "dbt_" + str(os.getenv("DATAFEST_23_USER")), 39 | if_exists="append", 40 | ) -> Literal[True]: 41 | """Connects to a Database and Loads a supplied Dataframe to a specific schema and table in that Database 42 | 43 | Parameters 44 | ---------- 45 | db_name : str 46 | The database to connect to 47 | table_name : str 48 | The table to be loaded with data 49 | dataset : pd.DataFrame 50 | data to be loaded to db table 51 | schema : str, optional 52 | schema of the database to be operated upon ->default: dev 53 | if_exists : str, optional 54 | Logic to apply if the table already exists -> default: appends to table. "append" 55 | 56 | Returns 57 | ------- 58 | boolean|None 59 | True if load action was successful otherwise Nothing is returned. 60 | 61 | Raises 62 | ------ 63 | Exception 64 | Any SqlAlchemy Engine Connection Error encountered. 65 | Exception 66 | Any Exception after connection that occurs during database load operation. 67 | """ 68 | print("=> Connecting to Database......") 69 | try: 70 | DEV_ENV = os.getenv("ENV_PG_DB_URI") 71 | engine = create_engine(f"{DEV_ENV}/{db_name}") 72 | conx = engine.connect() 73 | message = "=>Successfully Established Connection to Database" 74 | print(message) 75 | except Exception as e: 76 | error_log = "Pipeline Broken,Connection to Database Failed : {}".format(e) 77 | print(error_log) 78 | raise e 79 | print("=> Writing Data to Database.....") 80 | sleep(2) 81 | try: 82 | dataset.to_sql( 83 | f"{table_name}", con=conx, schema=schema, if_exists=if_exists, index=False 84 | ) 85 | message = ( 86 | "***==> Successfully Written `{}` Rows of Data to db: `{}.{}.{}` .".format( 87 | len(dataset), db_name, schema, table_name 88 | ) 89 | ) 90 | conx.close() 91 | print(message) 92 | return True 93 | except Exception as e: 94 | error_log = "Pipeline Broken,Failed to Write data to Database : {}".format(e) 95 | print(error_log) 96 | raise e 97 | -------------------------------------------------------------------------------- /.astro/test_dag_integrity_default.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. 
**USED BY DEV PARSE COMMAND DO NOT EDIT**""" 2 | from contextlib import contextmanager 3 | import logging 4 | import os 5 | 6 | import pytest 7 | 8 | from airflow.models import DagBag, Variable, Connection 9 | from airflow.hooks.base import BaseHook 10 | 11 | 12 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables 13 | 14 | # =========== MONKEYPATCH BaseHook.get_connection() =========== 15 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs): 16 | print( 17 | f"Attempted to fetch connection during parse returning an empty Connection object for {key}" 18 | ) 19 | return Connection(key) 20 | 21 | 22 | BaseHook.get_connection = basehook_get_connection_monkeypatch 23 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() =========== 24 | 25 | # =========== MONKEYPATCH OS.GETENV() =========== 26 | def os_getenv_monkeypatch(key: str, *args, default=None, **kwargs): 27 | print( 28 | f"Attempted to fetch os environment variable during parse, returning a mocked value for {key}" 29 | ) 30 | if ( 31 | key == "JENKINS_HOME" and default is None 32 | ): # fix https://github.com/astronomer/astro-cli/issues/601 33 | return None 34 | if default: 35 | return default 36 | return "NON_DEFAULT_OS_ENV_VALUE" 37 | 38 | 39 | os.getenv = os_getenv_monkeypatch 40 | # # =========== /MONKEYPATCH OS.GETENV() =========== 41 | 42 | # =========== MONKEYPATCH VARIABLE.GET() =========== 43 | 44 | 45 | class magic_dict(dict): 46 | def __init__(self, *args, **kwargs): 47 | self.update(*args, **kwargs) 48 | 49 | def __getitem__(self, key): 50 | return {}.get(key, "MOCKED_KEY_VALUE") 51 | 52 | 53 | def variable_get_monkeypatch(key: str, default_var=None, deserialize_json=False): 54 | print( 55 | f"Attempted to get Variable value during parse, returning a mocked value for {key}" 56 | ) 57 | 58 | if default_var: 59 | return default_var 60 | if deserialize_json: 61 | return magic_dict() 62 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE" 63 | 64 | 65 | Variable.get = variable_get_monkeypatch 66 | # # =========== /MONKEYPATCH VARIABLE.GET() =========== 67 | 68 | 69 | @contextmanager 70 | def suppress_logging(namespace): 71 | """ 72 | Suppress logging within a specific namespace to keep tests "clean" during build 73 | """ 74 | logger = logging.getLogger(namespace) 75 | old_value = logger.disabled 76 | logger.disabled = True 77 | try: 78 | yield 79 | finally: 80 | logger.disabled = old_value 81 | 82 | 83 | def get_import_errors(): 84 | """ 85 | Generate a tuple for import errors in the dag bag 86 | """ 87 | with suppress_logging("airflow"): 88 | dag_bag = DagBag(include_examples=False) 89 | 90 | def strip_path_prefix(path): 91 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 92 | 93 | # we prepend "(None,None)" to ensure that a test object is always created even if its a no op. 
94 | return [(None, None)] + [ 95 | (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items() 96 | ] 97 | 98 | 99 | @pytest.mark.parametrize( 100 | "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 101 | ) 102 | def test_file_imports(rel_path, rv): 103 | """Test for import errors on a file""" 104 | if rel_path and rv: # Make sure our no op test doesn't raise an error 105 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 106 | -------------------------------------------------------------------------------- /dags/airflow_dbt_dag_1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import Dataset 3 | from airflow.decorators import ( 4 | dag, 5 | task, 6 | ) 7 | 8 | # import astro.sql as aql 9 | # from astro.sql.table import Table 10 | from airflow.operators.empty import EmptyOperator 11 | from airflow.providers.postgres.operators.postgres import PostgresOperator 12 | from include.transformers import FakeStoreApiTransformer, FAKE_STORE_ARTIFACTS 13 | from include.helper_scripts import load_to_db 14 | from utils.utils import dag_owner, DBT_JOB_SCHEMA 15 | 16 | dag_docs = """ This DAG gets and transforms fictitious retail data from http://fakestoreapi.com into a data warehouse. 17 | It also implements airflow concepts such as; 18 | 19 | - `task.expand()` for dynamic mapping 20 | - airflow `Datasets` for Data aware scheduling used to trigger a DBT DAG run in fakestore_dbt_dag 21 | 22 | """ 23 | 24 | 25 | @dag( 26 | dag_id="fakestore_elt_pipeline", 27 | start_date=datetime(2023, 6, 11), 28 | # This defines how many instantiations of this DAG (DAG Runs) can execute concurrently. In this case, 29 | # we're only allowing 1 DAG run at any given time, as opposed to allowing multiple overlapping DAG runs. 30 | max_active_runs=1, 31 | schedule_interval="@daily", 32 | # Default settings applied to all tasks within the DAG; can be overwritten at the task level. 33 | default_args={ 34 | "owner": f"{dag_owner}", # This defines the value of the "owner" column in the DAG view of the Airflow UI 35 | "retries": 1, # If a task fails, it will retry 2 times. 36 | "retry_delay": timedelta( 37 | seconds=30 38 | ), # A task that fails will wait 30 seconds to retry. 
39 | }, 40 | catchup=False, 41 | tags=["airflow-dbt-magic"], 42 | doc_md=dag_docs, 43 | ) 44 | def data_elt_process(): 45 | create_destination_schema_and_tables = PostgresOperator( 46 | task_id="create_destination_tables_if_not_exists", 47 | postgres_conn_id="postgres_default", 48 | sql="utils/ddl_scripts.sql", 49 | params={"schema": DBT_JOB_SCHEMA}, 50 | ) 51 | 52 | transformer = FakeStoreApiTransformer() 53 | 54 | @task(max_active_tis_per_dag=1) 55 | def get_and_load_data(artifact: str, ti=None): 56 | file_path = transformer.get_fakestore_data(artifact) 57 | ti.xcom_push(key="uploaded_file_paths", value=file_path) 58 | 59 | upload_files = get_and_load_data.expand(artifact=FAKE_STORE_ARTIFACTS) 60 | 61 | transform_tasks = [] 62 | # run transform tasks in parallel 63 | for artifact in FAKE_STORE_ARTIFACTS: 64 | 65 | @task(task_id=f"transform_and_load_{artifact}") 66 | def transform_and_load(artifact: str, ti=None): 67 | print("=> Transforming data.........") 68 | 69 | file_paths = list(ti.xcom_pull(key="uploaded_file_paths")) 70 | file_to_transform = [file for file in list(file_paths) if artifact in file][ 71 | 0 72 | ] 73 | transformed_data = transformer.transform_fakestore_data( 74 | artifact=f"{artifact}", json_file_path=file_to_transform 75 | ) 76 | # load to db 77 | load = load_to_db(table_name=artifact, dataset=transformed_data) 78 | if load: 79 | return load 80 | 81 | transform_and_load_task = transform_and_load(artifact=artifact) 82 | transform_tasks.append(transform_and_load_task) 83 | 84 | end_pipeline = EmptyOperator( 85 | task_id="end_pipeline", 86 | trigger_rule="none_failed", 87 | outlets=[Dataset("//fakestore_dwh/tables")], 88 | ) 89 | 90 | # enforce dependencies 91 | ( 92 | create_destination_schema_and_tables 93 | >> upload_files 94 | >> transform_tasks 95 | >> end_pipeline 96 | ) 97 | 98 | 99 | dag_run = data_elt_process() 100 | -------------------------------------------------------------------------------- /include/transformers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | import json 4 | import os 5 | from hashlib import md5 6 | from datetime import datetime, timedelta 7 | from pathlib import Path 8 | from include.helper_scripts import transform_col_names 9 | 10 | FAKE_STORE_ARTIFACTS = ["users", "products", "carts"] 11 | pipeline_user = os.getenv("DATAFEST_23_USER") 12 | 13 | 14 | class FakeStoreApiTransformer: 15 | """Transfromer Class for manipulating data from the FakeStoreApi at https://fakestoreapi.com/ 16 | to deploy into a test datawarehouse. (This is not an exhaustive Class for all endpoints but PR's are welcome.) 17 | """ 18 | 19 | def __init__(self) -> None: 20 | self.base_url = "https://fakestoreapi.com/" 21 | 22 | def get_fakestore_data(self, artifact: str, query_params: str = "") -> str: 23 | """Makes a call to https://fakestoreapi.com/ for a specified artifact/entity and simply writes to file storage 24 | 25 | Parameters 26 | ---------- 27 | artifact : str 28 | The api endpoint/ db entity for which data should be gotten for. 29 | 30 | Returns 31 | ------- 32 | str 33 | File Path where extracted Json data was written to. 34 | 35 | Raises 36 | ------ 37 | Exception 38 | Any Exception encountered during REST API call or while writing to storage.. 
39 | """ 40 | try: 41 | # make API call once and get json response 42 | with requests.Session() as s: 43 | print(f"=> Now Getting data for {artifact}") 44 | artifact_data = s.get( 45 | self.base_url + f"{artifact}" + f"{query_params}" 46 | ).json() 47 | # write raw json to file storage: 48 | time_info = datetime.strftime( 49 | datetime.now() + timedelta(hours=1), "%Y_%m_%d_%H%M%S" 50 | ) 51 | file_path = f"data_lake/{artifact}/" 52 | file_name = f"{artifact}_{time_info}.json" 53 | # check if file path exists and create if needed 54 | path_exists = os.path.exists(file_path) 55 | if not path_exists: 56 | f_path = Path(file_path) 57 | f_path.mkdir(parents=True) 58 | 59 | with open(file_path + file_name, "w", encoding="utf-8") as file: 60 | raw_data = json.dumps({f"{artifact}": artifact_data}, indent=4) 61 | file.write(raw_data) 62 | 63 | print( 64 | f"=> ``{artifact}`` data written successfully to ``{file_path+file_name}``." 65 | ) 66 | return file_path + file_name 67 | 68 | except Exception as e: 69 | print("** Error while Calling API or writing to Data Lake.") 70 | raise e 71 | 72 | def transform_fakestore_data( 73 | self, artifact: str, json_file_path: str 74 | ) -> pd.DataFrame: 75 | """This Method Transforms defined artifacts from the FakeStoreApi as at 2023-06-15 , It supports 76 | (Users, Products & Carts) but can be extended. 77 | 78 | Parameters 79 | ---------- 80 | artifact : str 81 | The defined artifact on of (Users, Products & Carts) 82 | json_file_path : str 83 | json file path on file storage which can be accessed from the airflow. 84 | Returns 85 | ------- 86 | pd.DataFrame 87 | Normalized dataframe containing returned data from the Api in tabular format. 88 | """ 89 | 90 | with open(json_file_path) as file: 91 | data = json.load(file)[artifact] 92 | print("=> Normalizing Dataset..") 93 | if artifact != "carts": 94 | artifact_data_trans = pd.json_normalize(data) 95 | artifact_data_trans["updated_at"] = datetime.strftime( 96 | datetime.now() + timedelta(hours=1), "%Y-%m-%d %H:%M:%S" 97 | ) 98 | # artifact_data_trans 99 | else: 100 | artifact_data_trans = pd.json_normalize( 101 | data, record_path=["products"], meta=["id", "userId", "date"] 102 | ) 103 | artifact_data_trans = artifact_data_trans.rename(columns={"id": "cart_id"}) 104 | # create hash for item in cart purchased by user on a given date 105 | artifact_data_trans["id"] = ( 106 | artifact_data_trans["date"] 107 | + artifact_data_trans["userId"].astype(str) 108 | + artifact_data_trans["productId"].astype(str) 109 | ).apply(lambda val: md5(val.encode()).hexdigest()) 110 | artifact_data_trans = artifact_data_trans.reindex( 111 | columns=["cart_id", "id", "date", "userId", "productId", "quantity"] 112 | ) 113 | 114 | # standardize column names 115 | artifact_data_trans.columns = transform_col_names(artifact_data_trans) 116 | # validate that users have added appropiate env vars during live demo and add their id's to all tables before writes 117 | print("=> Applying Identifiers..") 118 | assert ( 119 | pipeline_user is not None 120 | ), "You must Include an env variable named DATAFEST_23_USER" 121 | artifact_data_trans["_datafest_meetup_user"] = pipeline_user 122 | artifact_data_trans["uuid"] = ( 123 | artifact_data_trans["id"].astype(str) 124 | + artifact_data_trans["_datafest_meetup_user"] 125 | ).apply(lambda val: md5(val.encode()).hexdigest()) 126 | artifact_data_clean = artifact_data_trans.drop( 127 | columns=["__v", "password"], errors="ignore" 128 | ) 129 | return artifact_data_clean 130 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AIRFLOW & DBT CLOUD INTEGRATION PROJECT FOR DATA & ANALYTICS ENGINEERS 2 | ======== 3 | 4 | *Author: [Victor Iwuoha](https://linkedin.com/in/viciwuoha)* 5 | 6 | *Date: 13th & 14th October 2023* 7 | 8 | *Event: DataFest Africa Workshop* 9 | 10 |   11 | Project Contents 12 | ================ 13 | This Project is built using the astro cli provisioned by [Astronomer](https://docs.astronomer.io/) 14 | To Run this project a linux environment is highly recommended. 15 | 16 | 17 | Workflow 18 | ================ 19 | ![workflow diagram](img/workflow.png) 20 | 21 | ### Key Takeaways: 22 | - Airflow Core Concepts such as (Operators, taskflow API, dynamic task mapping, Data-Aware Scheduling, Variables & Connections ) 23 | - DBT UI Familiarization Up to Job Creation with basic concepts of Macros, Documentation, Snapshots for SCD's, and Exposures. 24 | 25 | ### Prerequisites: 26 | 27 | - Linux Environment/ github codespaces/Ubuntu distribution on Windows 28 | - Docker Compose 29 | - A DBT Cloud Account (With an API Key) 30 | - A .env file at the root of this directory with environment variables exactly as those in .env.example but with actual values. (Do this edit In Codespaces / with a **machine of 4 Cores, 8GB RAM & 32GB Storage**) 31 | - An accessible Postgres database with a valid connection URL. (Spin Up a free one on [ElephantSql.com](https://elephantsql.com)). _In the Url, replace *postgres with **postgresql**_ 32 | - Basic Understanding of Python & SQL 33 | 34 | Deployment & Execution 35 | ====================== 36 | 37 | ### Steps for deployment: 38 | 39 | - Fork This Project to your git profile, create a branch named dev, then connect the repository to your dbt account. 40 | - Give DBT adequate access to connect to this repository on your git provider (github/gitlab) -> [see steps](https://docs.getdbt.com/docs/cloud/git/connect-github) 41 | - **Create a dbt project** with the name airflow_dbt_magic or any name of choice and point it to the dbt subdirectory of this repository. 42 | - **Create two DBT environment Variables** as follows; 43 | - Key: DBT_DATAFEST_23_DB Value: As used above within airflow .env 44 | - Key: DBT_DATAFEST_23_SCHEMA, Value: dbt_DATAFEST_23_USER (where DATAFEST_23_USER has the same value as used in .env above). This can basically be aby schema or database. 45 | - Create a Production environment and link it to the main branch, then create a simple DBT JOB in the Production Environment called AIRFLOW DBT JOB and add the commands (`dbt build` & `dbt snapshot`) Also select the generate docs on run checkbox. Note the **Job Id** as well as the **Account id** as they would be needed in Airflow. 46 | 47 | 48 | ### Execution: 49 | 50 | 1. Configuration, Connections & Airflow Variables setup 51 | - a. After adding the environment variables in [**Prerequisites** above](#prerequisites) to your .env file, (optionally) rename the `airflow_settings.example.yaml` file as `airflow_settings.yaml` and supply the adequate values, doing so would let astro automatically load these to your airflow instance (_otherwise, follow step 2 below_). 52 | - b. Run the start.sh script using the command `bash start.sh` This should start your project, export all environment variables and create a **data_lake/** dir. To restart your airflow container after any environment/config changes, simply run the command `astro dev restart`. 
53 | 2. Create 2 airflow Connections and one Airflow Variable by using the airflow UI via Admin>Variables 54 | - a. **DBT Cloud connection with the following;** 55 | - Connection Id: dbt_cloud_default 56 | - Account Id: YOUR_DBT_ACCOUNT_ID 57 | - Api Token: YOUR_DBT_API_TOKEN 58 |   59 | 60 | - b. **Postgres DB Connection as follows;** 61 | - Connection Id: postgres_default 62 | - Host: rajje.db.elephantsql.com (same as supplied in .env) or any other hosting platform including localhost. 63 | - Schema: As supplied during meetup or any other database on your host 64 | - Login: User name for schema 65 | - Password: Password of User to DB 66 | - Port: 5432 67 |   68 | 69 | - c. **DBT JOB ID Variable as follows;** 70 | - Key: datafest_meetup_job 71 | - Value: YOUR_CREATED_DBT_JOB_ID 72 | - Description: DATAFEST meetup Job ID 73 | 74 | 3. Turn on the two **fakestore_** dags and Trigger the Dag Named _**fakestore_elt_pipeline**_. If this Runs SuccessFully , the _**fakestore_dbt_job_pipeline**_ would automagically get triggered based on the dataset schedule. See more on [Airflow Datasets](https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/datasets.html). 75 | 76 | 77 | 4. Wait for the dbt dag to complete running and navigate to the dbt cloud UI to see that the dag was triggered via the API. For more notes on the operation of this dag, see [DbtCloudOperator](https://airflow.apache.org/docs/apache-airflow-providers-dbt-cloud/stable/operators.html). In More complex Setups, there are packages that can be used with dbt core to convert your entire dbt project into airflow tasks for easier management. An example is [Astronomer Cosmos](https://github.com/astronomer/astronomer-cosmos). 78 | 79 | Credits & Resources: 80 | =========================== 81 | 82 | The Structure of this project was adapted from the astronomer provided astro cli and created using astro dev init 83 | Docs are available at the following Links 84 | 85 | - [Apache Airflow](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/index.html) 86 | - [Astronomer](https://docs.astronomer.io/) 87 | - [DBT Cloud](https://docs.getdbt.com/) and [DBT-Cloud-Airflow Example](https://docs.getdbt.com/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud) 88 | 89 | LEARN AIRFLOW 90 | ================ 91 | - [Astronomer Academy](https://academy.astronomer.io/) 92 | 93 | LEARN DBT 94 | ============ 95 | - [DBT Learn Official Website](https://courses.getdbt.com/collections) 96 | - [DBT Crash Course - Radovan Bacovic](https://gitlab.com/rbacovic/dbt_tutorial/) 97 | 98 | 99 | The compilation of this project was inspired with ❤️ by the **dbt-lagos-community** 📦 . 100 | 101 | 102 | =========================== -------------------------------------------------------------------------------- /dags/example_dag_advanced.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from typing import Dict 3 | 4 | # Airflow operators are templates for tasks and encompass the logic that your DAG will actually execute. 5 | # To use an operator in your DAG, you first have to import it. 6 | # To learn more about operators, see: https://registry.astronomer.io/. 7 | 8 | from airflow.decorators import ( 9 | dag, 10 | task, 11 | ) # DAG and task decorators for interfacing with the TaskFlow API 12 | from airflow.models.baseoperator import ( 13 | chain, 14 | ) # A function that sets sequential dependencies between tasks including lists of tasks. 
15 | from airflow.operators.bash import BashOperator 16 | from airflow.operators.dummy import DummyOperator 17 | from airflow.operators.email import EmailOperator 18 | from airflow.operators.python import BranchPythonOperator 19 | from airflow.operators.weekday import BranchDayOfWeekOperator 20 | from airflow.utils.edgemodifier import ( 21 | Label, 22 | ) # Used to label node edges in the Airflow UI 23 | from airflow.utils.task_group import ( 24 | TaskGroup, 25 | ) # Used to group tasks together in the Graph view of the Airflow UI 26 | from airflow.utils.trigger_rule import ( 27 | TriggerRule, 28 | ) # Used to change how an Operator is triggered 29 | from airflow.utils.weekday import ( 30 | WeekDay, 31 | ) # Used to determine what day of the week it is 32 | 33 | 34 | """ 35 | This DAG is intended to demonstrate a number of core Apache Airflow concepts that are central to the pipeline 36 | authoring experience, including the TaskFlow API, Edge Labels, Jinja templating, branching, 37 | dynamic task generation, Task Groups, and Trigger Rules. 38 | 39 | First, this DAG checks if the current day is a weekday or weekend. Next, the DAG checks which day of the week 40 | it is. Lastly, the DAG prints out a bash statement based on which day it is. On Tuesday, for example, the DAG 41 | prints "It's Tuesday and I'm busy with studying". 42 | 43 | This DAG uses the following operators: 44 | 45 | BashOperator - 46 | Executes a bash script or bash command. 47 | 48 | See more info about this operator here: 49 | https://registry.astronomer.io/providers/apache-airflow/modules/bashoperator 50 | 51 | DummyOperator - 52 | Does nothing but can be used to group tasks in a DAG 53 | 54 | See more info about this operator here: 55 | https://registry.astronomer.io/providers/apache-airflow/modules/dummyoperator 56 | 57 | EmailOperator - 58 | Used to send emails 59 | 60 | See more info about this operator here: 61 | https://registry.astronomer.io/providers/apache-airflow/modules/emailoperator 62 | 63 | BranchPythonOperator - 64 | Allows a workflow to “branch” after a task based on the result of a Python function 65 | 66 | See more info about this operator here: 67 | https://registry.astronomer.io/providers/apache-airflow/modules/branchpythonoperator 68 | 69 | BranchDayOfWeekOperator - 70 | Branches into one of two lists of tasks depending on the current day 71 | 72 | See more info about this operator here: 73 | https://registry.astronomer.io/providers/apache-airflow/modules/branchdayofweekoperator 74 | """ 75 | 76 | # Reference data that defines "weekday" as well as the activity assigned to each day of the week. 77 | DAY_ACTIVITY_MAPPING = { 78 | "monday": {"is_weekday": True, "activity": "guitar lessons"}, 79 | "tuesday": {"is_weekday": True, "activity": "studying"}, 80 | "wednesday": {"is_weekday": True, "activity": "soccer practice"}, 81 | "thursday": {"is_weekday": True, "activity": "contributing to Airflow"}, 82 | "friday": {"is_weekday": True, "activity": "family dinner"}, 83 | "saturday": {"is_weekday": False, "activity": "going to the beach"}, 84 | "sunday": {"is_weekday": False, "activity": "sleeping in"}, 85 | } 86 | 87 | 88 | @task( 89 | multiple_outputs=True 90 | ) # multiple_outputs=True unrolls dictionaries into separate XCom values 91 | def _going_to_the_beach() -> Dict: 92 | return { 93 | "subject": "Beach day!", 94 | "body": "It's Saturday and I'm heading to the beach.
<br><br>Come join me!<br>
", 95 | } 96 | 97 | 98 | # This functions gets the activity from the "DAY_ACTIVITY_MAPPING" dictionary 99 | def _get_activity(day_name) -> str: 100 | activity_id = DAY_ACTIVITY_MAPPING[day_name]["activity"].replace(" ", "_") 101 | 102 | if DAY_ACTIVITY_MAPPING[day_name]["is_weekday"]: 103 | return f"weekday_activities.{activity_id}" 104 | 105 | return f"weekend_activities.{activity_id}" 106 | 107 | 108 | # When using the DAG decorator, the "dag" argument doesn't need to be specified for each task. 109 | # The "dag_id" value defaults to the name of the function it is decorating if not explicitly set. 110 | # In this example, the "dag_id" value would be "example_dag_advanced". 111 | @dag( 112 | # This DAG is set to run for the first time on June 11, 2021. Best practice is to use a static start_date. 113 | # Subsequent DAG runs are instantiated based on scheduler_interval below. 114 | start_date=datetime(2021, 6, 11), 115 | # This defines how many instantiations of this DAG (DAG Runs) can execute concurrently. In this case, 116 | # we're only allowing 1 DAG run at any given time, as opposed to allowing multiple overlapping DAG runs. 117 | max_active_runs=1, 118 | # This defines how often your DAG will run, or the schedule by which DAG runs are created. It can be 119 | # defined as a cron expression or custom timetable. This DAG will run daily. 120 | schedule_interval="@daily", 121 | # Default settings applied to all tasks within the DAG; can be overwritten at the task level. 122 | default_args={ 123 | "owner": "community", # This defines the value of the "owner" column in the DAG view of the Airflow UI 124 | "retries": 2, # If a task fails, it will retry 2 times. 125 | "retry_delay": timedelta( 126 | minutes=3 127 | ), # A task that fails will wait 3 minutes to retry. 128 | }, 129 | default_view="graph", # This defines the default view for this DAG in the Airflow UI 130 | # When catchup=False, your DAG will only run for the latest schedule interval. In this case, this means 131 | # that tasks will not be run between June 11, 2021 and 1 day ago. When turned on, this DAG's first run 132 | # will be for today, per the @daily schedule interval 133 | catchup=False, 134 | tags=["example"], # If set, this tag is shown in the DAG view of the Airflow UI 135 | ) 136 | def example_dag_advanced(): 137 | # DummyOperator placeholder for first task 138 | begin = DummyOperator(task_id="begin") 139 | # Last task will only trigger if no previous task failed 140 | end = DummyOperator(task_id="end", trigger_rule=TriggerRule.NONE_FAILED) 141 | 142 | # This task checks which day of the week it is 143 | check_day_of_week = BranchDayOfWeekOperator( 144 | task_id="check_day_of_week", 145 | week_day={WeekDay.SATURDAY, WeekDay.SUNDAY}, # This checks day of week 146 | follow_task_ids_if_true="weekend", # Next task if criteria is met 147 | follow_task_ids_if_false="weekday", # Next task if criteria is not met 148 | use_task_execution_day=True, # If True, uses task’s execution day to compare with is_today 149 | ) 150 | 151 | weekend = DummyOperator(task_id="weekend") # "weekend" placeholder task 152 | weekday = DummyOperator(task_id="weekday") # "weekday" placeholder task 153 | 154 | # Templated value for determining the name of the day of week based on the start date of the DAG Run 155 | day_name = "{{ dag_run.start_date.strftime('%A').lower() }}" 156 | 157 | # Begin weekday tasks. 
158 | # Tasks within this TaskGroup (weekday tasks) will be grouped together in the Airflow UI 159 | with TaskGroup("weekday_activities") as weekday_activities: 160 | which_weekday_activity_day = BranchPythonOperator( 161 | task_id="which_weekday_activity_day", 162 | python_callable=_get_activity, # Python function called when task executes 163 | op_args=[day_name], 164 | ) 165 | 166 | for day, day_info in DAY_ACTIVITY_MAPPING.items(): 167 | if day_info["is_weekday"]: 168 | day_of_week = Label(label=day) 169 | activity = day_info["activity"] 170 | 171 | # This task prints the weekday activity to bash 172 | do_activity = BashOperator( 173 | task_id=activity.replace(" ", "_"), 174 | bash_command=f"echo It's {day.capitalize()} and I'm busy with {activity}.", # This is the bash command to run 175 | ) 176 | 177 | # Declaring task dependencies within the "TaskGroup" via the classic bitshift operator. 178 | which_weekday_activity_day >> day_of_week >> do_activity 179 | 180 | # Begin weekend tasks 181 | # Tasks within this TaskGroup will be grouped together in the UI 182 | with TaskGroup("weekend_activities") as weekend_activities: 183 | which_weekend_activity_day = BranchPythonOperator( 184 | task_id="which_weekend_activity_day", 185 | python_callable=_get_activity, # Python function called when task executes 186 | op_args=[day_name], 187 | ) 188 | 189 | # Labels that will appear in the Graph view of the Airflow UI 190 | saturday = Label(label="saturday") 191 | sunday = Label(label="sunday") 192 | 193 | # This task prints the Sunday activity to bash 194 | sleeping_in = BashOperator( 195 | task_id="sleeping_in", bash_command="sleep $[ ( $RANDOM % 30 ) + 1 ]s" 196 | ) 197 | 198 | going_to_the_beach = _going_to_the_beach() # Calling the taskflow function 199 | 200 | # Because the "_going_to_the_beach()" function has "multiple_outputs" enabled, each dict key is 201 | # accessible as their own "XCom" key. 202 | inviting_friends = EmailOperator( 203 | task_id="inviting_friends", 204 | to="friends@community.com", # Email to send email to 205 | subject=going_to_the_beach["subject"], # Email subject 206 | html_content=going_to_the_beach["body"], # Eamil body content 207 | ) 208 | 209 | # Using "chain()" here for list-to-list dependencies which are not supported by the bitshift 210 | # operator and to simplify the notation for the desired dependency structure. 211 | chain( 212 | which_weekend_activity_day, 213 | [saturday, sunday], 214 | [going_to_the_beach, sleeping_in], 215 | ) 216 | 217 | # High-level dependencies between tasks 218 | chain( 219 | begin, 220 | check_day_of_week, 221 | [weekday, weekend], 222 | [weekday_activities, weekend_activities], 223 | end, 224 | ) 225 | 226 | # Task dependency created by XComArgs: 227 | # going_to_the_beach >> inviting_friends 228 | 229 | 230 | dag = example_dag_advanced() 231 | --------------------------------------------------------------------------------
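
For reference, a stripped-down sketch of the Dataset hand-off that connects `fakestore_elt_pipeline` to `fakestore_dbt_job_pipeline` above: the producer lists the Dataset as a task `outlet`, and the consumer uses the same Dataset URI as its `schedule`. The dag_ids below are placeholders; only the URI `//fakestore_dwh/tables` is taken from the real DAGs, and the sketch assumes an Airflow version with Dataset scheduling (2.4+), as already used in this project.

```python
# Stripped-down sketch of the data-aware scheduling used by the two fakestore DAGs.
# The dag_ids here are placeholders; the URI "//fakestore_dwh/tables" matches
# airflow_dbt_dag_1.py (task outlet) and airflow_dbt_dag_2.py (DAG schedule).
from datetime import datetime

from airflow import DAG, Dataset
from airflow.operators.empty import EmptyOperator

FAKESTORE_TABLES = Dataset("//fakestore_dwh/tables")

with DAG("elt_producer", start_date=datetime(2023, 6, 11), schedule="@daily", catchup=False):
    # completing this task marks the Dataset as updated
    EmptyOperator(task_id="end_pipeline", outlets=[FAKESTORE_TABLES])

with DAG("dbt_consumer", start_date=datetime(2023, 6, 11), schedule=[FAKESTORE_TABLES], catchup=False):
    # this DAG is triggered whenever the Dataset above is updated
    EmptyOperator(task_id="start_pipeline")
```
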