├── requirements.txt ├── .flake8 ├── data ├── expected │ ├── agg_sales_category.csv │ ├── stg_purchases_2020-01-01.csv │ ├── stg_products.csv │ └── products_sales.csv ├── products.csv └── purchases.csv ├── sql ├── sales │ ├── union_staging_to_products_sales.sql │ ├── delete_products_sales_exec_date.sql │ ├── clean_staging_tables.sql │ ├── agg_sales_category.sql │ └── join_purchases_with_products.sql └── init │ ├── create_products.sql │ ├── create_purchases.sql │ └── create_products_sales.sql ├── assets ├── images │ ├── diagram.png │ ├── our_dag.png │ ├── our_initial_dag.png │ ├── output_products.png │ ├── workflow_with_tests.png │ └── airflow_first_exec_error.png └── how-to │ ├── create-dag-using-tdd.md │ └── criar-dag-usando-tdd.md ├── .github └── workflows │ ├── monday-building.yml │ └── airflow.yml ├── .env ├── Makefile ├── docker-compose.yml ├── .gitignore ├── dags └── dag_sales_pipeline.py ├── tests └── test_sales_pipeline.py └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | SQLAlchemy 3 | pandas 4 | psycopg2-binary -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501 3 | exclude = 4 | __pycache__, 5 | -------------------------------------------------------------------------------- /data/expected/agg_sales_category.csv: -------------------------------------------------------------------------------- 1 | product_category,total_revenue_category 2 | T-Shirt,375.0 3 | Shoes,300.0 4 | -------------------------------------------------------------------------------- /sql/sales/union_staging_to_products_sales.sql: -------------------------------------------------------------------------------- 1 | insert into products_sales 2 | select * from stg_join_purchases_with_products; -------------------------------------------------------------------------------- /assets/images/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/HEAD/assets/images/diagram.png -------------------------------------------------------------------------------- /assets/images/our_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/HEAD/assets/images/our_dag.png -------------------------------------------------------------------------------- /sql/sales/delete_products_sales_exec_date.sql: -------------------------------------------------------------------------------- 1 | delete from products_sales where "purchase_date"::text = '{{ ds }}' -------------------------------------------------------------------------------- /assets/images/our_initial_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/HEAD/assets/images/our_initial_dag.png -------------------------------------------------------------------------------- /assets/images/output_products.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/HEAD/assets/images/output_products.png -------------------------------------------------------------------------------- /sql/sales/clean_staging_tables.sql:
-------------------------------------------------------------------------------- 1 | delete from stg_purchases; 2 | delete from stg_products; 3 | delete from stg_join_purchases_with_products; -------------------------------------------------------------------------------- /assets/images/workflow_with_tests.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/HEAD/assets/images/workflow_with_tests.png -------------------------------------------------------------------------------- /assets/images/airflow_first_exec_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/HEAD/assets/images/airflow_first_exec_error.png -------------------------------------------------------------------------------- /sql/init/create_products.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {tablename} ( 2 | product_id INTEGER, 3 | product_name TEXT, 4 | product_category TEXT 5 | ); -------------------------------------------------------------------------------- /data/products.csv: -------------------------------------------------------------------------------- 1 | product_id,product_name,product_category 2 | 220,Brand A T-Shirt,T-Shirt 3 | 222,Dri-FIT T-Shirt,T-Shirt 4 | 225,Brand N T-shirt,T-Shirt 5 | 227,Casual Shoes,Shoes 6 | 228,Generic Running Shoes,Shoes 7 | -------------------------------------------------------------------------------- /data/expected/stg_purchases_2020-01-01.csv: -------------------------------------------------------------------------------- 1 | purchase_id,purchase_date,user_id,product_id,unit_price,quantity,total_revenue 2 | 1,2020-01-01,111,222,150.0,2,300.0 3 | 2,2020-01-01,101,225,75,1,75 4 | 3,2020-01-01,153,228,300,1,300 5 | -------------------------------------------------------------------------------- /data/expected/stg_products.csv: -------------------------------------------------------------------------------- 1 | product_id,product_name,product_category 2 | 220,Brand A T-Shirt,T-Shirt 3 | 222,Dri-FIT T-Shirt,T-Shirt 4 | 225,Brand N T-shirt,T-Shirt 5 | 227,Casual Shoes,Shoes 6 | 228,Generic Running Shoes,Shoes 7 | -------------------------------------------------------------------------------- /sql/sales/agg_sales_category.sql: -------------------------------------------------------------------------------- 1 | drop table if exists agg_sales_category; 2 | 3 | create table if not exists agg_sales_category as ( 4 | select 5 | product_category, 6 | sum(total_revenue) as total_revenue_category 7 | from products_sales 8 | group by 1 9 | ) -------------------------------------------------------------------------------- /sql/init/create_purchases.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {tablename} ( 2 | purchase_id INTEGER, 3 | purchase_date TEXT, 4 | user_id INTEGER, 5 | product_id INTEGER, 6 | unit_price REAL, 7 | quantity INTEGER, 8 | total_revenue REAL 9 | ) -------------------------------------------------------------------------------- /sql/sales/join_purchases_with_products.sql: -------------------------------------------------------------------------------- 1 | create table if not exists stg_join_purchases_with_products as ( 2 | select 3 | t.*, 4 | p.product_name, 5 | p.product_category 6 | from stg_purchases t 7 | 
left join stg_products p 8 | on p.product_id = t.product_id 9 | ) -------------------------------------------------------------------------------- /data/expected/products_sales.csv: -------------------------------------------------------------------------------- 1 | purchase_id,purchase_date,user_id,product_id,unit_price,quantity,total_revenue,product_name,product_category 2 | 1,2020-01-01,111,222,150.0,2,300.0,Dri-FIT T-Shirt,T-Shirt 3 | 2,2020-01-01,101,225,75.0,1,75.0,Brand N T-shirt,T-Shirt 4 | 3,2020-01-01,153,228,300.0,1,300.0,Generic Running Shoes,Shoes 5 | -------------------------------------------------------------------------------- /.github/workflows/monday-building.yml: -------------------------------------------------------------------------------- 1 | name: MondayBuilding 2 | 3 | on: 4 | schedule: 5 | - cron: "0 10 * * 1" 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Create airflow env 13 | run: make setup 14 | - name: Run tests 15 | run: make testing 16 | -------------------------------------------------------------------------------- /sql/init/create_products_sales.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {tablename} ( 2 | purchase_id INTEGER, 3 | purchase_date TEXT, 4 | user_id INTEGER, 5 | product_id INTEGER, 6 | unit_price REAL, 7 | quantity INTEGER, 8 | total_revenue REAL, 9 | product_name TEXT, 10 | product_category TEXT 11 | ) -------------------------------------------------------------------------------- /.github/workflows/airflow.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | steps: 14 | 15 | - uses: actions/checkout@v2 16 | - name: Create airflow env 17 | run: make setup 18 | - name: Run tests 19 | run: make testing 20 | -------------------------------------------------------------------------------- /data/purchases.csv: -------------------------------------------------------------------------------- 1 | purchase_id,purchase_date,user_id,product_id,unit_price,quantity,total_revenue 2 | 1,2020-01-01,111,222,150.0,2,300.0 3 | 2,2020-01-01,101,225,75,1,75 4 | 3,2020-01-01,153,228,300,1,300 5 | 4,2020-01-10,111,227,500,1,500 6 | 5,2020-01-10,199,222,150,3,450 7 | 6,2020-01-10,182,220,35,4,140 8 | 7,2020-01-10,174,222,150,1,150 9 | 8,2020-01-15,166,227,500,1,500 10 | 9,2020-01-15,132,225,75,1,75 11 | 10,2020-01-15,188,220,35,10,350 12 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | EXECUTOR=SequentialExecutor 2 | REPO=apache/airflow:master-python3.7 3 | FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= 4 | AIRFLOW_API_BACKEND=airflow.api.auth.backend.default 5 | AIRFLOW_EXAMPLES=False 6 | 7 | POSTGRES_USER=root 8 | POSTGRES_PASSWORD=root 9 | POSTGRES_DB=airflow 10 | POSTGRES_PORT=5432 11 | POSTGRES_HOST=airflow-metastore 12 | 13 | SOURCE_HOST=oltp-db 14 | SOURCE_PORT=5432 15 | SOURCE_DB=oltp 16 | SOURCE_USER=root 17 | SOURCE_PASSWORD=root 18 | 19 | DEST_HOST=olap-db 20 | DEST_PORT=5432 21 | DEST_DB=olap 22 | DEST_USER=root 23 | DEST_PASSWORD=root -------------------------------------------------------------------------------- /Makefile: 
-------------------------------------------------------------------------------- 1 | include .env 2 | 3 | setup: 4 | docker-compose up -d --force-recreate --remove-orphans 5 | sleep 240 6 | docker exec airflow airflow users create --username admin --password admin --role Admin --firstname Ademir --lastname Junior --email admin@email.com 7 | docker exec airflow airflow connections add 'oltp' --conn-uri 'postgresql://root:root@oltp-db:5432/oltp' 8 | docker exec airflow airflow connections add 'olap' --conn-uri 'postgresql://root:root@olap-db:5432/olap' 9 | 10 | down: 11 | docker-compose down 12 | 13 | testing: 14 | docker exec airflow pytest -v 15 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Airflow is evolving rapidly and so maybe the docker and commands 2 | # used here may fail in the near future. The problem I face 3 | # now you have to wait a few minutes before you can send commands 4 | # for Airflow. 5 | 6 | # Running Airflow in Docker (official documentation) 7 | # http://apache-airflow-docs.s3-website.eu-central-1.amazonaws.com/docs/apache-airflow/latest/start/docker.html 8 | # I'm trying to implement this setup, but had problem 9 | 10 | # Github discussion about prod-ready docker-compose Airflow images 11 | # https://github.com/apache/airflow/issues/8605 12 | 13 | 14 | version: '3.6' 15 | 16 | services: 17 | 18 | airflow: 19 | image: ${REPO} 20 | container_name: airflow 21 | volumes: 22 | - ./dags:/opt/airflow/dags 23 | - ./sql:/opt/airflow/sql 24 | - ./tests:/opt/airflow/tests 25 | - ./data:/opt/airflow/data 26 | - ./requirements.txt:/opt/airflow/requirements.txt 27 | environment: 28 | - AIRFLOW_HOME=/opt/airflow 29 | - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} 30 | - AIRFLOW__CORE__EXECUTOR=${EXECUTOR} 31 | - AIRFLOW__CORE__LOAD_EXAMPLES=false 32 | - AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=false 33 | - AIRFLOW__API__AUTH_BACKEND=${AIRFLOW_API_BACKEND} 34 | ports: 35 | - 8080:8080 36 | command: 'bash -c "pip3 install -r requirements.txt && airflow db init && airflow webserver -D && airflow scheduler -D"' 37 | 38 | oltp-db: 39 | image: postgres:11.4-alpine 40 | restart: always 41 | container_name: oltp-db 42 | ports: 43 | - 54320:54320 44 | environment: 45 | - POSTGRES_USER=${SOURCE_USER} 46 | - POSTGRES_PASSWORD=${SOURCE_PASSWORD} 47 | - POSTGRES_DB=${SOURCE_DB} 48 | 49 | olap-db: 50 | image: postgres:11.4-alpine 51 | restart: always 52 | container_name: olap-db 53 | ports: 54 | - 54321:54321 55 | environment: 56 | - POSTGRES_USER=${DEST_USER} 57 | - POSTGRES_PASSWORD=${DEST_PASSWORD} 58 | - POSTGRES_DB=${DEST_DB} 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Airflow logs 2 | logs/ 3 | dags/sql 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | #.env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | -------------------------------------------------------------------------------- /dags/dag_sales_pipeline.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from airflow.providers.postgres.operators.postgres import PostgresOperator 5 | from airflow.operators.python import PythonOperator 6 | 7 | 8 | def transfer_oltp_olap(**kwargs): 9 | """Get records from OLTP and transfer to OLAP database""" 10 | dest_table = kwargs.get('dest_table') 11 | sql = kwargs.get('sql') 12 | params = kwargs.get('params') 13 | 14 | oltp_hook = PostgresHook(postgres_conn_id='oltp') 15 | olap_hook = PostgresHook(postgres_conn_id='olap') 16 | data_extracted = oltp_hook.get_records(sql=sql, parameters=params) 17 | olap_hook.insert_rows(dest_table, data_extracted, commit_every=1000) 18 | 19 | 20 | with DAG(dag_id='products_sales_pipeline', 21 | default_args={'owner': 'airflow'}, 22 | schedule_interval=None, 23 | start_date=days_ago(2), 24 | template_searchpath='/opt/airflow/sql/sales/', 25 | tags=['etl', 'analytics', 'sales']) as dag: 26 | 27 | execution_date = '{{ ds }}' 28 | 29 | load_incremental_purchases_data = PythonOperator( 30 | task_id='load_incremental_purchases', 31 | python_callable=transfer_oltp_olap, 32 | op_kwargs={ 33 | 
'dest_table': 'stg_purchases', 34 | 'sql': 'select * from purchases where "purchase_date" = %s', 35 | 'params': [execution_date], 36 | }) 37 | 38 | load_full_products_data = PythonOperator( 39 | task_id='load_full_products', 40 | python_callable=transfer_oltp_olap, 41 | op_kwargs={ 42 | 'dest_table': 'stg_products', 43 | 'sql': 'select * from products', 44 | }) 45 | 46 | delete_products_sales_exec_date = PostgresOperator( 47 | task_id='delete_products_sales_exec_date', 48 | postgres_conn_id='olap', 49 | sql='delete_products_sales_exec_date.sql' 50 | ) 51 | 52 | join_purchases_with_products = PostgresOperator( 53 | task_id='join_purchases_products', 54 | postgres_conn_id='olap', 55 | sql='join_purchases_with_products.sql' 56 | ) 57 | 58 | union_incremental_products_sales = PostgresOperator( 59 | task_id='union_staging_to_products_sales', 60 | postgres_conn_id='olap', 61 | sql='union_staging_to_products_sales.sql' 62 | ) 63 | 64 | agg_sales_category = PostgresOperator( 65 | task_id='rebuild_agg_sales_category', 66 | postgres_conn_id='olap', 67 | sql='agg_sales_category.sql' 68 | ) 69 | 70 | [load_full_products_data, load_incremental_purchases_data, delete_products_sales_exec_date] >> join_purchases_with_products 71 | join_purchases_with_products >> union_incremental_products_sales 72 | union_incremental_products_sales >> agg_sales_category 73 | -------------------------------------------------------------------------------- /tests/test_sales_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import pandas as pd 4 | from unittest import TestCase 5 | from pandas._testing import assert_frame_equal 6 | from airflow.providers.postgres.hooks.postgres import PostgresHook 7 | 8 | 9 | AIRFLOW_HOME = os.environ.get('AIRFLOW_HOME') 10 | 11 | 12 | def insert_initial_data(tablename, hook): 13 | """This script will populate database with initial data to run job""" 14 | conn_engine = hook.get_sqlalchemy_engine() 15 | sample_data = pd.read_csv(f'{AIRFLOW_HOME}/data/{tablename}.csv') 16 | sample_data.to_sql(name=tablename, con=conn_engine, if_exists='replace', index=False) 17 | 18 | 19 | def create_table(tablename, hook): 20 | filename = tablename.replace('stg_', '') 21 | sql_stmt = open(f'/opt/airflow/sql/init/create_{filename}.sql').read() 22 | hook.run(sql_stmt.format(tablename=tablename)) 23 | 24 | 25 | def output_expected_as_df(filename): 26 | return pd.read_csv(f'{AIRFLOW_HOME}/data/expected/{filename}.csv') 27 | 28 | 29 | def execute_dag(dag_id, execution_date): 30 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 31 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 32 | 33 | 34 | class TestDataTransfer(TestCase): 35 | 36 | def setUp(self): 37 | self.oltp_hook = PostgresHook('oltp') 38 | self.olap_hook = PostgresHook('olap') 39 | 40 | def test_validate_sales_pipeline(self): 41 | """Validate if Sales Pipeline DAG run correctly""" 42 | date = '2020-01-01' 43 | 44 | create_table('purchases', self.oltp_hook) 45 | insert_initial_data('purchases', self.oltp_hook) 46 | 47 | create_table('products', self.oltp_hook) 48 | insert_initial_data('products', self.oltp_hook) 49 | 50 | create_table('stg_purchases', self.olap_hook) 51 | create_table('stg_products', self.olap_hook) 52 | create_table('products_sales', self.olap_hook) 53 | 54 | execute_dag('products_sales_pipeline', date) 55 | 56 | stg_purchases_result = self.olap_hook.get_pandas_df('select * from 
stg_purchases') 57 | stg_purchases_expected = output_expected_as_df(f'stg_purchases_{date}') 58 | assert_frame_equal(stg_purchases_result, stg_purchases_expected) 59 | assert len(stg_purchases_result) == 3 60 | 61 | stg_products_result = self.olap_hook.get_pandas_df('select * from stg_products') 62 | stg_products_expected = output_expected_as_df('stg_products') 63 | assert_frame_equal(stg_products_result, stg_products_expected) 64 | assert len(stg_products_result) == 5 65 | 66 | product_sales_result = self.olap_hook.get_pandas_df('select * from products_sales') 67 | product_sales_expected = output_expected_as_df('products_sales') 68 | assert_frame_equal(product_sales_result, product_sales_expected) 69 | assert len(product_sales_result) == 3 70 | 71 | agg_result = self.olap_hook.get_pandas_df('select * from agg_sales_category') 72 | agg_expected = output_expected_as_df('agg_sales_category') 73 | assert_frame_equal(agg_result, agg_expected) 74 | assert len(agg_result) == 2 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Airflow DAG development with tests + CI workflow 2 | [![CI](https://github.com/marcosmarxm/airflow-testing-ci-workflow/workflows/CI/badge.svg?branch=master)](https://github.com/marcosmarxm/airflow-testing-ci-workflow/actions?query=workflow%3ACI) 3 | [![MondayBuilding](https://github.com/marcosmarxm/airflow-testing-ci-workflow/workflows/MondayBuilding/badge.svg)](https://github.com/marcosmarxm/airflow-testing-ci-workflow/actions?query=workflow%3AMondayBuilding) 4 | 5 | This code is complementary to the article [How to develop data pipeline in Airflow through TDD (test-driven development)](https://blog.magrathealabs.com/how-to-develop-data-pipeline-in-airflow-through-tdd-test-driven-development-c3333439f358). 6 | I suggest you read it to better understand the code and how I chose to set up the project. 7 | 8 | [Step-by-step: How to develop a DAG using TDD (english version)](assets/how-to/create-dag-using-tdd.md)
9 | [Passo-a-passo: Como desenvolver uma DAG usando TDD (portuguese version)](assets/how-to/criar-dag-usando-tdd.md) 10 | 11 | ## The project 12 | 13 | Below is a summary of what will be accomplished in this project. We'll simulate the transfer of some fake transaction data from an e-commerce store: a simple pipeline transferring data from the `oltp-db` database to the `olap-db` database. 14 | 15 | ![Diagram](./assets/images/diagram.png) 16 | 17 | To help with development, we use a local environment to build the pipeline with tests, plus a Continuous Integration pipeline with GitHub Actions to ensure the tests run on every change. 18 | 19 | **Containers** 20 | - **airflow**: container running the local Airflow setup for development; 21 | - **oltp-db** and **olap-db**: containers that simulate production databases and receive the fake data. 22 | 23 | In this tutorial we won't develop the dashboard part, only the pipeline. 24 | 25 | ### Dependencies? 26 | Docker, docker-compose and make. 27 | 28 | ### How to run? 29 | 30 | The command below will set up the environment using docker-compose. Wait a few minutes (240s, yeah omg right?) for Airflow to initialize its internal configuration; the command then creates the credentials and connections. 31 | ```bash 32 | make setup 33 | ``` 34 | By running the above command it is possible to access Airflow at `localhost:8080`. 35 | A test user is created (user: admin / password: admin). At this stage you can develop your DAGs and test them as you modify them. 36 | And finally, the command below calls `pytest` to run the tests. 37 | ```bash 38 | make testing 39 | ``` 40 | ![Containers](./assets/images/workflow_with_tests.png) 41 | --- 42 | 43 | Some resources about Airflow testing and DataOps: 44 | * [Pipelines on pipelines: Agile CI/CD workflows for Airflow DAGs @ Airflow Summit 2020](https://www.youtube.com/watch?v=tY4F9X5l6dg) 45 | * [Data Testing with Airflow](https://github.com/danielvdende/data-testing-with-airflow) 46 | * [Data's Inferno: 7 Circles of Data Testing Hell with Airflow](https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8) 47 | * [Testing and Debugging in Apache Airflow by GoDataDriven](https://godatadriven.com/blog/testing-and-debugging-apache-airflow/) 48 | * [The Challenge of Testing Data Pipelines](https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1) 49 | * [Automated Testing for Protecting Data Pipelines from Undocumented Assumptions](https://www.youtube.com/watch?v=z-kPgEAJCrA&ab_channel=Databricks) 50 | * [Why Great Data Engineering Needs Automated Testing](https://medium.com/weareservian/why-data-engineering-needs-automated-testing-a37a0844d7db) 51 | * [Testing in Airflow Part 1 - DAG validation tests, DAG definition tests and unit tests](https://blog.usejournal.com/testing-in-airflow-part-1-dag-validation-tests-dag-definition-tests-and-unit-tests-2aa94970570c) 52 | * [Testing in Airflow Part 2 - Integration Tests and e2e Pipeline Tests](https://medium.com/@chandukavar/testing-in-airflow-part-2-integration-tests-and-end-to-end-pipeline-tests-af0555cd1a82) 53 | -------------------------------------------------------------------------------- /assets/how-to/create-dag-using-tdd.md: -------------------------------------------------------------------------------- 1 | # Tutorial explaining how to develop a DAG using TDD 2 | 3 | ## Introduction 4 | 5 | In this tutorial we'll build the DAG requested in the project from the beginning. If you want to follow along, I advise you to clone the repository and check out the branch `tutorial`.
If you want to follow along I advise you to clone the repository and checkout to the branch `tutorial`. 6 | 7 | ```bash 8 | git clone git@github.com:marcosmarxm/airflow-testing-ci-workflow.git 9 | git checkout tutorial 10 | ``` 11 | 12 | To get the most out of it you must have a basic knowledge of **Airflow**, **python** and **pytest**. 13 | In case you don't know, I think ... as we build gradually, maybe you can go on researching and learning the concepts right away. 14 | 15 | Recalling the pipeline that we'll develop: 16 | 17 | ![Our DAG](../images/our_dag.png) 18 | 19 | Explaining each task: 20 | 21 | - **load_full_products**: deletes the old data and loads the `products` table completely every day. 22 | - **load_incremental_purchases**: due to the size of this table, an incremental load will be performed using the `execution_date` data parameter. 23 | - **join_purchase_products_as_product_sales_daily**: this intermediary task prepares the raw data (products and purchases) loaded from the `oltp` database to be stored in the` product_sales` results table that will be used by the analytics team. 24 | - **delete_products_sales_exec_date**: this task has the function of clearing the data from the `product_sales` result table at the beginning of the pipeline, thus ensuring that there will be no duplicate data (idempotency). 25 | - **union_staging_to_products_sales**: load the data from the staging `product_sales_daily` to the table with historical data ` product_sales`. 26 | - **rebuild_agg_sales_category**: the result of the table above already illustrates a standard consumption format for a data warehouse, this task illustrates a creation of a simplified "data mart". 27 | 28 | ## Let's start! 29 | 30 | First, let's get our development environment up and running. 31 | If you have doubts about the environment, I recommend reading the article [How to develop data pipeline in Airflow through TDD (test-driven development)](https://blog.magrathealabs.com/how-to-develop-data-pipeline-in-airflow-through-tdd-test-driven-development-c3333439f358). You can see the code in the file `Makefile`. 32 | 33 | ```bash 34 | make setup 35 | ``` 36 | 37 | This will take a few minutes. 38 | The Airflow 2.0 docker image with LocalExecutor is taking too long to do the initial setup. 39 | After the initial configuration, we sent some commands to Airflow: create a user, some connections (oltp-db and olap-db) and variables. 40 | 41 | We already have a diagram of the pipeline sketched. 42 | So let's think about how to build it now. 43 | We have also seen the data format and a small sample of it. 44 | With that we have the input to carry out the project's development. 45 | 46 | ## TASK: Full load product data 47 | 48 | The first task that we'll develop is `full_load_product_data`. 49 | It's intended to take data from the `products` table of the` oltp-db` database and transfer it to `olap-db`. 50 | First let's create our fake data to guide us. 51 | Create a file in the `/data` directory called `products.csv`. 52 | You can get the data from the [file provided as an example in the master branch](https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/master/data/products.csv). 
53 | Example below: 54 | 55 | |product_id|product_name |product_category| 56 | |----------|---------------------|----------------| 57 | |220 |Brand A T-Shirt |T-Shirt | 58 | |222 |Dri-FIT T-Shirt |T-Shirt | 59 | |225 |Brand N T-shirt |T-Shirt | 60 | |227 |Casual Shoes |Shoes | 61 | |228 |Generic Running Shoes|Shoes | 62 | 63 | After that, we'll start the development of DAG using the TDD methodology. 64 | We need to create a test, run it, and it'll fail. 65 | Next we'll code to correct the error. 66 | Entering a looping between test/error/code correction until the pipeline ends. 67 | The advantages are: 68 | 69 | - quick feedback of the problem, we'll have only one error to solve; 70 | - gradual building our code certifying that it works. 71 | 72 | Create a file in the directory `/tests` called `test_sales_pipeline.py`. 73 | 74 | ```python 75 | # test_sales_pipeline.py 76 | class TestSalesPipeline: 77 | 78 | def test_validate_sales_pipeline(self): 79 | assert len(olap_product_size) == 5 80 | ``` 81 | 82 | **Reflecting with myself**: The purpose of this task is to compare the data that will be in the `olap-db` database in the `products` table with the sample data `/data/products.csv`... 83 | 84 | `olap_product_size`: it is the variable that I am planning to receive the values that should be transferred, it is likely a list with values or a pandas dataframe. 85 | Let's start with the most basic possible: 86 | 87 | * Compare our `olap_product_size` result and see if it has all the items we expect it to have. As we can see in the sample data `/data/products.csv` we have 5 entries, so we want to compare the size of `olap_product_size` with 5. 88 | 89 | We can run our test for the first time using the command: 90 | 91 | ```bash 92 | make testing 93 | ``` 94 | 95 | It'll result in an error. The variable `olap_product_size` doesn't exist. 96 | In the project, this variable must retrieve the data from the `olap-db` database in the `products` table. 97 | So, we need to create a connection with the `olap-db` database and retrieve these records. 98 | 99 | Since we are using Airflow, we'll use **Hooks** that have several methods to interact with databases. 100 | As we have already configured the container `olap-db` and the connection of Airflow to it in the setup, it'll be very simple to complete this step. 101 | We will use **PostgresHook**, if you want to know more about Hooks you can access [Airflow's documentation](https://airflow.apache.org/docs/apache-airflow/stable/concepts.html?highlight=hook#hooks) 102 | 103 | ```python 104 | from airflow.providers.postgres.hooks.postgres import PostgresHook 105 | 106 | class TestSalesPipeline: 107 | 108 | def test_validate_sales_pipeline(self): 109 | olap_hook = PostgresHook('olap') 110 | olap_product_size = olap_hook.get_records( 111 | 'select * from products' 112 | ) 113 | assert len(olap_product_size) == 5 114 | ``` 115 | 116 | We imported **PostgresHook** and created the hook for the `olap-db` database. 117 | This hook has a method that can execute an SQL query and return its values. 118 | After editing the test file as shown above, we can run `make testing` again. 119 | We'll receive the error that the `products` table doesn't exist in the `olap-db` database. 120 | 121 | > **Attention Point** Here comes an important consideration about tests. 122 | > Our pipeline is responsible for transferring data and not creating these tables. 123 | > So it is part of the test to configure this setup of the tables. 
124 | 125 | ```python 126 | from airflow.providers.postgres.hooks.postgres import PostgresHook 127 | 128 | class TestSalesPipeline: 129 | 130 | def test_validate_sales_pipeline(self): 131 | olap_hook = PostgresHook('olap') 132 | olap_hook.run(''' 133 | CREATE TABLE IF NOT EXISTS products ( 134 | product_id INTEGER, 135 | product_name TEXT, 136 | product_category TEXT 137 | ); 138 | ''') 139 | olap_product_size = olap_hook.get_records( 140 | 'select * from products' 141 | ) 142 | assert len(olap_product_size) == 5 143 | ``` 144 | 145 | The `.run(sql statement)` command performs an SQL query on the database. It is similar to the `.get_records` we saw before, however it's for when we do not want the return data. 146 | In the example above, it'll create the table `products` with the necessary columns according to our sample data from `/data/products.csv`. 147 | 148 | We ran the tests again and now the error we have is that there is a difference between `olap_product_size` and the value we expect to equal 5. 149 | At this stage we have reached the need to start our DAG because we have already configured our test initially. 150 | We'll create a file called `dag_sales_pipeline.py` inside the `/dags` directory. 151 | 152 | ```python 153 | from airflow import DAG 154 | from airflow.utils.dates import days_ago 155 | 156 | with DAG(dag_id='products_sales_pipeline', 157 | default_args={'owner': 'airflow'}, 158 | schedule_interval=None, 159 | start_date=days_ago(2)) as dag: 160 | ``` 161 | 162 | The above code just instantiates a new DAG. 163 | We need to think now about how to solve our problem. 164 | We need a function that transfers data from the `oltp-db` database to `olap-db`. 165 | We have already seen that **Hooks** in Airflow have methods that can help us: execute a sql and get the data (`get_records`), execute a sql without returning the data (`run`), among other interactions with the database (`insert_rows`, etc). 166 | 167 | ```python 168 | from airflow import DAG 169 | from airflow.utils.dates import days_ago 170 | from airflow.providers.postgres.hooks.postgres import PostgresHook 171 | from airflow.operators.python import PythonOperator 172 | 173 | 174 | def transfer_oltp_olap(**kwargs): 175 | """Get records from OLTP and transfer to OLAP database""" 176 | dest_table = kwargs.get('dest_table') 177 | sql = kwargs.get('sql') 178 | 179 | oltp_hook = PostgresHook(postgres_conn_id='oltp') 180 | olap_hook = PostgresHook(postgres_conn_id='olap') 181 | 182 | data_extracted = oltp_hook.get_records(sql=sql) 183 | olap_hook.insert_rows(dest_table, data_extracted, commit_every=1000) 184 | 185 | 186 | with DAG(dag_id='products_sales_pipeline', 187 | default_args={'owner': 'airflow'}, 188 | schedule_interval=None, 189 | start_date=days_ago(2)) as dag: 190 | 191 | load_full_products_data = PythonOperator( 192 | task_id='load_full_products', 193 | python_callable=transfer_oltp_olap, 194 | op_kwargs={ 195 | 'dest_table': 'products', 196 | 'sql': 'select * from products', 197 | }) 198 | ``` 199 | 200 | Explaining what was accomplished: 201 | 202 | 1. We created the **task** `load_full_products_data`, which is a PythonOperator. An **Operator** is a concept in Airflow that can invoke basic/standardized commands. For example **PythonOperator** calls functions in `python` and **PostgresOperator** can execute SQL queries but cannot transfer data from one database to another. 
For more information, I recommend reading the [documentation](https://airflow.apache.org/docs/apache-airflow/stable/concepts.html?highlight=hook#operators). 203 | 2. We created the function `transfer_oltp_olap`; it creates the two hooks and moves the data collected from the `oltp-db` database into `olap-db`. Why didn't we use **PostgresOperator**? The reason is that the operator can only execute queries against the database it is associated with; it does not transfer data between databases. That's why we use hooks. _kwargs_ is an Airflow convention to pass arguments in functions called by **PythonOperator**. 204 | 205 | > **Explaining**: the `load_full_products_data` task uses Airflow's own resources (memory, CPU) to execute. 206 | Nothing wrong with that, but in a more realistic scenario you would probably call an external service (a Spark/Hadoop cluster, etc.) to carry out this transfer or transformation. 207 | 208 | After completing the DAG we can access Airflow at http://localhost:8080, using the credentials *admin/admin*, and verify that we have our first DAG there! 209 | 210 | ![Our initial DAG](../images/our_initial_dag.png) 211 | 212 | When it is triggered in the Airflow UI, the following error will show up in the logs: 213 | 214 | ![Our initial DAG](../images/airflow_first_exec_error.png) 215 | 216 | We'll evaluate the Airflow log and identify that the table `products` was not found in the database `oltp-db`. 217 | It is the same situation as the previous one: we need to create this table in our test function. 218 | So let's change `test_sales_pipeline.py` again. 219 | 220 | ```python 221 | from airflow.providers.postgres.hooks.postgres import PostgresHook 222 | 223 | class TestSalesPipeline: 224 | 225 | def test_validate_sales_pipeline(self): 226 | oltp_hook = PostgresHook('oltp') 227 | oltp_hook.run(''' 228 | CREATE TABLE IF NOT EXISTS products ( 229 | product_id INTEGER, 230 | product_name TEXT, 231 | product_category TEXT 232 | ); 233 | ''') 234 | 235 | olap_hook = PostgresHook('olap') 236 | olap_hook.run(''' 237 | CREATE TABLE IF NOT EXISTS products ( 238 | product_id INTEGER, 239 | product_name TEXT, 240 | product_category TEXT 241 | ); 242 | ''') 243 | 244 | olap_product_size = olap_hook.get_records( 245 | 'select * from products' 246 | ) 247 | assert len(olap_product_size) == 5 248 | ``` 249 | 250 | We created the hook to access `oltp-db` and created the `products` table in it. 251 | We ran the test and got the same error: the size is different from 5. 252 | However, now we have our DAG and the `products` table in both databases. 253 | If we execute the DAG in the Airflow UI it will run successfully. 254 | Now we need to have it run during our test. 255 | 256 | Airflow offers several commands through its **CLI** (commands executed in the terminal). The command `airflow dags backfill -s <start_date> <dag_id>` allows you to trigger a DAG for a specific date (see the [documentation](https://airflow.apache.org/docs/apache-airflow/stable/dag-run.html#backfill)). 257 | This command is perfect for our case. 258 | 259 | We can execute this command in the terminal... so we'll take advantage of Python and execute it using the _subprocess_ library.
260 | 261 | ```python 262 | import subprocess 263 | from airflow.providers.postgres.hooks.postgres import PostgresHook 264 | 265 | def execute_dag(dag_id, execution_date): 266 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 267 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 268 | 269 | class TestSalesPipeline: 270 | 271 | def test_validate_sales_pipeline(self): 272 | oltp_hook = PostgresHook('oltp') 273 | oltp_hook.run(''' 274 | CREATE TABLE IF NOT EXISTS products ( 275 | product_id INTEGER, 276 | product_name TEXT, 277 | product_category TEXT 278 | ); 279 | ''') 280 | 281 | olap_hook = PostgresHook('olap') 282 | olap_hook.run(''' 283 | CREATE TABLE IF NOT EXISTS products ( 284 | product_id INTEGER, 285 | product_name TEXT, 286 | product_category TEXT 287 | ); 288 | ''') 289 | date = '2020-01-01' 290 | execute_dag('products_sales_pipeline', date) 291 | 292 | olap_product_size = olap_hook.get_records( 293 | 'select * from products' 294 | ) 295 | assert len(olap_product_size) == 5 296 | ``` 297 | 298 | We created a function to help us invoke the execution of the DAG during the test. 299 | So when we run `make testing` it'll automatically run the DAG with the date we passed, in the case` 2020-01-01`. 300 | 301 | The test will return **FAILED**. 302 | 303 | We've already created the two tables, however the `oltp-db` database has no records. 304 | We need to be able to insert the fake data into it. 305 | We've created the file `/data/products.csv`, but we need to transport its data into `oltp-db`. 306 | The simplest way that comes to mind is to read the *csv* file using the _pandas_ library and transfer the data to the database using the _pandas_ API. 307 | 308 | ```python 309 | import subprocess 310 | import pandas as pd 311 | from airflow.providers.postgres.hooks.postgres import PostgresHook 312 | 313 | 314 | def execute_dag(dag_id, execution_date): 315 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 316 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 317 | 318 | 319 | class TestSalesPipeline: 320 | 321 | def test_validate_sales_pipeline(self): 322 | oltp_hook = PostgresHook('oltp') 323 | oltp_hook.run(''' 324 | CREATE TABLE IF NOT EXISTS products ( 325 | product_id INTEGER, 326 | product_name TEXT, 327 | product_category TEXT 328 | ); 329 | ''') 330 | 331 | oltp_conn = oltp_hook.get_sqlalchemy_engine() 332 | sample_data = pd.read_csv('./data/products.csv') 333 | sample_data.to_sql( 334 | name='products', # name of sql table 335 | con=oltp_conn, # SQLalchemy connection 336 | if_exists='replace', # refresh data if run again 337 | index=False # don't want the pandas index inside db table 338 | ) 339 | 340 | olap_hook = PostgresHook('olap') 341 | olap_hook.run(''' 342 | CREATE TABLE IF NOT EXISTS products ( 343 | product_id INTEGER, 344 | product_name TEXT, 345 | product_category TEXT 346 | ); 347 | ''') 348 | date = '2020-01-01' 349 | execute_dag('products_sales_pipeline', date) 350 | 351 | olap_product_size = olap_hook.get_records( 352 | 'select * from products' 353 | ) 354 | assert len(olap_product_size) == 5 355 | ``` 356 | 357 | As I mentioned, Airflow hooks have several methods that help in communication and operations with database. In this case we easily create an *SQLAlchemy engine* for _pandas_ to send the data from *csv* to the `products` table in the` oltp-db` database. 358 | 359 | Now, a moment of tension... 
we execute `make testing` again... AND OUR **TEST PASSED!** 360 | 361 | The 1 warning is due to the Airflow hook using the old Postgresql connection format. 362 | 363 | ```text 364 | ======================== 1 passed, 1 warning in 11.06s ========================= 365 | ``` 366 | 367 | To really check, we can access the `olap-db` database through the command in the terminal: 368 | 369 | ```bash 370 | docker exec -ti olap-db psql -U root olap 371 | ``` 372 | 373 | and then running `select * from products;` we'll get the following result. 374 | 375 | ![Our first passed test output](../images/output_products.png) 376 | 377 | Very well! Finally we have our DAG doing the first task as we expect. 378 | We now need to develop the next tasks. 379 | As we already have the foundation built, it'll be faster and simpler to carry out the next tasks. 380 | 381 | > **Attention point: we created a very simplistic test**. 382 | 383 | It would be better to do a comparison that guarantees the result of the DAG is compatible with the data that we really expect. 384 | In this task, we want the data in the `products` table in the `olap-db` database to be the same as the `/data/products.csv` file. Let's do this now. 385 | 386 | ```python 387 | import subprocess 388 | import pandas as pd 389 | from pandas._testing import assert_frame_equal 390 | from airflow.providers.postgres.hooks.postgres import PostgresHook 391 | 392 | 393 | def execute_dag(dag_id, execution_date): 394 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 395 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 396 | 397 | 398 | class TestSalesPipeline: 399 | 400 | def test_validate_sales_pipeline(self): 401 | oltp_hook = PostgresHook('oltp') 402 | oltp_hook.run(''' 403 | CREATE TABLE IF NOT EXISTS products ( 404 | product_id INTEGER, 405 | product_name TEXT, 406 | product_category TEXT 407 | ); 408 | ''') 409 | oltp_conn = oltp_hook.get_sqlalchemy_engine() 410 | sample_data = pd.read_csv('./data/products.csv') 411 | sample_data.to_sql('products', con=oltp_conn, if_exists='replace', index=False) 412 | 413 | olap_hook = PostgresHook('olap') 414 | olap_hook.run(''' 415 | CREATE TABLE IF NOT EXISTS products ( 416 | product_id INTEGER, 417 | product_name TEXT, 418 | product_category TEXT 419 | ); 420 | ''') 421 | date = '2020-01-01' 422 | execute_dag('products_sales_pipeline', date) 423 | 424 | # Renaming the variable to size 425 | olap_product_size = olap_hook.get_records( 426 | 'select * from products' 427 | ) 428 | assert len(olap_product_size) == 5 429 | 430 | # New test! 431 | olap_product_data = olap_hook.get_pandas_df('select * from products') 432 | assert_frame_equal(olap_product_data, sample_data) 433 | ``` 434 | 435 | 1. we import `from pandas._testing import assert_frame_equal` to help us compare a pandas dataframe. 436 | 2. we created the variable `olap_product_data` using the hook again but now returning the database data as a dataframe. 437 | 3. as we had already loaded the data from the file `/data/products.csv` into the variable `sample_data` it made it easier to perform the comparison. 438 | 439 | Now finally a test that really compares if what we hope is running. 440 | 441 | **Now we need to stop, think and breathe.** 442 | 443 | Looking at the next task (`load_incremental_purchases`) it'll have the same steps. 444 | In the test code there are several parts that can be refactored, modularized in functions so that they can be reused for the next task. 
445 | Lets do this. The activities to be carried out: 446 | 447 | * transfer the sql commands to files leaving the code more organized; 448 | * we will create specific files for the **expected** result that we'll compare with the result of the tasks. 449 | * the function (`create_table`) that creates a table in a given database, it receives the name of the sql file (described in the item above) which will also be the name of the table and the database hook to perform the operation; 450 | * the function (`insert_initial_data`) that inserts the initial data into the specified table; 451 | * the function (`output_expected_as_df`) that takes the expected data to compare with the result of the DAG. 452 | 453 | First let's put the scripts for creating the tables in files. 454 | Create a file in the path and named: `/sql/init/create_products.sql` 455 | 456 | ```sql 457 | CREATE TABLE IF NOT EXISTS products ( 458 | product_id INTEGER, 459 | product_name TEXT, 460 | product_category TEXT 461 | ); 462 | ``` 463 | 464 | Create a `expected` folder within `/data`. 465 | In this case we will just duplicate the `products.csv` file into it. 466 | 467 | After these steps we are back to editing our test. 468 | 469 | ```python 470 | import subprocess 471 | import pandas as pd 472 | from pandas._testing import assert_frame_equal 473 | from airflow.providers.postgres.hooks.postgres import PostgresHook 474 | 475 | 476 | def insert_initial_data(tablename, hook): 477 | """This script will populate database with initial data to run job""" 478 | conn_engine = hook.get_sqlalchemy_engine() 479 | sample_data = pd.read_csv(f'/opt/airflow/data/{tablename}.csv') 480 | sample_data.to_sql(name=tablename, con=conn_engine, if_exists='replace', index=False) 481 | 482 | 483 | def create_table(tablename, hook): 484 | sql_stmt = open(f'/opt/airflow/sql/init/create_{tablename}.sql').read() 485 | hook.run(sql_stmt) 486 | 487 | 488 | def output_expected_as_df(filename): 489 | return pd.read_csv(f'/opt/airflow/data/expected/{filename}.csv') 490 | 491 | 492 | def execute_dag(dag_id, execution_date): 493 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 494 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 495 | 496 | 497 | class TestSalesPipeline: 498 | 499 | def test_validate_sales_pipeline(self): 500 | oltp_hook = PostgresHook('oltp') 501 | create_table('products', oltp_hook) 502 | insert_initial_data('products', oltp_hook) 503 | 504 | olap_hook = PostgresHook('olap') 505 | create_table('products', olap_hook) 506 | 507 | date = '2020-01-01' 508 | execute_dag('products_sales_pipeline', date) 509 | 510 | olap_product_size = olap_hook.get_records('select * from products') 511 | assert len(olap_product_size) == 5 512 | 513 | expected_product_data = output_expected_as_df('products') 514 | olap_product_data = olap_hook.get_pandas_df('select * from products') 515 | assert_frame_equal(olap_product_data, expected_product_data) 516 | 517 | ``` 518 | 519 | Our refactored test with functions that will be reused in the next steps. 520 | It is much more readable separated into functions. 521 | Take time and study the changes that have occurred. 522 | This will help a lot those who are starting to understand and pioneer this refactoring process. (If you have a cruel question you can send me a message about this) 523 | 524 | ## TASK: Load incremental purchases 525 | 526 | Let's start the next task! 
527 | The only difference from it to the previous one is that it'll have a condition in the data load. 528 | We should only load the data for the execution day, `execution_date`. 529 | First let's create our file with fake data. 530 | Create the `purchases.csv` file inside the `/data` directory. 531 | You can get the data from the [file provided as an example in the master branch](https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/master/data/purchases.csv). 532 | 533 | |purchase_id|purchase_date|user_id|product_id|unit_price|quantity|total_revenue| 534 | |--------------|-------------|-------|----------|----------|--------|-------------| 535 | |1 |2020-01-01 |111 |222 |150.0 |2 |300.0 | 536 | |2 |2020-01-01 |101 |225 |75 |1 |75 | 537 | |3 |2020-01-01 |153 |228 |300 |1 |300 | 538 | |4 |2020-01-10 |111 |227 |500 |1 |500 | 539 | |5 |2020-01-10 |199 |222 |150 |3 |450 | 540 | |6 |2020-01-10 |182 |220 |35 |4 |140 | 541 | |7 |2020-01-10 |174 |222 |150 |1 |150 | 542 | |8 |2020-01-15 |166 |227 |500 |1 |500 | 543 | |9 |2020-01-15 |132 |225 |75 |1 |75 | 544 | |10 |2020-01-15 |188 |220 |35 |10 |350 | 545 | 546 | Below is our test class (the other functions and imports in the file have been omitted to decrease the size). 547 | We have started a new test phase again. 548 | 549 | ```python 550 | class TestSalesPipeline: 551 | 552 | def test_validate_sales_pipeline(self): 553 | oltp_hook = PostgresHook('oltp') 554 | olap_hook = PostgresHook('olap') 555 | 556 | create_table('products', oltp_hook) 557 | create_table('products', olap_hook) 558 | insert_initial_data('products', oltp_hook) 559 | 560 | date = '2020-01-01' 561 | execute_dag('products_sales_pipeline', date) 562 | 563 | # Test load_full_products task 564 | olap_products_size = olap_hook.get_records('select * from products') 565 | assert len(olap_products_size) == 5 566 | 567 | expected_products_data = output_expected_as_df('products') 568 | olap_products_data = olap_hook.get_pandas_df('select * from products') 569 | assert_frame_equal(olap_products_data, expected_products_data) 570 | 571 | # New code! 572 | # Test load_incremental_purchases 573 | olap_purchases_size = olap_hook.get_records('select * from purchases') 574 | assert len(olap_purchases_size) == 3 575 | ``` 576 | 577 | The column of data that corresponds to time is called `purchase_date`. 578 | So if we analyze the sample data we have only 3 entries for date `2020-01-01`. 579 | This date we are already using when we call our DAG, variable `date = '2020-01-01'`. 580 | 581 | I will anticipate some steps that we have already taken with the previous DAG. I will create the `purchases` table in both databases using the file `sql/init/create_purchases.sql`: 582 | 583 | ```sql 584 | CREATE TABLE IF NOT EXISTS purchases ( 585 | purchase_id INTEGER, 586 | purchase_date TEXT, 587 | user_id INTEGER, 588 | product_id INTEGER, 589 | unit_price REAL, 590 | quantity INTEGER, 591 | total_revenue REAL 592 | ) 593 | ``` 594 | 595 | Then, populate the `oltp-db` database with the fake data we created. 
The following lines have been included: 596 | 597 | ```python 598 | # test_sales_pipeline 599 | class TestSalesPipeline: 600 | 601 | def test_validate_sales_pipeline(self): 602 | oltp_hook = PostgresHook('oltp') 603 | olap_hook = PostgresHook('olap') 604 | 605 | create_table('products', oltp_hook) 606 | create_table('products', olap_hook) 607 | insert_initial_data('products', oltp_hook) 608 | 609 | create_table('purchases', oltp_hook) 610 | create_table('purchases', olap_hook) 611 | insert_initial_data('purchases', oltp_hook) 612 | ``` 613 | 614 | Let's add the new task to DAG. 615 | 616 | ```python 617 | from airflow import DAG 618 | from airflow.utils.dates import days_ago 619 | from airflow.providers.postgres.hooks.postgres import PostgresHook 620 | from airflow.operators.python import PythonOperator 621 | 622 | 623 | def transfer_oltp_olap(**kwargs): 624 | """Get records from OLTP and transfer to OLAP database""" 625 | dest_table = kwargs.get('dest_table') 626 | sql = kwargs.get('sql') 627 | params = kwargs.get('params') 628 | 629 | oltp_hook = PostgresHook(postgres_conn_id='oltp') 630 | olap_hook = PostgresHook(postgres_conn_id='olap') 631 | 632 | data_extracted = oltp_hook.get_records(sql=sql, parameters=params) 633 | olap_hook.insert_rows(dest_table, data_extracted, commit_every=1000) 634 | 635 | 636 | with DAG(dag_id='products_sales_pipeline', 637 | default_args={'owner': 'airflow'}, 638 | schedule_interval=None, 639 | start_date=days_ago(2)) as dag: 640 | 641 | execution_date = '{{ ds }}' 642 | 643 | load_full_products_data = PythonOperator( 644 | task_id='load_full_products', 645 | python_callable=transfer_oltp_olap, 646 | op_kwargs={ 647 | 'dest_table': 'products', 648 | 'sql': 'select * from products', 649 | }) 650 | 651 | load_incremental_purchases_data = PythonOperator( 652 | task_id='load_incremental_purchases', 653 | python_callable=transfer_oltp_olap, 654 | op_kwargs={ 655 | 'dest_table': 'purchases', 656 | 'sql': 'select * from purchases where "purchase_date" = %s', 657 | 'params': [execution_date] 658 | }) 659 | ``` 660 | 661 | A new PythonOperator task called `load_incremental_purchases_data` has been created. It reuses the function `transfer_oltp_olap` created previously. 662 | The only differences were the `where purchase_data =% s` clause and the editing of the function to receive the extra parameter in the query. 663 | The `{{ds}}` syntax is an Airflow convention for accessing context variables. 664 | There are several variables that can be accessed within the context of DAG. 665 | It's kind of obscure at first, for more information read the documentation about [Macros Reference](https://airflow.apache.org/docs/apache-airflow/stable/macros-ref.html). 666 | 667 | We can run our tests now. 668 | Our second task is completed. 669 | Again we can increase our test to better serve the project. 670 | 671 | In this case, we will create the file with the expected data. 672 | Instead of copying the whole `purchases.csv` file as with product data, we will now only need a subset relevant to the tests. 673 | Create a new file called `purchases_2020-01-01.csv` inside the` expected` folder. 674 | 675 | |purchase_id|purchase_date|user_id|product_id|unit_price|quantity|total_revenue| 676 | |--------------|-------------|-------|----------|----------|--------|-------------| 677 | |1 |2020-01-01 |111 |222 |150.0 |2 |300.0 | 678 | |2 |2020-01-01 |101 |225 |75 |1 |75 | 679 | |3 |2020-01-01 |153 |228 |300 |1 |300 | 680 | 681 | It will only have data from day 2020-01-01. 
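If you prefer not to type this expected file by hand, a small helper script can derive it from `data/purchases.csv` by filtering on the execution date. This is only a sketch: it is not part of the repository, and the filename `generate_expected_purchases.py` is just a suggestion.

```python
# generate_expected_purchases.py -- helper sketch, not part of the repo.
# Derives data/expected/purchases_<date>.csv from data/purchases.csv by
# keeping only the rows whose purchase_date matches the execution date.
import pandas as pd

date = "2020-01-01"

purchases = pd.read_csv("data/purchases.csv")
expected = purchases[purchases["purchase_date"] == date]

expected.to_csv(f"data/expected/purchases_{date}.csv", index=False)
print(f"wrote {len(expected)} rows for {date}")
```

Run it from the project root so the relative paths resolve, and double-check the generated numbers against the table above (pandas may write some integer columns as floats).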
682 | Now let's edit our test function.
683 | 
684 | ```python
685 | # test_sales_pipeline.py
686 | 
687 | # old
688 | olap_purchases_size = olap_hook.get_records('select * from purchases')
689 | assert len(olap_purchases_size) == 3
690 | 
691 | # new
692 | purchase_data = olap_hook.get_pandas_df('select * from purchases')
693 | purchase_size = len(purchase_data)
694 | purchase_expected = output_expected_as_df(f'purchases_{date}')
695 | assert_frame_equal(purchase_data, purchase_expected)
696 | assert purchase_size == 3
697 | ```
698 | 
699 | With that, we have completed the second task.
700 | We have reached the stage of the project where the data extraction and loading tasks are finished.
701 | The next tasks will only involve the `olap-db` database.
702 | Now we are going to use another Airflow feature.
703 | 
704 | ## TASK: join_purchases_products
705 | 
706 | The objective of this task is to join the two tables created previously.
707 | We go back to our test file and create the new test for the `join_purchases_products` table.
708 | 
709 | ```python
710 | # test_sales_pipeline.py
711 | # ...
712 | purchase_data = olap_hook.get_pandas_df('select * from purchases')
713 | purchase_size = len(purchase_data)
714 | purchase_expected = output_expected_as_df(f'purchases_{date}')
715 | assert_frame_equal(purchase_data, purchase_expected)
716 | assert purchase_size == 3
717 | 
718 | # Test join_purchases_products
719 | purchases_products_size = olap_hook.get_pandas_df('select * from join_purchases_products')
720 | assert len(purchases_products_size) == 3
721 | ```
722 | 
723 | Why do we expect the result to be 3?
724 | In this join we take the loaded purchases and do a `left` join against the products table, so the result keeps one row per purchase and its size stays at 3.
725 | 
726 | With the test in place, we can edit the DAG.
727 | 
728 | ```python
729 | from airflow import DAG
730 | from airflow.utils.dates import days_ago
731 | from airflow.providers.postgres.hooks.postgres import PostgresHook
732 | from airflow.providers.postgres.operators.postgres import PostgresOperator
733 | from airflow.operators.python import PythonOperator
734 | 
735 | def transfer_oltp_olap(**kwargs):
736 |     ...  # unchanged at this point
737 | 
738 | with DAG(dag_id='products_sales_pipeline',
739 |          default_args={'owner': 'airflow'},
740 |          schedule_interval=None,
741 |          template_searchpath='/opt/airflow/sql/sales/',
742 |          start_date=days_ago(2)) as dag:
743 | 
744 |     execution_date = '{{ ds }}'
745 | 
746 |     load_full_products_data = PythonOperator(
747 |         task_id='load_full_products',
748 |         python_callable=transfer_oltp_olap,
749 |         op_kwargs={
750 |             'dest_table': 'products',
751 |             'sql': 'select * from products',
752 |         })
753 | 
754 |     load_incremental_purchases_data = PythonOperator(
755 |         task_id='load_incremental_purchases',
756 |         python_callable=transfer_oltp_olap,
757 |         op_kwargs={
758 |             'dest_table': 'purchases',
759 |             'sql': 'select * from purchases where "purchase_date" = %s',
760 |             'params': [execution_date]
761 |         })
762 | 
763 |     join_purchases_with_products = PostgresOperator(
764 |         task_id='join_purchases_products',
765 |         postgres_conn_id='olap',
766 |         sql='join_purchases_with_products.sql'
767 |     )
768 | 
769 |     [load_full_products_data, load_incremental_purchases_data] >> join_purchases_with_products
770 | ```
771 | 
772 | 1. `template_searchpath='/opt/airflow/sql/sales/'` was added to the DAG instantiation. This argument tells Airflow to also look for (templated) SQL files in that folder.
773 | 2. A **PostgresOperator** is used: since this step only transforms data that already lives in the `olap-db` database, the Operator is enough and no hook-based transfer is needed (see the sketch after this list).
774 | 3. Finally, the task dependencies were wired together.
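Items 1 and 2 generalize to the rest of the pipeline: any later step that only transforms data inside `olap-db` can be a `PostgresOperator` pointing at a file in the `template_searchpath` folder, and that file may itself use templated values such as `{{ ds }}`. As a hedged sketch (not the repository's exact code), the later `delete_products_sales_exec_date` step could be wired up like this inside the same `with DAG(...)` block, assuming its SQL file lives in `/opt/airflow/sql/sales/`:

```python
from airflow.providers.postgres.operators.postgres import PostgresOperator

# Hypothetical wiring for a later pipeline step: the SQL file is resolved through
# template_searchpath and rendered by Jinja (so it can reference {{ ds }}) before
# being executed against the olap database.
delete_products_sales_exec_date = PostgresOperator(
    task_id='delete_products_sales_exec_date',
    postgres_conn_id='olap',
    sql='delete_products_sales_exec_date.sql',
)
```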
775 | 
776 | We need to create our SQL file with the query.
777 | Create it at `/sql/sales/join_purchases_with_products.sql`.
778 | Why the `init` and `sales` folders? I like to keep the files separated into the logical segments where they are used.
779 | 
780 | ```sql
781 | create table if not exists join_purchases_products as (
782 |     select
783 |         t.*,
784 |         p.product_name,
785 |         p.product_category
786 |     from purchases t
787 |     left join products p
788 |         on p.product_id = t.product_id
789 | )
790 | ```
791 | 
792 | After creating the SQL file we can run the tests, and we will have our third task completed!
793 | The remaining tasks can be implemented in the same way using the PostgresOperator.
794 | I will leave them as a challenge.
795 | If you get stuck, you can study the code in the repository to guide you.
796 | 
797 | ---
798 | Thank you very much, and if you have any suggestions send me a message on [LinkedIn](https://www.linkedin.com/in/marcos-marx-millnitz/?locale=en_US).
799 | 
--------------------------------------------------------------------------------
/assets/how-to/criar-dag-usando-tdd.md:
--------------------------------------------------------------------------------
1 | # Tutorial explicando como desenvolver uma DAG usando TDD
2 | 
3 | ## Introdução
4 | 
5 | Nesse tutorial iremos construir a nossa DAG desde o início.
6 | Se você quiser acompanhar aconselho a clonar o repositório e fazer o checkout para a branch `tutorial`.
7 | 
8 | ```bash
9 | git clone git@github.com:marcosmarxm/airflow-testing-ci-workflow.git
10 | git checkout tutorial
11 | ```
12 | 
13 | Para tirar mais proveito você deve ter um conhecimento básico sobre **Airflow**, **python** e **pytest**.
14 | Caso você não saiba, eu penso... que como vamos construindo aos poucos talvez você possa ir pesquisando e aprendendo os conceitos na hora.
15 | 
16 | Relembrando do pipeline que iremos desenvolver:
17 | 
18 | ![Our DAG](../images/our_dag.png)
19 | 
20 | Explicando cada task:
21 | 
22 | - **load_full_products**: deleta os dados antigos e carrega a tabela `products` completamente todo dia.
23 | - **load_incremental_purchases**: devido ao tamanho dessa tabela será realizada uma carga incremental utilizando o parâmetro de data `execution_date`.
24 | - **join_purchase_products_as_product_sales_daily**: essa task intermediária prepara os dados brutos (products e purchases) carregados do dia do banco de dados `oltp` para serem armazenados na tabela de resultados `product_sales` que será usada pelo time de analytics.
25 | - **delete_products_sales_exec_date**: essa task tem a função de limpar os dados da tabela de resultado `product_sales` no início do pipeline, dessa forma garante que não terá dados duplicados (idempotência).
26 | - **union_staging_to_products_sales**: carrega os dados do staging `product_sales_daily` para a tabela com dados históricos `product_sales`.
27 | - **rebuild_agg_sales_category**: o resultado da tabela acima já ilustra um formato padrão de consumo para data warehouse, essa task ilustra a criação de um "data mart" simplificado.
28 | 
29 | ## Vamos começar!
30 | 
31 | Primeiro vamos colocar nosso ambiente de desenvolvimento em pé.
32 | Caso você tenha dúvidas sobre o ambiente recomendo ler novamente o artigo [How to develop data pipeline in Airflow through TDD (test-driven development)](https://blog.magrathealabs.com/how-to-develop-data-pipeline-in-airflow-through-tdd-test-driven-development-c3333439f358). Você pode ver o código no arquivo `Makefile`. 33 | 34 | ```bash 35 | make setup 36 | ``` 37 | 38 | Isso irá demorar alguns minutos. 39 | A imagem docker do Airflow 2.0 com o LocalExecutor está demorando para fazer a configuração inicial. 40 | Após a configuração inicial enviamos alguns comandos para o Airflow: criação de usuário, criação das conexões e criação das variáveis. 41 | 42 | Já temos um diagrama do pipeline esboçado. 43 | Iremos pensar em como construir ele agora. 44 | Também já vimos o formato dos dados e uma pequena amostra deles. 45 | Com isso temos o insumo para realizarmos o início do desenvolvimento do projeto. 46 | 47 | ## TASK: Full load product data 48 | 49 | A primeira tarefa que iremos desenvolver é a `full_load_product_data`. 50 | Ela tem o objetivo de pegar os dados da tabela `products` do banco de dados `oltp-db` e transferir para o `olap-db`. 51 | Primeiro vamos criar nossos dados fake para nos guiar. 52 | Crie um arquivo no diretório `/data` chamado `products.csv`. 53 | Você pode pegar os dados do [arquivo fornecido como exemplo no branch master](https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/master/data/products.csv). 54 | Exemplo abaixo: 55 | 56 | |product_id|product_name |product_category| 57 | |----------|---------------------|----------------| 58 | |220 |Brand A T-Shirt |T-Shirt | 59 | |222 |Dri-FIT T-Shirt |T-Shirt | 60 | |225 |Brand N T-shirt |T-Shirt | 61 | |227 |Casual Shoes |Shoes | 62 | |228 |Generic Running Shoes|Shoes | 63 | 64 | Após iremos começar o desenvolvimento da DAG utilizando a metodologia TDD. 65 | Precisamos criar um teste, executar ele, e teremos uma falha. 66 | Em seguida vamos programar para fazer a parte que falha no teste funcionar. 67 | Entrando num looping teste/código de correção dos erros até finalizar o pipeline. 68 | As vantagens são: 69 | 70 | - rápido feedback do problema, teremos apenas um erro para resolver; 71 | - construção gradativa do nosso código assegurando que ele funciona. 72 | 73 | Crie um arquivo no diretório `/tests` chamado `test_sales_pipeline.py`. 74 | 75 | ```python 76 | # test_sales_pipeline.py 77 | class TestSalesPipeline: 78 | 79 | def test_validate_sales_pipeline(self): 80 | assert len(olap_product_size) == 5 81 | ``` 82 | 83 | **Refletindo**: O objetivo dessa tarefa é comparar os dados que estarão no banco `olap-db` na tabela `products` com os dados de amostra `/data/products.csv`. 84 | 85 | `olap_product_size`: é a variável que estou planejando que receba os valores que devem ser transferidos, é provável que ela seja uma lista com valores ou um dataframe. 86 | Vamos começar com o mais básico possível: 87 | 88 | * Comparar nosso resultado `olap_product_size` e ver se ele tem todos os itens que esperamos que ele tenha. Como podemos ver nos dados de amostra `/data/products.csv` temos 5 entradas, por esse motivo queremos comparar o tamanho de `olap_product_size` com 5. 89 | 90 | Podemos rodar pela primeira vez o nosso teste através do comando: 91 | 92 | ```bash 93 | make testing 94 | ``` 95 | 96 | Teremos como resultado que a variável `olap_product_size` não existe. 97 | No projeto essa variável deve buscar os dados do banco de dados `olap-db` na tabela `products`. 
98 | Então, precisamos criar uma conexão com o banco `olap-db` e buscar esses valores. 99 | 100 | Já que estamos usando o Airflow vamos utilizar os **Hooks** que possuem diversos métodos de interação com os banco de dados. 101 | Como já configuramos o container (olap-db) e a conexão do Airflow com ele no setup será bem simples completar essa etapa. 102 | Iremos utilizar o **PostgresHook**, se quiser saber mais sobre Hooks pode acessar a [documentação do Airflow](https://airflow.apache.org/docs/apache-airflow/stable/concepts.html?highlight=hook#hooks). 103 | 104 | ```python 105 | from airflow.providers.postgres.hooks.postgres import PostgresHook 106 | 107 | class TestSalesPipeline: 108 | 109 | def test_validate_sales_pipeline(self): 110 | olap_hook = PostgresHook('olap') 111 | olap_product_size = olap_hook.get_records( 112 | 'select * from products' 113 | ) 114 | assert len(olap_product_size) == 5 115 | ``` 116 | 117 | Importamos o **PostgresHook** e criamos o hook para o banco de dados `olap-db`. 118 | Esse hook possui um método que consegue executar uma query SQL e retornar os valores dela. 119 | Após editar o arquivo de teste conforme apresentado acima podemos rodar `make testing` novamente. 120 | Receberemos o erro que a tabela `products` não existe no banco `olap-db`. 121 | 122 | **Ponto de atenção** Aqui vem uma consideração importante sobre testes. 123 | O nosso pipeline é responsável por transferir os dados e não criar essas tabelas. 124 | Então faz parte do teste configurar esse setup das tabelas. 125 | 126 | ```python 127 | from airflow.providers.postgres.hooks.postgres import PostgresHook 128 | 129 | class TestSalesPipeline: 130 | 131 | def test_validate_sales_pipeline(self): 132 | olap_hook = PostgresHook('olap') 133 | olap_hook.run(''' 134 | CREATE TABLE IF NOT EXISTS products ( 135 | product_id INTEGER, 136 | product_name TEXT, 137 | product_category TEXT 138 | ); 139 | ''') 140 | olap_product_size = olap_hook.get_records( 141 | 'select * from products' 142 | ) 143 | assert len(olap_product_size) == 5 144 | ``` 145 | 146 | O comando `.run(sql statement)` executa um query SQL no banco de dados. Ele é parecido com o `.get_records` que vimos antes, entretanto serve para quando não queremos os dados de retorno. 147 | No exemplo ele irá criar a tabela `products` com as colunas necessárias conforme nossos dados de amostra `/data/products.csv`. 148 | 149 | Rodamos novamente os testes e agora o erro que temos é que o existe uma diferença entre o `olap_product_size` e o valor que esperamos seja igual a 5. 150 | Nesse estágio chegamos a necessidade de iniciar a nossa DAG pois já configuramos o inicialmente nosso teste. 151 | Iremos criar um arquivo chamado `dag_sales_pipeline.py` dentro do diretório `/dags`. 152 | 153 | ```python 154 | from airflow import DAG 155 | from airflow.utils.dates import days_ago 156 | 157 | with DAG(dag_id='products_sales_pipeline', 158 | default_args={'owner': 'airflow'}, 159 | schedule_interval=None, 160 | start_date=days_ago(2)) as dag: 161 | ``` 162 | 163 | O código acima apenas instancia uma nova DAG básica. 164 | Precisamos pensar agora em como resolver nosso problema. 165 | Necessitamos de uma função que transfira os dados do banco `oltp-db` para o `olap-db`. 166 | Já vimos que os **Hooks** no Airflow possuem métodos que podem nos auxiliar: executar um sql e pegar os dados, executar um sql sem retorno dos dados, entre outras interações com o banco de dados. 
167 | 168 | ```python 169 | from airflow import DAG 170 | from airflow.utils.dates import days_ago 171 | from airflow.providers.postgres.hooks.postgres import PostgresHook 172 | from airflow.operators.python import PythonOperator 173 | 174 | 175 | def transfer_oltp_olap(**kwargs): 176 | """Get records from OLTP and transfer to OLAP database""" 177 | dest_table = kwargs.get('dest_table') 178 | sql = kwargs.get('sql') 179 | 180 | oltp_hook = PostgresHook(postgres_conn_id='oltp') 181 | olap_hook = PostgresHook(postgres_conn_id='olap') 182 | 183 | data_extracted = oltp_hook.get_records(sql=sql) 184 | olap_hook.insert_rows(dest_table, data_extracted, commit_every=1000) 185 | 186 | 187 | with DAG(dag_id='products_sales_pipeline', 188 | default_args={'owner': 'airflow'}, 189 | schedule_interval=None, 190 | start_date=days_ago(2)) as dag: 191 | 192 | load_full_products_data = PythonOperator( 193 | task_id='load_full_products', 194 | python_callable=transfer_oltp_olap, 195 | op_kwargs={ 196 | 'dest_table': 'products', 197 | 'sql': 'select * from products', 198 | }) 199 | ``` 200 | 201 | Explicando o que foi realizado: 202 | 203 | 1. Criamos a **task** `load_full_products_data`, que é um PythonOperator. Um **Operator** é um conceito no Airflow que consegue invocar comandos básicos/padronizados. Por exemplo o **PythonOperator** chama funções em `python` e o **PostgresOperator** consegue executar queries SQL porém não consegue transferir dados de um banco de dados para outro. Para mais informações recomendo ler a [documentação](https://airflow.apache.org/docs/apache-airflow/stable/concepts.html?highlight=hook#operators). 204 | 2. Criamos a função `transfer_oltp_olap`, que basicamente cria os dois hooks para executar a coleta dos dados no banco `oltp-db` para o `olap-db`. Por que não utilizamos um **PostgresOperator**? O motivo é que o operator só consegue executar a query no limite do banco que ele está associado, ele não transfere dados. Por isso utilizamos os hooks. Os _kwargs_ é uma convenção do Airflow para passar os argumentos em funções chamadas pelo **PythonOperator**. 205 | 206 | Após concluir a DAG podemos acessar o Airflow em http://localhost:8080, usando as credenciais *admin/admin*, e verificar que nossa primeira DAG estará lá! 207 | 208 | ![Our initial DAG](../images/our_initial_dag.png) 209 | 210 | Ao ativá-la e executá-la no UI do Airflow, será registrado o seguint erro nos logs: 211 | 212 | ![Our initial DAG](../images/airflow_first_exec_error.png) 213 | 214 | Vamos avaliar o log do Airflow e identificamos que não foi encontrada a tabela `products` no banco `oltp-db`. 215 | É a mesma situação que a anterior: precisamos criar essa tabela na nossa função de teste. 216 | Então vamos lá alterar novamente o `test_sales_pipeline.py`. 
217 | 218 | ```python 219 | from airflow.providers.postgres.hooks.postgres import PostgresHook 220 | 221 | class TestSalesPipeline: 222 | 223 | def test_validate_sales_pipeline(self): 224 | oltp_hook = PostgresHook('oltp') 225 | oltp_hook.run(''' 226 | CREATE TABLE IF NOT EXISTS products ( 227 | product_id INTEGER, 228 | product_name TEXT, 229 | product_category TEXT 230 | ); 231 | ''') 232 | 233 | olap_hook = PostgresHook('olap') 234 | olap_hook.run(''' 235 | CREATE TABLE IF NOT EXISTS products ( 236 | product_id INTEGER, 237 | product_name TEXT, 238 | product_category TEXT 239 | ); 240 | ''') 241 | 242 | olap_product_size = olap_hook.get_records( 243 | 'select * from products' 244 | ) 245 | assert len(olap_product_size) == 5 246 | ``` 247 | 248 | Criamos o hook para acessar o `oltp-db` e criamos a tabela `products` nele. 249 | Rodamos o teste e obtemos o mesmo erro que o tamanho é diferente de 5. 250 | Porém, agora temos nossa DAG e a tabela `products` nos dois bancos. 251 | Se executarmos a DAG no UI do Airflow ela irá ter sucesso na execução. 252 | Agora precisamos fazer ela ser executada durante o nosso teste. 253 | 254 | O Airflow disponibiliza diversos comandos através do seu **cli** (comandos pelo terminal). O comando `airflow dags backfill --start_date DAG_ID` permite disparar uma DAG em uma data especifica (vide [documentação](https://airflow.apache.org/docs/apache-airflow/stable/dag-run.html#backfill)). 255 | Esse comando é perfeito para o nosso caso. 256 | 257 | Podemos executar esse comando no terminal... então iremos aproveitar do Python e executar ele através da biblioteca _subprocess_. 258 | 259 | ```python 260 | import subprocess 261 | from airflow.providers.postgres.hooks.postgres import PostgresHook 262 | 263 | def execute_dag(dag_id, execution_date): 264 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 265 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 266 | 267 | class TestSalesPipeline: 268 | 269 | def test_validate_sales_pipeline(self): 270 | oltp_hook = PostgresHook('oltp') 271 | oltp_hook.run(''' 272 | CREATE TABLE IF NOT EXISTS products ( 273 | product_id INTEGER, 274 | product_name TEXT, 275 | product_category TEXT 276 | ); 277 | ''') 278 | 279 | olap_hook = PostgresHook('olap') 280 | olap_hook.run(''' 281 | CREATE TABLE IF NOT EXISTS products ( 282 | product_id INTEGER, 283 | product_name TEXT, 284 | product_category TEXT 285 | ); 286 | ''') 287 | date = '2020-01-01' 288 | execute_dag('products_sales_pipeline', date) 289 | 290 | olap_product_size = olap_hook.get_records( 291 | 'select * from products' 292 | ) 293 | assert len(olap_product_size) == 5 294 | ``` 295 | 296 | Criamos uma função para nos auxiliar a invocar a execução da DAG durante o teste. 297 | Assim quando executarmos o `make testing`, a DAG será executada automaticamente com a data que passamos, no caso `2020-01-01`. 298 | 299 | O teste irá retornar **FAILED**. 300 | 301 | Nós já criamos as duas tabelas, entretanto o banco de dados `oltp-db` não possui nenhum registro. 302 | Precisamos conseguir inserir os dados fake nele. 303 | Já criamos o arquivo `/data/products.csv`, mas precisamos transportar seus dados para dentro do `oltp-db`. 304 | A forma mais simples que me vem na mente é ler o arquivo *csv* usando a biblioteca _pandas_ e transferir os dados para o banco usando a API do _pandas_. 
305 | 306 | ```python 307 | import subprocess 308 | import pandas as pd 309 | from airflow.providers.postgres.hooks.postgres import PostgresHook 310 | 311 | 312 | def execute_dag(dag_id, execution_date): 313 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 314 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 315 | 316 | 317 | class TestSalesPipeline: 318 | 319 | def test_validate_sales_pipeline(self): 320 | oltp_hook = PostgresHook('oltp') 321 | oltp_hook.run(''' 322 | CREATE TABLE IF NOT EXISTS products ( 323 | product_id INTEGER, 324 | product_name TEXT, 325 | product_category TEXT 326 | ); 327 | ''') 328 | 329 | oltp_conn = oltp_hook.get_sqlalchemy_engine() 330 | sample_data = pd.read_csv('./data/products.csv') 331 | sample_data.to_sql( 332 | name='products', # nome da tabela SQL 333 | con=oltp_conn, # conexão SQLalchemy 334 | if_exists='replace', # garante que toda vez teremos os mesmos dados 335 | index=False # não queremos salvar os indices do pandas no banco 336 | ) 337 | 338 | olap_hook = PostgresHook('olap') 339 | olap_hook.run(''' 340 | CREATE TABLE IF NOT EXISTS products ( 341 | product_id INTEGER, 342 | product_name TEXT, 343 | product_category TEXT 344 | ); 345 | ''') 346 | date = '2020-01-01' 347 | execute_dag('products_sales_pipeline', date) 348 | 349 | olap_product_size = olap_hook.get_records( 350 | 'select * from products' 351 | ) 352 | assert len(olap_product_size) == 5 353 | ``` 354 | 355 | Como havia comentado, os hooks do Airflow possuem diversos métodos que auxiliam na comunicação e operações com os bancos. Nesse caso facilmente criamos uma *engine SQLAlchemy* para o _pandas_ enviar os dados do *csv* para a tabela `products` no banco de dados `oltp-db`. 356 | 357 | Agora, momento de tensão... executamos novamente `make testing`... E NOSSO **TESTE PASSOU!** 358 | 359 | O 1 warning é devido ao hook do Airflow utilizar o formato antigo de conexão com o banco Postgresql. 360 | 361 | ```text 362 | ======================== 1 passed, 1 warning in 11.06s ========================= 363 | ``` 364 | 365 | Para verificarmos realmente, podemos acessar o banco `olap-db` através do comando no terminal: 366 | 367 | ```bash 368 | docker exec -ti olap-db psql -U root olap 369 | ``` 370 | 371 | e depois executando `select * from products;` teremos o seguinte resultado. 372 | 373 | ![Our first passed test output](../images/output_products.png) 374 | 375 | Muito bem! Finalmente temos nossa DAG executando a primeira tarefa da forma que esperamos. 376 | Precisamos agora desenvolver as próximas tarefas. 377 | Como já temos o alicerce construído será mais rápido e descomplicado realizar as próximas tarefas. 378 | 379 | **Ponto de atenção: criamos um teste bastante simplista**. 380 | 381 | Seria melhor realizar uma comparação que garanta o resultado da DAG seja compatível com o dado que realmente esperamos. 382 | Nessa tarefa, queremos que os dados da tabela `products` no banco de dados `olap-db` sejam iguais aos do arquivo `/data/products.csv`. Vamos fazer isso agora. 
383 | 384 | ```python 385 | import subprocess 386 | import pandas as pd 387 | from pandas._testing import assert_frame_equal 388 | from airflow.providers.postgres.hooks.postgres import PostgresHook 389 | 390 | 391 | def execute_dag(dag_id, execution_date): 392 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 393 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 394 | 395 | 396 | class TestSalesPipeline: 397 | 398 | def test_validate_sales_pipeline(self): 399 | oltp_hook = PostgresHook('oltp') 400 | oltp_hook.run(''' 401 | CREATE TABLE IF NOT EXISTS products ( 402 | product_id INTEGER, 403 | product_name TEXT, 404 | product_category TEXT 405 | ); 406 | ''') 407 | oltp_conn = oltp_hook.get_sqlalchemy_engine() 408 | sample_data = pd.read_csv('./data/products.csv') 409 | sample_data.to_sql('products', con=oltp_conn, if_exists='replace', index=False) 410 | 411 | olap_hook = PostgresHook('olap') 412 | olap_hook.run(''' 413 | CREATE TABLE IF NOT EXISTS products ( 414 | product_id INTEGER, 415 | product_name TEXT, 416 | product_category TEXT 417 | ); 418 | ''') 419 | date = '2020-01-01' 420 | execute_dag('products_sales_pipeline', date) 421 | 422 | # Renomeado variável para _size 423 | olap_product_size = olap_hook.get_records( 424 | 'select * from products' 425 | ) 426 | assert len(olap_product_size) == 5 427 | 428 | # Novo teste 429 | olap_product_data = olap_hook.get_pandas_df('select * from products') 430 | assert_frame_equal(olap_product_data, sample_data) 431 | ``` 432 | 433 | 1. importamos `from pandas._testing import assert_frame_equal` para nos auxiliar a comparar um dataframe pandas. 434 | 2. criamos a variável `olap_product_data` usando novamente o hook porém agora retornando os dados do banco como um dataframe. 435 | 3. como já tínhamos carregado os dados do arquivo `/data/products.csv` para a variável `sample_data` facilitou executarmos a comparação. 436 | 437 | Agora finalmente um teste que compara realmente se o que esperamos está sendo executado. 438 | 439 | **Agora precisamos parar, pensar e respirar.** 440 | 441 | Olhando a próxima tarefa (`load_incremental_purchases`) ela terá os praticamente os mesmos passos. 442 | No código de teste existem várias partes que podem ser refatoradas, modularizando em funções para que sejam reaproveitadas para a próxima tarefa. 443 | Vamos fazer isso. As atividades que serão realizadas: 444 | 445 | * transferir os comandos sql para arquivos deixando o código mais organizado; 446 | * vamos criar arquivos específicos para o resultado **esperado** que iremos comparar com o resultado das tarefas. 447 | * a função (`create_table`) que cria uma tabela em determinado banco de dados, ela recebe o nome do arquivo sql (descrito no item acima) que também será o nome da tabela e o hook do banco de dados para executar a operação; 448 | * a função (`insert_initial_data`) que insere os dados iniciais na tabela especificada; 449 | * a função (`output_expected_as_df`) que pega os dados esperados para comparar com o resultado da DAG. 450 | 451 | Primeiro vamos colocar os scripts de criação das tabelas em arquivos. 452 | Crie um arquivo no path e chamado: `/sql/init/create_products.sql` 453 | 454 | ```sql 455 | CREATE TABLE IF NOT EXISTS products ( 456 | product_id INTEGER, 457 | product_name TEXT, 458 | product_category TEXT 459 | ); 460 | ``` 461 | 462 | Crie uma pasta `expected` dentro da `/data`. 463 | Nesse caso vamos apenas duplicar o arquivo `products.csv` para dentro dela. 
464 | 465 | Após essas etapas voltamos a editar o nosso teste. 466 | 467 | ```python 468 | import subprocess 469 | import pandas as pd 470 | from pandas._testing import assert_frame_equal 471 | from airflow.providers.postgres.hooks.postgres import PostgresHook 472 | 473 | 474 | def insert_initial_data(tablename, hook): 475 | """This script will populate database with initial data to run job""" 476 | conn_engine = hook.get_sqlalchemy_engine() 477 | sample_data = pd.read_csv(f'/opt/airflow/data/{tablename}.csv') 478 | sample_data.to_sql(name=tablename, con=conn_engine, if_exists='replace', index=False) 479 | 480 | 481 | def create_table(tablename, hook): 482 | sql_stmt = open(f'/opt/airflow/sql/init/create_{tablename}.sql').read() 483 | hook.run(sql_stmt) 484 | 485 | 486 | def output_expected_as_df(filename): 487 | return pd.read_csv(f'/opt/airflow/data/expected/{filename}.csv') 488 | 489 | 490 | def execute_dag(dag_id, execution_date): 491 | """Execute a DAG in a specific date this process wait for DAG run or fail to continue""" 492 | subprocess.run(["airflow", "dags", "backfill", "-s", execution_date, dag_id]) 493 | 494 | 495 | class TestSalesPipeline: 496 | 497 | def test_validate_sales_pipeline(self): 498 | oltp_hook = PostgresHook('oltp') 499 | create_table('products', oltp_hook) 500 | insert_initial_data('products', oltp_hook) 501 | 502 | olap_hook = PostgresHook('olap') 503 | create_table('products', olap_hook) 504 | 505 | date = '2020-01-01' 506 | execute_dag('products_sales_pipeline', date) 507 | 508 | olap_product_size = olap_hook.get_records('select * from products') 509 | assert len(olap_product_size) == 5 510 | 511 | expected_product_data = output_expected_as_df('products') 512 | olap_product_data = olap_hook.get_pandas_df('select * from products') 513 | assert_frame_equal(olap_product_data, expected_product_data) 514 | 515 | ``` 516 | 517 | Nosso teste refatorado com funções que serão reaproveitadas nas próximas etapas. 518 | Está bem mais legível separado em funções. 519 | Tome um tempo e estude as mudanças que ocorreram. 520 | Isso ajudará muito quem está começando entender e desbravar esse processo de refatoração. (Se tiver uma dúvida cruel pode me enviar uma mensagem) 521 | 522 | ## TASK: Load incremental purchases 523 | 524 | Vamos começar a próxima tarefa! 525 | A única diferença dela para anterior é que teremos uma condição na carga dos dados. 526 | Devemos apenas carregar os dados do dia de execução, `execution_date`. 527 | Primeiro vamos criar nosso arquivo com dados fake. 528 | Crie o arquivo `purchases.csv` dentro do diretório `/data`. 529 | Você pode pegar os dados do [arquivo fornecido como exemplo](https://raw.githubusercontent.com/marcosmarxm/airflow-testing-ci-workflow/master/data/purchases.csv). 530 | 531 | |purchase_id|purchase_date|user_id|product_id|unit_price|quantity|total_revenue| 532 | |--------------|-------------|-------|----------|----------|--------|-------------| 533 | |1 |2020-01-01 |111 |222 |150.0 |2 |300.0 | 534 | |2 |2020-01-01 |101 |225 |75 |1 |75 | 535 | |3 |2020-01-01 |153 |228 |300 |1 |300 | 536 | |4 |2020-01-10 |111 |227 |500 |1 |500 | 537 | |5 |2020-01-10 |199 |222 |150 |3 |450 | 538 | |6 |2020-01-10 |182 |220 |35 |4 |140 | 539 | |7 |2020-01-10 |174 |222 |150 |1 |150 | 540 | |8 |2020-01-15 |166 |227 |500 |1 |500 | 541 | |9 |2020-01-15 |132 |225 |75 |1 |75 | 542 | |10 |2020-01-15 |188 |220 |35 |10 |350 | 543 | 544 | Abaixo temos nossa classe de teste (as outras funções e importações foram omitidas para diminuir o tamanho). 
545 | Começamos novamente uma nova etapa de testes. 546 | 547 | ```python 548 | class TestSalesPipeline: 549 | 550 | def test_validate_sales_pipeline(self): 551 | oltp_hook = PostgresHook('oltp') 552 | olap_hook = PostgresHook('olap') 553 | 554 | create_table('products', oltp_hook) 555 | create_table('products', olap_hook) 556 | insert_initial_data('products', oltp_hook) 557 | 558 | date = '2020-01-01' 559 | execute_dag('products_sales_pipeline', date) 560 | 561 | # Test load_full_products task 562 | olap_products_size = olap_hook.get_records('select * from products') 563 | assert len(olap_products_size) == 5 564 | 565 | expected_products_data = output_expected_as_df('products') 566 | olap_products_data = olap_hook.get_pandas_df('select * from products') 567 | assert_frame_equal(olap_products_data, expected_products_data) 568 | 569 | # New code! 570 | # Test load_incremental_purchases 571 | olap_purchases_size = olap_hook.get_records('select * from purchases') 572 | assert len(olap_purchases_size) == 3 573 | ``` 574 | 575 | A coluna dos dados que correspondem ao tempo se chama `purchase_date`. 576 | Então se analisarmos os dados de amostra temos apenas 3 entradas para data `2020-01-01`. 577 | Essa data já estamos utilizando quando chamamos nossa DAG, variável `date = '2020-01-01'`. 578 | 579 | Vou antecipar alguns passos que já fizemos com a DAG anterior. Vou criar a tabela `purchases` nos dois bancos de dados usando o arquivo `sql/init/create_purchases.sql`: 580 | 581 | ```sql 582 | CREATE TABLE IF NOT EXISTS purchases ( 583 | purchase_id INTEGER, 584 | purchase_date TEXT, 585 | user_id INTEGER, 586 | product_id INTEGER, 587 | unit_price REAL, 588 | quantity INTEGER, 589 | total_revenue REAL 590 | ) 591 | ``` 592 | 593 | Depois, popular o banco de dados `oltp-db` com os dados fake que criamos. Foram incluídas as linhas abaixo: 594 | 595 | ```python 596 | # test_sales_pipeline 597 | class TestSalesPipeline: 598 | 599 | def test_validate_sales_pipeline(self): 600 | oltp_hook = PostgresHook('oltp') 601 | olap_hook = PostgresHook('olap') 602 | 603 | create_table('products', oltp_hook) 604 | create_table('products', olap_hook) 605 | insert_initial_data('products', oltp_hook) 606 | 607 | create_table('purchases', oltp_hook) 608 | create_table('purchases', olap_hook) 609 | insert_initial_data('purchases', oltp_hook) 610 | ``` 611 | 612 | Vamos adicionar a nova task à DAG. 
613 | 614 | ```python 615 | from airflow import DAG 616 | from airflow.utils.dates import days_ago 617 | from airflow.providers.postgres.hooks.postgres import PostgresHook 618 | from airflow.operators.python import PythonOperator 619 | 620 | 621 | def transfer_oltp_olap(**kwargs): 622 | """Get records from OLTP and transfer to OLAP database""" 623 | dest_table = kwargs.get('dest_table') 624 | sql = kwargs.get('sql') 625 | params = kwargs.get('params') 626 | 627 | oltp_hook = PostgresHook(postgres_conn_id='oltp') 628 | olap_hook = PostgresHook(postgres_conn_id='olap') 629 | 630 | data_extracted = oltp_hook.get_records(sql=sql, parameters=params) 631 | olap_hook.insert_rows(dest_table, data_extracted, commit_every=1000) 632 | 633 | 634 | with DAG(dag_id='products_sales_pipeline', 635 | default_args={'owner': 'airflow'}, 636 | schedule_interval=None, 637 | start_date=days_ago(2)) as dag: 638 | 639 | execution_date = '{{ ds }}' 640 | 641 | load_full_products_data = PythonOperator( 642 | task_id='load_full_products', 643 | python_callable=transfer_oltp_olap, 644 | op_kwargs={ 645 | 'dest_table': 'products', 646 | 'sql': 'select * from products', 647 | }) 648 | 649 | load_incremental_purchases_data = PythonOperator( 650 | task_id='load_incremental_purchases', 651 | python_callable=transfer_oltp_olap, 652 | op_kwargs={ 653 | 'dest_table': 'purchases', 654 | 'sql': 'select * from purchases where "purchase_date" = %s', 655 | 'params': [execution_date] 656 | }) 657 | ``` 658 | 659 | Foi criada uma nova task PythonOperator chamada `load_incremental_purchases_data`. Ela reutiliza a função `transfer_oltp_olap` criada anteriormente. 660 | As únicas diferenças foram a cláusula `where purchase_data = %s` e a edição da função para receber o parâmetro extra na consulta. 661 | A sintaxe `{{ ds }}` é uma convenção do Airflow para acessar variáveis de contexto. 662 | Existem diversas variáveis que podem ser acessadas dentro do contexto da DAG. 663 | É meio obscuro no início, para mais informações leia a documentação [Macros Reference](https://airflow.apache.org/docs/apache-airflow/stable/macros-ref.html). 664 | 665 | Podemos rodar nossos testes agora. 666 | Nossa segunda task está concluída. 667 | Novamente podemos incrementar nosso teste para atender melhor o projeto. 668 | 669 | Nesse caso, vamos criar o arquivo com os dados esperados. 670 | Ao invés de copiar todo o arquivo `purchases.csv` como aconteceu com os dados de produtos, agora iremos apenas precisar de um subconjunto pertinente aos testes. 671 | Crie um novo arquivo chamado `purchases_2020-01-01.csv` dentro da pasta `expected`. 672 | 673 | |purchase_id|purchase_date|user_id|product_id|unit_price|quantity|total_revenue| 674 | |--------------|-------------|-------|----------|----------|--------|-------------| 675 | |1 |2020-01-01 |111 |222 |150.0 |2 |300.0 | 676 | |2 |2020-01-01 |101 |225 |75 |1 |75 | 677 | |3 |2020-01-01 |153 |228 |300 |1 |300 | 678 | 679 | Ele terá apenas dados do dia 2020-01-01. 680 | Agora vamos editar a nossa função de teste. 
681 | 682 | ```python 683 | # test_sales_pipeline.py 684 | 685 | # old 686 | olap_purchases_size = olap_hook.get_records('select * from purchases') 687 | assert len(olap_purchases_size) == 3 688 | 689 | # new 690 | purchase_data = olap_hook.get_pandas_df('select * from purchases') 691 | purchase_size = len(purchase_data) 692 | purchase_expected = output_expected_as_df(f'purchases_{date}') 693 | assert_frame_equal(purchase_data, purchase_expected) 694 | assert purchase_size == 3 695 | ``` 696 | 697 | Assim concluímos a segunda tarefa. 698 | Chegamos no estágio do projeto em que finalizamos as tarefas de extração e carga dos dados. 699 | As próximas tarefas irão envolver apenas o banco `olap-db`. 700 | Agora vamos utilizar outro recurso do Airflow para executar ações. 701 | 702 | ## TASK: join_purchases_products 703 | 704 | Objetivo dessa task é realizar o join das duas tabelas criadas anteriormente. 705 | Voltamos ao nosso arquivo de teste criando o novo teste para a tabela `join_purchases_products`. 706 | 707 | ```python 708 | # test_sales_pipeline.py 709 | # ... 710 | purchase_data = olap_hook.get_pandas_df('select * from purchases') 711 | purchase_size = len(purchase_data) 712 | purchase_expected = output_expected_as_df(f'purchases_{date}') 713 | assert_frame_equal(purchase_data, purchase_expected) 714 | assert purchase_size == 3 715 | 716 | # Test join_purchases_products 717 | purchases_products_size = olap_hook.get_pandas_df('select * from join_purchases_products') 718 | assert len(purchases_products_size) == 3 719 | ``` 720 | 721 | Explicando o motivo de esperar que o resultado seja 3. 722 | Nesse join iremos pegar as transações carregadas e fazer um `left` join com a tabela de produtos. Por isso o tamanho máximo será 3. 723 | 724 | Podemos editar a DAG após inserir o teste. 725 | 726 | ```python 727 | from airflow import DAG 728 | from airflow.utils.dates import days_ago 729 | from airflow.providers.postgres.hooks.postgres import PostgresHook 730 | from airflow.providers.postgres.operators.postgres import PostgresOperator 731 | from airflow.operators.python import PythonOperator 732 | 733 | def transfer_oltp_olap(**kwargs): 734 | # não foi alterado nesse momento 735 | 736 | with DAG(dag_id='products_sales_pipeline', 737 | default_args={'owner': 'airflow'}, 738 | schedule_interval=None, 739 | template_searchpath='/opt/airflow/sql/sales/', 740 | start_date=days_ago(2)) as dag: 741 | 742 | execution_date = '{{ ds }}' 743 | 744 | load_full_products_data = PythonOperator( 745 | task_id='load_full_products', 746 | python_callable=transfer_oltp_olap, 747 | op_kwargs={ 748 | 'dest_table': 'products', 749 | 'sql': 'select * from products', 750 | }) 751 | 752 | load_incremental_purchases_data = PythonOperator( 753 | task_id='load_incremental_purchases', 754 | python_callable=transfer_oltp_olap, 755 | op_kwargs={ 756 | 'dest_table': 'purchases', 757 | 'sql': 'select * from purchases where "purchase_date" = %s', 758 | 'params': [execution_date] 759 | }) 760 | 761 | join_purchases_with_products = PostgresOperator( 762 | task_id='join_purchases_products', 763 | postgres_conn_id='olap', 764 | sql='join_purchases_with_products.sql' 765 | ) 766 | 767 | [load_full_products_data, load_incremental_purchases_data] >> join_purchases_with_products 768 | ``` 769 | 770 | 1. `template_searchpath='/opt/airflow/sql/sales/',` que foi inserido na criação da DAG(...) as dag. Esse comando permite carregar scripts SQL de outra pasta. 771 | 2. 
**PostgresOperator** como agora iremos transformar os dados que estão no banco de dados `olap-db` podemos utilizar o Operator. 772 | 3. Por último foi realizado a conexão de dependência das tarefas. 773 | 774 | Precisamos criar nosso arquivo SQL com a query. 775 | Crie ele no diretório `/sql/sales/join_purchases_with_products.sql`. 776 | Por que as pastas `init` e `sales`? Eu gosto de deixar os arquivos separados por esses segmentos lógicos onde eles são utilizados. 777 | 778 | ```sql 779 | create table if not exists join_purchases_products as ( 780 | select 781 | t.*, 782 | p.product_name, 783 | p.product_category 784 | from purchases t 785 | left join products p 786 | on p.product_id = t.product_id 787 | ) 788 | ``` 789 | 790 | Após criar o arquivo SQL podemos executar os testes e teremos nossa terceira tarefa concluída! 791 | As próximas tarefas podem ser realizadas da mesma forma utilizando o PostgresOperator. 792 | Vou deixá-las como desafio. 793 | Caso tenha dificuldade você pode analisar o código que está no repositório para se guiar. 794 | 795 | --- 796 | Muito obrigado e caso tenha alguma sugestão me envie uma mensagem pelo [LinkedIn](https://www.linkedin.com/in/marcos-marx-millnitz/?locale=en_US). 797 | --------------------------------------------------------------------------------