├── dags
│   ├── __init__.py
│   └── etl.py
├── plugins
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── helper.py
│   │   └── constants.py
│   └── operators
│       ├── __init__.py
│       └── data_quality.py
├── requirements.txt
├── images
│   ├── map.png
│   ├── dask.png
│   ├── schema.png
│   ├── airflow.png
│   ├── redshift.png
│   └── dag_graph.png
├── .gitignore
└── README.md

/dags/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/plugins/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/plugins/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/plugins/operators/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | regex
2 | pandas
3 | s3fs
4 | psycopg2-binary
5 | cryptography
6 | boto3
7 | 
--------------------------------------------------------------------------------
/images/map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/map.png
--------------------------------------------------------------------------------
/images/dask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/dask.png
--------------------------------------------------------------------------------
/images/schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/schema.png
--------------------------------------------------------------------------------
/images/airflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/airflow.png
--------------------------------------------------------------------------------
/images/redshift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/redshift.png
--------------------------------------------------------------------------------
/images/dag_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/dag_graph.png
--------------------------------------------------------------------------------
/plugins/utils/helper.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | from airflow.hooks.base_hook import BaseHook
4 | 
5 | 
6 | def get_extra_from_conn(conn_id):
7 |     """
8 |     Obtain extra fields from airflow connection.
9 | 10 | Parameters 11 | ---------- 12 | conn_id : str 13 | Airflow Connection ID 14 | 15 | Returns 16 | ------- 17 | dict 18 | extra kwargs 19 | """ 20 | hook = BaseHook(conn_id) 21 | conn = hook.get_connection(conn_id) 22 | return json.loads(conn.extra) 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # mac 107 | .DS_Store 108 | -------------------------------------------------------------------------------- /plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | 6 | class DataQualityOperator(BaseOperator): 7 | ui_color = '#89DA59' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | redshift_conn_id="", 12 | table="", 13 | test_stmt=None, 14 | result=None, 15 | *args, **kwargs): 16 | 17 | super(DataQualityOperator, self).__init__(*args, **kwargs) 18 | self.redshift_conn_id = redshift_conn_id 19 | self.table = table 20 | self.test_stmt = test_stmt 21 | self.result = result 22 | 23 | def execute(self, context): 24 | """ 25 | Perform data quality checks on resulting fact and dimension tables. 
26 | 
27 |         Parameters
28 |         ----------
29 |         redshift_conn_id: string
30 |             airflow connection to redshift cluster
31 |         table: string
32 |             table located in redshift cluster
33 |         test_stmt: string
34 |             test SQL command to check validity of target table
35 |         result: string
36 |             expected result of test_stmt used to check validity
37 |         """
38 |         pg_hook = PostgresHook(self.redshift_conn_id)
39 |         records = pg_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
40 |         if len(records) < 1 or len(records[0]) < 1:
41 |             raise ValueError(f"Fail: No results for {self.table}")
42 |         num_records = records[0][0]
43 |         if num_records < 1:
44 |             raise ValueError(f"Fail: 0 rows in {self.table}")
45 | 
46 |         if self.test_stmt:
47 |             output = pg_hook.get_first(self.test_stmt)
48 |             if self.result != output:
49 |                 raise ValueError(f"Fail: {output} != {self.result}")
50 |         self.log.info(f"Success: {self.table} has {num_records} records")
51 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering Capstone Project
2 | 
3 | ## Scope of Works
4 | The purpose of this project is to demonstrate various skills associated with data engineering projects, in particular developing ETL pipelines using Airflow, constructing data warehouses with Redshift and S3 data storage, and defining efficient data models such as a star schema. As an example, I perform a deep dive into US immigration, primarily focusing on the types of visas being issued and the profiles associated with them. The scope of this project is limited to the data sources listed below, with data aggregated across numerous dimensions such as visatype, gender, port_of_entry, nationality and month.
5 | 
6 | Further details and analysis can be found [here](./capstone_notebook.ipynb).
7 | 
8 | ## Data Description & Sources
9 | - I94 Immigration Data: This data comes from the US National Tourism and Trade Office and can be found [here](https://travel.trade.gov/research/reports/i94/historical/2016.html). Each report contains international visitor arrival statistics by world regions and select countries (including top 20), type of visa, mode of transportation, age groups, states visited (first intended address only), and the top ports of entry (for select countries).
10 | - World Temperature Data: This dataset comes from Kaggle and can be found [here](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data).
11 | - U.S. City Demographic Data: This dataset contains information about the demographics of all US cities and census-designated places with a population greater than or equal to 65,000. The dataset comes from OpenSoft and can be found [here](https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/).
12 | - Airport Code Table: This is a simple table of airport codes and their corresponding cities. An airport code may be an IATA airport code, a three-letter code used in passenger reservation, ticketing and baggage-handling systems, or an ICAO airport code, a four-letter code used by ATC systems and by airports that do not have an IATA airport code (from Wikipedia). It comes from [here](https://datahub.io/core/airport-codes#data).
13 | 
14 | After extracting various immigration codes from the `I94_SAS_Labels_Descriptions.SAS` file, I was able to define a star schema by extracting the immigration fact table and various dimension tables as shown below:
15 | 
16 | 
17 | Additionally, airports associated with `port_of_entry` could be identified through the `Airport Code Table`. The table is exhaustive and extends well beyond just the US, as highlighted below:
18 | 
19 | 
20 | ## Data Storage
21 | 
22 | Data was stored in S3 buckets as a collection of CSV and PARQUET files. The immigration dataset runs to several million rows, so it was converted to PARQUET files to allow for efficient manipulation and processing through Dask and straightforward loading into Redshift.
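As a rough illustration of that conversion step, here is a minimal Dask sketch (it is not part of the project code; the input path is a placeholder, while the bucket name and the `parquet_data` prefix match the values used in `plugins/utils/constants.py` and `dags/etl.py`):

```python
# Illustrative sketch only: convert immigration records to Parquet on S3 with Dask.
# Assumes AWS credentials are available in the environment and that dask, s3fs and
# a Parquet engine (pyarrow or fastparquet) are installed.
import dask.dataframe as dd

# Placeholder input path for a CSV export of one month of I94 records.
df = dd.read_csv("s3://us-immigration/raw/i94_apr16_sub.csv", assume_missing=True)

# Write Parquet files under the prefix that the Redshift COPY task points at.
df.to_parquet("s3://us-immigration/parquet_data/", write_index=False)
```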

23 |
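To make the Redshift side of this concrete, the sketch below (again illustrative rather than part of the repository) renders the `COPY_PARQUET_TABLE` template from `plugins/utils/constants.py` the same way the DAG does; the IAM role ARN is a placeholder:

```python
# Illustrative sketch only: render the COPY statement used to load the Parquet
# immigration data into Redshift. Assumes the project root is on PYTHONPATH,
# as it is when dags/etl.py imports plugins.utils.constants.
from plugins.utils.constants import SQLQueries

copy_stmt = SQLQueries.COPY_PARQUET_TABLE.format(
    schema="public",
    table="immigration",
    s3_uri="s3://us-immigration/parquet_data",
    aws_iam_role="arn:aws:iam::123456789012:role/redshift-copy-role",  # placeholder ARN
)
print(copy_stmt)
```

In the DAG itself this rendered statement is executed by a `PostgresOperator` pointed at the Redshift connection.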

24 | Dask is an extremely powerful and flexible library for parallel computing with dataframes in Python. Through this library, I was able to scale pandas and numpy workflows with minimal overhead. Whilst PySpark is a great API for Spark and a strong tool for big data, I also highly recommend Dask; you can read more about it [here](https://dask.org/).
25 | 
26 | ## ETL Pipeline
27 | 

28 | Defining the data model and creating the star schema involves various steps, made significantly easier through the use of Airflow. The process of extracting files from S3 buckets, transforming the data and then writing CSV and PARQUET files to Redshift is accomplished through the tasks highlighted below in the ETL DAG graph. These steps include:
29 | - Extracting data from the SAS documents and writing it as CSV files to the S3 immigration bucket
30 | - Extracting the remaining CSV and PARQUET files from the S3 immigration bucket
31 | - Writing the CSV and PARQUET files from S3 to Redshift
32 | - Performing data quality checks on the newly created tables
33 | 
34 | 
35 | ## Conclusion
36 | Overall, this project was a small undertaking to demonstrate the steps involved in developing an easily scalable data warehouse. Skills demonstrated include:
37 | * Creating a Redshift cluster, IAM roles and security groups.
38 | * Developing an ETL pipeline that copies data from S3 buckets into staging tables to be processed into a star schema.
39 | * Developing a star schema optimized for the specific queries required by the data analytics team.
40 | * Using Airflow, Python and Amazon Redshift to automate ETL pipelines.
41 | * Writing custom operators to perform tasks such as staging data, filling the data warehouse, and validating the results through data quality checks.
42 | * Transforming data from various sources into a star schema optimized for the analytics team's use cases.
43 | 
--------------------------------------------------------------------------------
/plugins/utils/constants.py:
--------------------------------------------------------------------------------
1 | class AirflowConnIds:
2 |     S3 = 'aws_conn'
3 |     REDSHIFT = 'capstoneuser'
4 | 
5 | 
6 | class S3Buckets:
7 |     CAPSTONE = 'us-immigration'
8 | 
9 | 
10 | class General:
11 |     SCHEMA = 'public'
12 |     CSV_TABLES = ["airport_codes", "port_of_entry_codes", "nationality_codes",
13 |                   "port_of_issue_codes", "visa_codes",
14 |                   "us_cities_demographics",
15 |                   "i94cit_i94res", "i94port", "i94mode", "i94addr", "i94visa"]
16 |     PARQUET_TABLES = ["immigration"]
17 |     TABLES = CSV_TABLES + PARQUET_TABLES
18 | 
19 | 
20 | class SQLQueries:
21 |     DROP_TABLE = """
22 |         DROP TABLE IF EXISTS {schema}.{table}
23 |     """  # noqa
24 | 
25 |     COPY_CSV_TABLE = """
26 |         COPY {schema}.{table} FROM '{s3_uri}'
27 |         CREDENTIALS 'aws_access_key_id={aws_access_key_id};aws_secret_access_key={aws_secret_access_key}'
28 |         IGNOREHEADER 1
29 |         COMPUPDATE OFF
30 |         TRUNCATECOLUMNS
31 |         CSV;
32 |     """  # noqa
33 | 
34 |     COPY_PARQUET_TABLE = """
35 |         COPY {schema}.{table} FROM '{s3_uri}'
36 |         IAM_ROLE '{aws_iam_role}'
37 |         FORMAT AS PARQUET;
38 |     """  # noqa
39 | 
40 |     INCREMENTAL_APPEND = """
41 |         ALTER TABLE {schema}.{table} APPEND FROM {schema}.{staged_table} FILLTARGET;
42 |     """  # noqa
43 | 
44 |     GRANT_USAGE = """
45 |         GRANT USAGE ON SCHEMA {schema} TO {redshift_user};
46 |     """  # noqa
47 | 
48 |     GRANT_SELECT = """
49 |         GRANT SELECT ON {schema}.{table} TO {redshift_user};
50 |     """  # noqa
51 | 
52 |     CREATE = {}
53 |     CREATE['immigration'] = """
54 |         CREATE TABLE IF NOT EXISTS public.immigration (
55 |             cicid FLOAT,
56 |             i94yr FLOAT,
57 |             i94mon FLOAT,
58 |             i94cit FLOAT,
59 |             i94res FLOAT,
60 |             i94port VARCHAR,
61 |             arrdate FLOAT,
62 |             i94mode FLOAT,
63 |             i94addr VARCHAR,
64 |             depdate FLOAT,
65 |             i94bir FLOAT,
66 |             i94visa FLOAT,
67 |             count FLOAT,
68 |             dtadfile VARCHAR,
69 |             visapost VARCHAR,
70 |             occup VARCHAR,
71 |             entdepa VARCHAR,
72 |             entdepd VARCHAR,
73 |             entdepu VARCHAR,
74 |             matflag VARCHAR,
75 |             biryear FLOAT,
76 | 
dtaddto VARCHAR, 77 | gender VARCHAR, 78 | insnum VARCHAR, 79 | airline VARCHAR, 80 | admnum FLOAT, 81 | fltno VARCHAR, 82 | visatype VARCHAR 83 | ); 84 | """ # noqa 85 | 86 | CREATE['airport_codes'] = """ 87 | CREATE TABLE IF NOT EXISTS public.airport_codes ( 88 | ident VARCHAR, 89 | type VARCHAR, 90 | name VARCHAR, 91 | elevation_ft FLOAT, 92 | continent VARCHAR, 93 | iso_country VARCHAR, 94 | iso_region VARCHAR, 95 | municipality VARCHAR, 96 | gps_code VARCHAR, 97 | iata_code VARCHAR, 98 | local_code VARCHAR, 99 | coordinates VARCHAR, 100 | lat FLOAT, 101 | long FLOAT 102 | ); 103 | """ # noqa 104 | 105 | CREATE['port_of_entry_codes'] = """ 106 | CREATE TABLE IF NOT EXISTS public.port_of_entry_codes ( 107 | code VARCHAR, 108 | location VARCHAR, 109 | city VARCHAR, 110 | state_or_country VARCHAR 111 | ); 112 | """ # noqa 113 | 114 | CREATE['port_of_issue_codes'] = """ 115 | CREATE TABLE IF NOT EXISTS public.port_of_issue_codes ( 116 | port_of_issue VARCHAR, 117 | code VARCHAR 118 | ); 119 | """ # noqa 120 | CREATE['visa_codes'] = """ 121 | CREATE TABLE IF NOT EXISTS public.visa_codes ( 122 | class_of_admission VARCHAR, 123 | ins_status_code VARCHAR, 124 | description VARCHAR, 125 | section_of_law VARCHAR 126 | ); 127 | """ # noqa 128 | 129 | CREATE['nationality_codes'] = """ 130 | CREATE TABLE IF NOT EXISTS public.nationality_codes ( 131 | nationality VARCHAR, 132 | code VARCHAR 133 | ); 134 | """ # noqa 135 | 136 | CREATE['us_cities_demographics'] = """ 137 | CREATE TABLE IF NOT EXISTS public.us_cities_demographics ( 138 | city VARCHAR, 139 | state VARCHAR, 140 | median_age FLOAT, 141 | male_population FLOAT, 142 | female_population FLOAT, 143 | total_population FLOAT, 144 | number_of_veterans FLOAT, 145 | foreign_born FLOAT, 146 | average_household_size FLOAT, 147 | state_code VARCHAR, 148 | race VARCHAR, 149 | count INT 150 | ); 151 | """ # noqa 152 | 153 | CREATE['i94cit_i94res'] = """ 154 | CREATE TABLE IF NOT EXISTS public.i94cit_i94res ( 155 | code INT, 156 | country VARCHAR 157 | ); 158 | """ # noqa 159 | 160 | CREATE['i94port'] = """ 161 | CREATE TABLE IF NOT EXISTS public.i94port ( 162 | code VARCHAR, 163 | port_of_entry VARCHAR, 164 | city VARCHAR, 165 | state_or_country VARCHAR 166 | ); 167 | """ # noqa 168 | 169 | CREATE['i94mode'] = """ 170 | CREATE TABLE IF NOT EXISTS public.i94mode ( 171 | code INT, 172 | transportation VARCHAR 173 | ); 174 | """ # noqa 175 | 176 | CREATE['i94addr'] = """ 177 | CREATE TABLE IF NOT EXISTS public.i94addr ( 178 | code VARCHAR, 179 | state VARCHAR 180 | ); 181 | """ # noqa 182 | 183 | CREATE['i94visa'] = """ 184 | CREATE TABLE IF NOT EXISTS public.i94visa ( 185 | code INT, 186 | reason_for_travel VARCHAR 187 | ); 188 | """ # noqa 189 | -------------------------------------------------------------------------------- /dags/etl.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | from datetime import datetime, timedelta 4 | import s3fs 5 | import logging 6 | 7 | from airflow import DAG 8 | from airflow.operators.dummy_operator import DummyOperator 9 | from airflow.operators.postgres_operator import PostgresOperator 10 | from airflow.operators.python_operator import PythonOperator 11 | 12 | from plugins.operators.data_quality import DataQualityOperator 13 | from plugins.utils.helper import get_extra_from_conn 14 | from plugins.utils import constants 15 | 16 | aws_conn = get_extra_from_conn(constants.AirflowConnIds.S3) 17 | 18 | PARAMS = { 19 | 'base_bucket': 
constants.S3Buckets.CAPSTONE,
20 |     'schema': constants.General.SCHEMA,
21 |     'redshift_user': constants.AirflowConnIds.REDSHIFT,
22 |     'aws_access_key_id': aws_conn.get('aws_access_key_id'),
23 |     'aws_secret_access_key': aws_conn.get('aws_secret_access_key'),
24 |     'aws_iam_role': aws_conn.get('aws_iam_role'),
25 | }
26 | 
27 | default_args = {
28 |     'owner': 'danieldiamond',
29 |     'depends_on_past': False,
30 |     # catchup is configured on the DAG object below rather than in default_args
31 |     'start_date': datetime.now(),  # fine for a manually triggered DAG; prefer a static date if scheduling
32 |     'retries': 1,
33 |     'retry_delay': timedelta(minutes=5)
34 | }
35 | 
36 | dag = DAG('etl_dag',
37 |           default_args=default_args,
38 |           description='Load and transform data in Redshift with Airflow',
39 |           schedule_interval=None,
40 |           catchup=False)
41 | 
42 | etl_begin = DummyOperator(task_id='etl_begin', dag=dag)
43 | etl_success = DummyOperator(task_id='etl_success', dag=dag)
44 | 
45 | 
46 | # Write the SAS label codes to S3 as CSV files
47 | def write_sas_codes_to_s3(*args, **kwargs):
48 |     """
49 |     Grab the code tables from the SAS labels file and save them to S3 as CSV files.
50 |     """
51 |     s3 = s3fs.S3FileSystem(anon=False,
52 |                            key=PARAMS['aws_access_key_id'],
53 |                            secret=PARAMS['aws_secret_access_key'])
54 | 
55 |     with s3.open(f"{PARAMS['base_bucket']}/sas_data/"
56 |                  "I94_SAS_Labels_Descriptions.SAS", "r") as f:
57 |         file = f.read()
58 | 
59 |     sas_dict = {}
60 |     temp_data = []
61 |     for line in file.split("\n"):
62 |         line = re.sub(r"\s+", " ", line)
63 |         if "/*" in line and "-" in line:
64 |             k, v = [i.strip(" ") for i in line.split("*")[1]
65 |                     .split("-", 1)]
66 |             k = k.replace(' & ', '_').lower()
67 |             sas_dict[k] = {'description': v}
68 |         elif '=' in line and ';' not in line:
69 |             temp_data.append([i.strip(' ').strip("'").title()
70 |                               for i in line.split('=')])
71 |         elif len(temp_data) > 0:
72 |             sas_dict[k]['data'] = temp_data
73 |             temp_data = []
74 | 
75 |     sas_dict['i94cit_i94res']['df'] = pd.DataFrame(
76 |         sas_dict['i94cit_i94res']['data'], columns=['code', 'country'])
77 | 
78 |     tempdf = pd.DataFrame(sas_dict['i94port']['data'],
79 |                           columns=['code', 'port_of_entry'])
80 |     tempdf['code'] = tempdf['code'].str.upper()
81 |     tempdf[['city', 'state_or_country']] = tempdf['port_of_entry'
82 |                                                   ].str.rsplit(',', n=1,
83 |                                                                expand=True)
84 |     sas_dict['i94port']['df'] = tempdf
85 | 
86 |     sas_dict['i94mode']['df'] = pd.DataFrame(
87 |         sas_dict['i94mode']['data'], columns=['code', 'transportation'])
88 | 
89 |     tempdf = pd.DataFrame(sas_dict['i94addr']['data'],
90 |                           columns=['code', 'state'])
91 |     tempdf['code'] = tempdf['code'].str.upper()
92 |     sas_dict['i94addr']['df'] = tempdf
93 | 
94 |     sas_dict['i94visa']['df'] = pd.DataFrame(
95 |         sas_dict['i94visa']['data'], columns=['code', 'reason_for_travel'])
96 | 
97 |     for table in sas_dict.keys():
98 |         if 'df' in sas_dict[table].keys():
99 |             logging.info(f"Writing {table} to S3")
100 |             with s3.open(f"{PARAMS['base_bucket']}/{table}.csv", "w") as f:
101 |                 sas_dict[table]['df'].to_csv(f, index=False)
102 | 
103 | 
104 | task_write_sas_codes_to_s3 = PythonOperator(
105 |     task_id='write_sas_codes_to_s3',
106 |     python_callable=write_sas_codes_to_s3,
107 |     dag=dag
108 | )
109 | 
110 | # Drop & Create Tables
111 | for table in constants.General.TABLES:
112 |     logging.info(f"Drop & Create {table}")
113 |     PARAMS['table'] = table
114 |     # s3_uri is set per table below, depending on whether it is CSV or PARQUET
115 |     drop_stmt = constants.SQLQueries.DROP_TABLE.format(**PARAMS)
116 |     create_stmt = constants.SQLQueries.CREATE[table]
117 |     grant_usage_stmt = constants.SQLQueries.GRANT_USAGE.format(**PARAMS)
118 |     grant_select_stmt = constants.SQLQueries.GRANT_SELECT.format(**PARAMS)
119 | 
120 |     # Drop, Create, Grant Access Task
121 |     task_create_table = PostgresOperator(
122 |         task_id=f"create_{table}",
123 |         postgres_conn_id="redshift",
124 |         sql=[drop_stmt, create_stmt, grant_usage_stmt, grant_select_stmt],
125 |         dag=dag
126 |     )
127 | 
128 |     if table in constants.General.CSV_TABLES:
129 |         PARAMS['s3_uri'] = ('s3://{base_bucket}/{table}.csv'.format(**PARAMS))
130 |         copy_stmt = constants.SQLQueries.COPY_CSV_TABLE.format(**PARAMS)
131 |     elif table in constants.General.PARQUET_TABLES:
132 |         PARAMS['s3_uri'] = ('s3://{base_bucket}/parquet_data'.format(**PARAMS))
133 |         copy_stmt = constants.SQLQueries.COPY_PARQUET_TABLE.format(**PARAMS)
134 |     else:
135 |         logging.warning(f"Unable to COPY {table}: no COPY template for this format")
136 |         continue
137 | 
138 |     # COPY task
139 |     task_copy_table = PostgresOperator(
140 |         task_id=f"copy_{table}",
141 |         postgres_conn_id="redshift",
142 |         sql=copy_stmt,
143 |         dag=dag
144 |     )
145 |     logging.info(f"Created COPY task for {table}")
146 | 
147 |     # Data Quality Check Task
148 |     task_data_quality = DataQualityOperator(
149 |         task_id=f"data_quality_check_on_{table}",
150 |         redshift_conn_id="redshift",
151 |         table=table,
152 |         dag=dag
153 |     )
154 | 
155 |     task_write_sas_codes_to_s3 >> task_create_table
156 |     task_create_table >> task_copy_table
157 |     task_copy_table >> task_data_quality
158 |     task_data_quality >> etl_success
159 | 
160 | etl_begin >> task_write_sas_codes_to_s3
161 | 
--------------------------------------------------------------------------------
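A closing note on the custom `DataQualityOperator`: the DAG above only relies on its built-in row-count check, but the operator also accepts a `test_stmt`/`result` pair. The task below is purely illustrative and not part of `dags/etl.py` (the table, query and expected value are examples, and it assumes the imports and `dag` object from `dags/etl.py` are in scope):

```python
# Hypothetical example task, not present in the original DAG. Because the operator
# compares the raw row returned by PostgresHook.get_first(), which is typically a
# tuple, the expected result is written as a one-element tuple here.
check_cicid_not_null = DataQualityOperator(
    task_id="data_quality_check_cicid_not_null",
    redshift_conn_id="redshift",
    table="immigration",
    test_stmt="SELECT COUNT(*) FROM public.immigration WHERE cicid IS NULL",
    result=(0,),
    dag=dag,
)
```

Inside the existing loop it could be wired in the same way as the default checks, e.g. `task_copy_table >> check_cicid_not_null >> etl_success`.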