├── dags
│   ├── __init__.py
│   └── etl.py
├── plugins
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── helper.py
│   │   └── constants.py
│   └── operators
│       ├── __init__.py
│       └── data_quality.py
├── requirements.txt
├── images
│   ├── map.png
│   ├── dask.png
│   ├── schema.png
│   ├── airflow.png
│   ├── redshift.png
│   └── dag_graph.png
├── .gitignore
└── README.md
/dags/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/plugins/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/plugins/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/plugins/operators/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | regex
2 | pandas
3 | s3fs
4 | psycopg2-binary
5 | cryptography
6 | boto3
7 |
--------------------------------------------------------------------------------
/images/map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/map.png
--------------------------------------------------------------------------------
/images/dask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/dask.png
--------------------------------------------------------------------------------
/images/schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/schema.png
--------------------------------------------------------------------------------
/images/airflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/airflow.png
--------------------------------------------------------------------------------
/images/redshift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/redshift.png
--------------------------------------------------------------------------------
/images/dag_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/dag_graph.png
--------------------------------------------------------------------------------
/plugins/utils/helper.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from airflow.hooks.base_hook import BaseHook
4 |
5 |
6 | def get_extra_from_conn(conn_id):
7 | """
8 | Obtain extra fields from airflow connection.
9 |
10 | Parameters
11 | ----------
12 | conn_id : str
13 | Airflow Connection ID
14 |
15 | Returns
16 | -------
17 | dict
18 | extra kwargs
19 | """
20 |     # get_connection is a classmethod, so no BaseHook instance is needed
21 |     conn = BaseHook.get_connection(conn_id)
22 |     return json.loads(conn.extra)
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # mac
107 | .DS_Store
108 |
--------------------------------------------------------------------------------
/plugins/operators/data_quality.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 |
6 | class DataQualityOperator(BaseOperator):
7 | ui_color = '#89DA59'
8 |
9 | @apply_defaults
10 | def __init__(self,
11 | redshift_conn_id="",
12 | table="",
13 | test_stmt=None,
14 | result=None,
15 | *args, **kwargs):
16 |
17 | super(DataQualityOperator, self).__init__(*args, **kwargs)
18 | self.redshift_conn_id = redshift_conn_id
19 | self.table = table
20 | self.test_stmt = test_stmt
21 | self.result = result
22 |
23 | def execute(self, context):
24 | """
25 | Perform data quality checks on resulting fact and dimension tables.
26 |
27 |         Operator parameters
28 |         -------------------
29 |         redshift_conn_id: string
30 |             airflow connection to the redshift cluster
31 |         table: string
32 |             table located in the redshift cluster
33 |         test_stmt: string
34 |             test SQL statement used to check the validity of the target table
35 |         result:
36 |             expected output of test_stmt, as returned by get_first()
37 | """
38 | pg_hook = PostgresHook(self.redshift_conn_id)
39 | records = pg_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
40 | if len(records) < 1 or len(records[0]) < 1:
41 | raise ValueError(f"Fail: No results for {self.table}")
42 | num_records = records[0][0]
43 | if num_records < 1:
44 | raise ValueError(f"Fail: 0 rows in {self.table}")
45 |
46 | if self.test_stmt:
47 | output = pg_hook.get_first(self.test_stmt)
48 | if self.result != output:
49 | raise ValueError(f"Fail: {output} != {self.result}")
50 |         self.log.info(f"Success: {self.table} has {num_records} records")
51 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering Capstone Project
2 |
3 | ## Scope of Works
4 | The purpose of this project is to demonstrate various skills associated with data engineering projects, in particular developing ETL pipelines with Airflow, building a data warehouse on Amazon Redshift backed by S3 data storage, and defining efficient data models such as a star schema. As an example, I perform a deep dive into US immigration, focusing primarily on the type of visas being issued and the profiles associated with them. The scope of this project is limited to the data sources listed below, with data aggregated across numerous dimensions such as visatype, gender, port_of_entry, nationality and month.
5 |
6 | Further details and analysis can be found [here](./capstone_notebook.ipynb).
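As an illustration of the kind of aggregation this warehouse is meant to serve, the query below counts arrivals by visa category, visa type and gender. This is only a sketch: it assumes the `public.immigration` and `public.i94visa` tables defined in `plugins/utils/constants.py` have been created and loaded, and it reuses the `redshift` Airflow connection from `dags/etl.py`.

```python
# Sketch: aggregate the immigration fact table by visa category, type and gender.
from airflow.hooks.postgres_hook import PostgresHook

ARRIVALS_BY_VISA = """
    SELECT v.reason_for_travel,
           i.visatype,
           i.gender,
           COUNT(*) AS arrivals
    FROM public.immigration AS i
    JOIN public.i94visa AS v ON v.code = i.i94visa
    GROUP BY 1, 2, 3
    ORDER BY arrivals DESC;
"""

rows = PostgresHook(postgres_conn_id="redshift").get_records(ARRIVALS_BY_VISA)
```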
7 |
8 | ## Data Description & Sources
9 | - I94 Immigration Data: This data comes from the US National Tourism and Trade Office and can be found [here](https://travel.trade.gov/research/reports/i94/historical/2016.html). Each report contains international visitor arrival statistics by world regions and select countries (including the top 20), type of visa, mode of transportation, age groups, states visited (first intended address only), and the top ports of entry (for select countries).
10 | - World Temperature Data: This dataset comes from Kaggle and can be found [here](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data).
11 | - U.S. City Demographic Data: This dataset contains information about the demographics of all US cities and census-designated places with a population greater than or equal to 65,000. It comes from OpenDataSoft and can be found [here](https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/).
12 | - Airport Code Table: This is a simple table of airport codes and corresponding cities. An airport code may refer to either the IATA airport code, a three-letter code used in passenger reservation, ticketing and baggage-handling systems, or the ICAO airport code, a four-letter code used by ATC systems and for airports that do not have an IATA code (from Wikipedia). It comes from [here](https://datahub.io/core/airport-codes#data).
13 |
14 | After extracting the various immigration codes from the `I94_SAS_Labels_Descriptions.SAS` file, I defined a star schema consisting of an immigration fact table and several dimension tables, as shown below:
15 |
16 | ![Star schema](images/schema.png)
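The parsing itself lives in `write_sas_codes_to_s3` inside `dags/etl.py`: label lines of the form `code = 'Description'` are split into code/description pairs that become the rows of the dimension tables. A stripped-down sketch of that step (the sample line is purely illustrative):

```python
import re


def parse_label_line(line):
    """Hypothetical helper mirroring the split performed in write_sas_codes_to_s3."""
    line = re.sub(r"\s+", " ", line)                 # collapse runs of whitespace
    code, description = [part.strip(" ").strip("'")  # drop padding and quotes
                         for part in line.split("=", 1)]
    return code, description


print(parse_label_line("   'ATL'  =  'ATLANTA, GA'   "))  # ('ATL', 'ATLANTA, GA')
```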
17 | Additionally, airports associated with `port_of_entry` could be identified through the `Airport Code Table`. The table is exhaustive and extends well beyond just the US as highlighted below:
18 |
19 | ![Ports of entry world map](images/map.png)
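One way to line the two tables up is to match the three-letter I94 port code against `iata_code` in `airport_codes`. The join key is an assumption on my part; the exact matching logic depends on how the codes were cleaned:

```python
# Sketch: candidate airports for each I94 port of entry (join key assumed).
from airflow.hooks.postgres_hook import PostgresHook

PORT_AIRPORTS = """
    SELECT p.code,
           p.city,
           p.state_or_country,
           a.name AS airport_name,
           a.iso_country
    FROM public.i94port AS p
    LEFT JOIN public.airport_codes AS a ON a.iata_code = p.code;
"""

rows = PostgresHook(postgres_conn_id="redshift").get_records(PORT_AIRPORTS)
```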
20 | ## Data Storage
21 |



34 |
35 | ## Conclusion
36 | Overall, this project was a small undertaking to demonstrate the steps involved in developing a data warehouse that is easily scalable. Skills include:
37 | * Creating a Redshift cluster, IAM roles, and security groups.
38 | * Developing an ETL pipeline that copies data from S3 buckets into staging tables to be processed into a star schema.
39 | * Developing a star schema optimized for the specific queries required by the data analytics team.
40 | * Using Airflow to automate ETL pipelines with Python and Amazon Redshift.
41 | * Writing custom operators to perform tasks such as staging data, filling the data warehouse, and validating results through data quality checks (see the sketch below).
42 | * Transforming data from various sources into a star schema optimized for the analytics team's use cases.
43 |
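As an example of the last two points, the custom `DataQualityOperator` in `plugins/operators/data_quality.py` accepts an optional test statement and expected result on top of its built-in row-count check. Below is a minimal sketch of wiring such a check into a DAG; the example DAG, test query and expected value are illustrative only and not part of the project:

```python
from datetime import datetime

from airflow import DAG
from plugins.operators.data_quality import DataQualityOperator

# Stand-alone example DAG; in the project the check would hang off etl_dag instead.
with DAG("data_quality_example",
         start_date=datetime(2016, 1, 1),
         schedule_interval=None) as dag:
    check_visatype_not_null = DataQualityOperator(
        task_id="data_quality_check_visatype",
        redshift_conn_id="redshift",
        table="immigration",
        test_stmt="SELECT COUNT(*) FROM public.immigration WHERE visatype IS NULL;",
        result=(0,),  # get_first() returns a row tuple, so the expected value is a tuple
    )
```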
--------------------------------------------------------------------------------
/plugins/utils/constants.py:
--------------------------------------------------------------------------------
1 | class AirflowConnIds:
2 | S3 = 'aws_conn'
3 | REDSHIFT = 'capstoneuser'
4 |
5 |
6 | class S3Buckets:
7 | CAPSTONE = 'us-immigration'
8 |
9 |
10 | class General:
11 | SCHEMA = 'public'
12 | CSV_TABLES = ["airport_codes", "port_of_entry_codes", "nationality_codes",
13 | "port_of_issue_codes", "visa_codes",
14 | "us_cities_demographics",
15 | "i94cit_i94res", "i94port", "i94mode", "i94addr", "i94visa"]
16 | PARQUET_TABLES = ["immigration"]
17 | TABLES = CSV_TABLES + PARQUET_TABLES
18 |
19 |
20 | class SQLQueries:
21 | DROP_TABLE = """
22 | DROP TABLE IF EXISTS {schema}.{table}
23 | """ # noqa
24 |
25 | COPY_CSV_TABLE = """
26 | COPY {schema}.{table} FROM '{s3_uri}'
27 | CREDENTIALS 'aws_access_key_id={aws_access_key_id};aws_secret_access_key={aws_secret_access_key}'
28 | IGNOREHEADER 1
29 | COMPUPDATE OFF
30 | TRUNCATECOLUMNS
31 | CSV;
32 | """ # noqa
33 |
34 | COPY_PARQUET_TABLE = """
35 | COPY {schema}.{table} FROM '{s3_uri}'
36 | IAM_ROLE '{aws_iam_role}'
37 | FORMAT AS PARQUET;
38 | """ # noqa
39 |
40 | INCREMENTAL_APPEND = """
41 | ALTER TABLE {schema}.{table} APPEND FROM {schema}.{staged_table} FILLTARGET;
42 | """ # noqa
43 |
44 | GRANT_USAGE = """
45 | GRANT USAGE ON SCHEMA {schema} TO {redshift_user};
46 | """ # noqa
47 |
48 | GRANT_SELECT = """
49 | GRANT SELECT ON {schema}.{table} TO {redshift_user};
50 | """ # noqa
51 |
52 | CREATE = {}
53 | CREATE['immigration'] = """
54 | CREATE TABLE IF NOT EXISTS public.immigration (
55 | cicid FLOAT,
56 | i94yr FLOAT,
57 | i94mon FLOAT,
58 | i94cit FLOAT,
59 | i94res FLOAT,
60 | i94port VARCHAR,
61 | arrdate FLOAT,
62 | i94mode FLOAT,
63 | i94addr VARCHAR,
64 | depdate FLOAT,
65 | i94bir FLOAT,
66 | i94visa FLOAT,
67 | count FLOAT,
68 | dtadfile VARCHAR,
69 | visapost VARCHAR,
70 | occup VARCHAR,
71 | entdepa VARCHAR,
72 | entdepd VARCHAR,
73 | entdepu VARCHAR,
74 | matflag VARCHAR,
75 | biryear FLOAT,
76 | dtaddto VARCHAR,
77 | gender VARCHAR,
78 | insnum VARCHAR,
79 | airline VARCHAR,
80 | admnum FLOAT,
81 | fltno VARCHAR,
82 | visatype VARCHAR
83 | );
84 | """ # noqa
85 |
86 | CREATE['airport_codes'] = """
87 | CREATE TABLE IF NOT EXISTS public.airport_codes (
88 | ident VARCHAR,
89 | type VARCHAR,
90 | name VARCHAR,
91 | elevation_ft FLOAT,
92 | continent VARCHAR,
93 | iso_country VARCHAR,
94 | iso_region VARCHAR,
95 | municipality VARCHAR,
96 | gps_code VARCHAR,
97 | iata_code VARCHAR,
98 | local_code VARCHAR,
99 | coordinates VARCHAR,
100 | lat FLOAT,
101 | long FLOAT
102 | );
103 | """ # noqa
104 |
105 | CREATE['port_of_entry_codes'] = """
106 | CREATE TABLE IF NOT EXISTS public.port_of_entry_codes (
107 | code VARCHAR,
108 | location VARCHAR,
109 | city VARCHAR,
110 | state_or_country VARCHAR
111 | );
112 | """ # noqa
113 |
114 | CREATE['port_of_issue_codes'] = """
115 | CREATE TABLE IF NOT EXISTS public.port_of_issue_codes (
116 | port_of_issue VARCHAR,
117 | code VARCHAR
118 | );
119 | """ # noqa
120 | CREATE['visa_codes'] = """
121 | CREATE TABLE IF NOT EXISTS public.visa_codes (
122 | class_of_admission VARCHAR,
123 | ins_status_code VARCHAR,
124 | description VARCHAR,
125 | section_of_law VARCHAR
126 | );
127 | """ # noqa
128 |
129 | CREATE['nationality_codes'] = """
130 | CREATE TABLE IF NOT EXISTS public.nationality_codes (
131 | nationality VARCHAR,
132 | code VARCHAR
133 | );
134 | """ # noqa
135 |
136 | CREATE['us_cities_demographics'] = """
137 | CREATE TABLE IF NOT EXISTS public.us_cities_demographics (
138 | city VARCHAR,
139 | state VARCHAR,
140 | median_age FLOAT,
141 | male_population FLOAT,
142 | female_population FLOAT,
143 | total_population FLOAT,
144 | number_of_veterans FLOAT,
145 | foreign_born FLOAT,
146 | average_household_size FLOAT,
147 | state_code VARCHAR,
148 | race VARCHAR,
149 | count INT
150 | );
151 | """ # noqa
152 |
153 | CREATE['i94cit_i94res'] = """
154 | CREATE TABLE IF NOT EXISTS public.i94cit_i94res (
155 | code INT,
156 | country VARCHAR
157 | );
158 | """ # noqa
159 |
160 | CREATE['i94port'] = """
161 | CREATE TABLE IF NOT EXISTS public.i94port (
162 | code VARCHAR,
163 | port_of_entry VARCHAR,
164 | city VARCHAR,
165 | state_or_country VARCHAR
166 | );
167 | """ # noqa
168 |
169 | CREATE['i94mode'] = """
170 | CREATE TABLE IF NOT EXISTS public.i94mode (
171 | code INT,
172 | transportation VARCHAR
173 | );
174 | """ # noqa
175 |
176 | CREATE['i94addr'] = """
177 | CREATE TABLE IF NOT EXISTS public.i94addr (
178 | code VARCHAR,
179 | state VARCHAR
180 | );
181 | """ # noqa
182 |
183 | CREATE['i94visa'] = """
184 | CREATE TABLE IF NOT EXISTS public.i94visa (
185 | code INT,
186 | reason_for_travel VARCHAR
187 | );
188 | """ # noqa
189 |
--------------------------------------------------------------------------------
/dags/etl.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 | from datetime import datetime, timedelta
4 | import s3fs
5 | import logging
6 |
7 | from airflow import DAG
8 | from airflow.operators.dummy_operator import DummyOperator
9 | from airflow.operators.postgres_operator import PostgresOperator
10 | from airflow.operators.python_operator import PythonOperator
11 |
12 | from plugins.operators.data_quality import DataQualityOperator
13 | from plugins.utils.helper import get_extra_from_conn
14 | from plugins.utils import constants
15 |
16 | aws_conn = get_extra_from_conn(constants.AirflowConnIds.S3)
17 |
18 | PARAMS = {
19 | 'base_bucket': constants.S3Buckets.CAPSTONE,
20 | 'schema': constants.General.SCHEMA,
21 | 'redshift_user': constants.AirflowConnIds.REDSHIFT,
22 | 'aws_access_key_id': aws_conn.get('aws_access_key_id'),
23 | 'aws_secret_access_key': aws_conn.get('aws_secret_access_key'),
24 | 'aws_iam_role': aws_conn.get('aws_iam_role'),
25 | }
26 |
27 | default_args = {
28 |     'owner': 'danieldiamond',
29 |     'depends_on_past': False,
30 |     'start_date': datetime.now(),
31 |     'retries': 1,
32 |     'retry_delay': timedelta(minutes=5)
33 | }
34 |
35 | dag = DAG('etl_dag',
36 |           default_args=default_args,
37 |           description='Load and transform data in Redshift with Airflow',
38 |           schedule_interval=None,
39 |           catchup=False,  # catchup is a DAG argument, not a task-level default
40 |           )
41 |
42 | etl_begin = DummyOperator(task_id='etl_begin', dag=dag)
43 | etl_success = DummyOperator(task_id='etl_success', dag=dag)
44 |
45 |
46 | # write sas codes to s3
47 | def write_sas_codes_to_s3(*args, **kwargs):
48 | """
49 | Grabs the codes from SAS data and save to S3 as CSV files.
50 | """
51 | s3 = s3fs.S3FileSystem(anon=False,
52 | key=PARAMS['aws_access_key_id'],
53 | secret=PARAMS['aws_secret_access_key'])
54 |
55 | with s3.open(f"{PARAMS['base_bucket']}/sas_data/"
56 | "I94_SAS_Labels_Descriptions.SAS", "r") as f:
57 | file = f.read()
58 |
59 | sas_dict = {}
60 | temp_data = []
61 | for line in file.split("\n"):
62 | line = re.sub(r"\s+", " ", line)
63 | if "/*" in line and "-" in line:
64 | k, v = [i.strip(" ") for i in line.split("*")[1]
65 | .split("-", 1)]
66 | k = k.replace(' & ', '_').lower()
67 | sas_dict[k] = {'description': v}
68 | elif '=' in line and ';' not in line:
69 | temp_data.append([i.strip(' ').strip("'").title()
70 | for i in line.split('=')])
71 | elif len(temp_data) > 0:
72 | sas_dict[k]['data'] = temp_data
73 | temp_data = []
74 |
75 | sas_dict['i94cit_i94res']['df'] = pd.DataFrame(
76 | sas_dict['i94cit_i94res']['data'], columns=['code', 'country'])
77 |
78 | tempdf = pd.DataFrame(sas_dict['i94port']['data'],
79 | columns=['code', 'port_of_entry'])
80 | tempdf['code'] = tempdf['code'].str.upper()
81 |     tempdf[['city', 'state_or_country']] = (
82 |         tempdf['port_of_entry'].str.rsplit(',', n=1, expand=True)
83 |     )
84 | sas_dict['i94port']['df'] = tempdf
85 |
86 | sas_dict['i94mode']['df'] = pd.DataFrame(
87 | sas_dict['i94mode']['data'], columns=['code', 'transportation'])
88 |
89 | tempdf = pd.DataFrame(sas_dict['i94addr']['data'],
90 | columns=['code', 'state'])
91 | tempdf['code'] = tempdf['code'].str.upper()
92 | sas_dict['i94addr']['df'] = tempdf
93 |
94 | sas_dict['i94visa']['df'] = pd.DataFrame(
95 | sas_dict['i94visa']['data'], columns=['code', 'reason_for_travel'])
96 |
97 | for table in sas_dict.keys():
98 | if 'df' in sas_dict[table].keys():
99 | logging.info(f"Writing {table} to S3")
100 | with s3.open(f"{PARAMS['base_bucket']}/{table}.csv", "w") as f:
101 | sas_dict[table]['df'].to_csv(f, index=False)
102 |
103 |
104 | task_write_sas_codes_to_s3 = PythonOperator(
105 | task_id='write_sas_codes_to_s3',
106 | python_callable=write_sas_codes_to_s3,
107 | dag=dag
108 | )
109 |
110 | # Drop & Create Tables
111 | for table in constants.General.TABLES:
112 | logging.info(f"Drop & Create {table}")
113 | PARAMS['table'] = table
114 |     # s3_uri is set per table type (CSV vs Parquet) just before the COPY task
115 | drop_stmt = constants.SQLQueries.DROP_TABLE.format(**PARAMS)
116 | create_stmt = constants.SQLQueries.CREATE[table]
117 | grant_usage_stmt = constants.SQLQueries.GRANT_USAGE.format(**PARAMS)
118 | grant_select_stmt = constants.SQLQueries.GRANT_SELECT.format(**PARAMS)
119 |
120 | # Drop, Create, Grant Access Task
121 | task_create_table = PostgresOperator(
122 | task_id=f"create_{table}",
123 | postgres_conn_id="redshift",
124 | sql=[drop_stmt, create_stmt, grant_usage_stmt, grant_select_stmt],
125 | dag=dag
126 | )
127 |
128 | if table in constants.General.CSV_TABLES:
129 | PARAMS['s3_uri'] = ('s3://{base_bucket}/{table}.csv'.format(**PARAMS))
130 | copy_stmt = constants.SQLQueries.COPY_CSV_TABLE.format(**PARAMS)
131 | elif table in constants.General.PARQUET_TABLES:
132 | PARAMS['s3_uri'] = ('s3://{base_bucket}/parquet_data'.format(**PARAMS))
133 | copy_stmt = constants.SQLQueries.COPY_PARQUET_TABLE.format(**PARAMS)
134 | else:
135 |         logging.warning(f"Unable to COPY {table}: no COPY statement defined")
136 | continue
137 |
138 | # COPY task
139 | task_copy_table = PostgresOperator(
140 | task_id=f"copy_{table}",
141 | postgres_conn_id="redshift",
142 | sql=copy_stmt,
143 | dag=dag
144 | )
145 |     logging.info(f"Added COPY task for {table}")
146 |
147 | # Data Quality Check Task
148 | task_data_quality = DataQualityOperator(
149 | task_id=f"data_quality_check_on_{table}",
150 | redshift_conn_id="redshift",
151 | table=table,
152 | dag=dag
153 | )
154 |
155 | task_write_sas_codes_to_s3 >> task_create_table
156 | task_create_table >> task_copy_table
157 | task_copy_table >> task_data_quality
158 | task_data_quality >> etl_success
159 |
160 | etl_begin >> task_write_sas_codes_to_s3
161 |
--------------------------------------------------------------------------------