├── dags
│   ├── __init__.py
│   └── etl.py
├── plugins
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── helper.py
│   │   └── constants.py
│   └── operators
│       ├── __init__.py
│       └── data_quality.py
├── requirements.txt
├── images
│   ├── map.png
│   ├── dask.png
│   ├── schema.png
│   ├── airflow.png
│   ├── redshift.png
│   └── dag_graph.png
├── .gitignore
└── README.md
/dags/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/plugins/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/plugins/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/plugins/operators/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | regex
2 | pandas
3 | s3fs
4 | psycopg2-binary
5 | cryptography
6 | boto3
7 |
--------------------------------------------------------------------------------
/images/map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/map.png
--------------------------------------------------------------------------------
/images/dask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/dask.png
--------------------------------------------------------------------------------
/images/schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/schema.png
--------------------------------------------------------------------------------
/images/airflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/airflow.png
--------------------------------------------------------------------------------
/images/redshift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/redshift.png
--------------------------------------------------------------------------------
/images/dag_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chandu-muthyala/data-engineering-capstone/master/images/dag_graph.png
--------------------------------------------------------------------------------
/plugins/utils/helper.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from airflow.hooks.base_hook import BaseHook
4 |
5 |
6 | def get_extra_from_conn(conn_id):
7 | """
8 | Obtain extra fields from airflow connection.
9 |
10 | Parameters
11 | ----------
12 | conn_id : str
13 | Airflow Connection ID
14 |
15 | Returns
16 | -------
17 | dict
18 | extra kwargs
19 | """
20 |     # get_connection is a classmethod, so no BaseHook instance is needed
21 |     conn = BaseHook.get_connection(conn_id)
22 |     return json.loads(conn.extra)
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # mac
107 | .DS_Store
108 |
--------------------------------------------------------------------------------
/plugins/operators/data_quality.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 |
6 | class DataQualityOperator(BaseOperator):
7 | ui_color = '#89DA59'
8 |
9 | @apply_defaults
10 | def __init__(self,
11 | redshift_conn_id="",
12 | table="",
13 | test_stmt=None,
14 | result=None,
15 | *args, **kwargs):
16 |
17 | super(DataQualityOperator, self).__init__(*args, **kwargs)
18 | self.redshift_conn_id = redshift_conn_id
19 | self.table = table
20 | self.test_stmt = test_stmt
21 | self.result = result
22 |
23 | def execute(self, context):
24 | """
25 | Perform data quality checks on resulting fact and dimension tables.
26 |
27 |         Operator parameters
28 |         -------------------
29 |         redshift_conn_id: string
30 |             airflow connection to the redshift cluster
31 |         table: string
32 |             table located in the redshift cluster
33 |         test_stmt: string
34 |             test SQL statement used to check the validity of the target table
35 |         result:
36 |             expected output of test_stmt, as returned by get_first()
37 | """
38 | pg_hook = PostgresHook(self.redshift_conn_id)
39 | records = pg_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
40 | if len(records) < 1 or len(records[0]) < 1:
41 | raise ValueError(f"Fail: No results for {self.table}")
42 | num_records = records[0][0]
43 | if num_records < 1:
44 | raise ValueError(f"Fail: 0 rows in {self.table}")
45 |
46 | if self.test_stmt:
47 | output = pg_hook.get_first(self.test_stmt)
48 | if self.result != output:
49 | raise ValueError(f"Fail: {output} != {self.result}")
50 |         self.log.info(f"Success: {self.table} has {num_records} records")
51 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering Capstone Project
2 |
3 | ## Scope of Works
4 | The purpose of this project is to demonstrate various skills associated with data engineering projects, in particular developing ETL pipelines with Airflow, building a data warehouse on Amazon Redshift backed by S3 data storage, and defining efficient data models such as a star schema. As an example, I perform a deep dive into US immigration, focusing primarily on the type of visas being issued and the profiles associated with them. The scope of this project is limited to the data sources listed below, with data aggregated across numerous dimensions such as visatype, gender, port_of_entry, nationality and month.
5 |
6 | Further details and analysis can be found [here](./capstone_notebook.ipynb).
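As an illustration of the kind of aggregation this warehouse is meant to serve, the query below counts arrivals by visa category, visa type and gender. This is only a sketch: it assumes the `public.immigration` and `public.i94visa` tables defined in `plugins/utils/constants.py` have been created and loaded, and it reuses the `redshift` Airflow connection from `dags/etl.py`.

```python
# Sketch: aggregate the immigration fact table by visa category, type and gender.
from airflow.hooks.postgres_hook import PostgresHook

ARRIVALS_BY_VISA = """
    SELECT v.reason_for_travel,
           i.visatype,
           i.gender,
           COUNT(*) AS arrivals
    FROM public.immigration AS i
    JOIN public.i94visa AS v ON v.code = i.i94visa
    GROUP BY 1, 2, 3
    ORDER BY arrivals DESC;
"""

rows = PostgresHook(postgres_conn_id="redshift").get_records(ARRIVALS_BY_VISA)
```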
7 |
8 | ## Data Description & Sources
9 | - I94 Immigration Data: This data comes from the US National Tourism and Trade Office and can be found [here](https://travel.trade.gov/research/reports/i94/historical/2016.html). Each report contains international visitor arrival statistics by world regions and select countries (including the top 20), type of visa, mode of transportation, age groups, states visited (first intended address only), and the top ports of entry (for select countries).
10 | - World Temperature Data: This dataset comes from Kaggle and can be found [here](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data).
11 | - U.S. City Demographic Data: This dataset contains information about the demographics of all US cities and census-designated places with a population greater than or equal to 65,000. It comes from OpenDataSoft and can be found [here](https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/).
12 | - Airport Code Table: This is a simple table of airport codes and corresponding cities. An airport code may refer to either the IATA airport code, a three-letter code used in passenger reservation, ticketing and baggage-handling systems, or the ICAO airport code, a four-letter code used by ATC systems and for airports that do not have an IATA code (from Wikipedia). It comes from [here](https://datahub.io/core/airport-codes#data).
13 |
14 | After extracting the various immigration codes from the `I94_SAS_Labels_Descriptions.SAS` file, I defined a star schema consisting of an immigration fact table and several dimension tables, as shown below:
15 |
16 | ![Star schema](images/schema.png)
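The parsing itself lives in `write_sas_codes_to_s3` inside `dags/etl.py`: label lines of the form `code = 'Description'` are split into code/description pairs that become the rows of the dimension tables. A stripped-down sketch of that step (the sample line is purely illustrative):

```python
import re


def parse_label_line(line):
    """Hypothetical helper mirroring the split performed in write_sas_codes_to_s3."""
    line = re.sub(r"\s+", " ", line)                 # collapse runs of whitespace
    code, description = [part.strip(" ").strip("'")  # drop padding and quotes
                         for part in line.split("=", 1)]
    return code, description


print(parse_label_line("   'ATL'  =  'ATLANTA, GA'   "))  # ('ATL', 'ATLANTA, GA')
```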
17 | Additionally, airports associated with `port_of_entry` could be identified through the `Airport Code Table`. The table is exhaustive and extends well beyond just the US as highlighted below:
18 |
19 | ![Ports of entry world map](images/map.png)
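One way to line the two tables up is to match the three-letter I94 port code against `iata_code` in `airport_codes`. The join key is an assumption on my part; the exact matching logic depends on how the codes were cleaned:

```python
# Sketch: candidate airports for each I94 port of entry (join key assumed).
from airflow.hooks.postgres_hook import PostgresHook

PORT_AIRPORTS = """
    SELECT p.code,
           p.city,
           p.state_or_country,
           a.name AS airport_name,
           a.iso_country
    FROM public.i94port AS p
    LEFT JOIN public.airport_codes AS a ON a.iata_code = p.code;
"""

rows = PostgresHook(postgres_conn_id="redshift").get_records(PORT_AIRPORTS)
```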
20 | ## Data Storage
21 |



34 |
35 | ## Conclusion
36 | Overall, this project was a small undertaking to demonstrate the steps involved in developing a data warehouse that is easily scalable. Skills include:
37 | * Creating a Redshift cluster, IAM roles, and security groups.
38 | * Developing an ETL pipeline that copies data from S3 buckets into staging tables to be processed into a star schema.
39 | * Developing a star schema optimized for the specific queries required by the data analytics team.
40 | * Using Airflow to automate ETL pipelines with Python and Amazon Redshift.
41 | * Writing custom operators to perform tasks such as staging data, filling the data warehouse, and validating results through data quality checks (see the sketch below).
42 | * Transforming data from various sources into a star schema optimized for the analytics team's use cases.
43 |
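As an example of the last two points, the custom `DataQualityOperator` in `plugins/operators/data_quality.py` accepts an optional test statement and expected result on top of its built-in row-count check. Below is a minimal sketch of wiring such a check into a DAG; the example DAG, test query and expected value are illustrative only and not part of the project:

```python
from datetime import datetime

from airflow import DAG
from plugins.operators.data_quality import DataQualityOperator

# Stand-alone example DAG; in the project the check would hang off etl_dag instead.
with DAG("data_quality_example",
         start_date=datetime(2016, 1, 1),
         schedule_interval=None) as dag:
    check_visatype_not_null = DataQualityOperator(
        task_id="data_quality_check_visatype",
        redshift_conn_id="redshift",
        table="immigration",
        test_stmt="SELECT COUNT(*) FROM public.immigration WHERE visatype IS NULL;",
        result=(0,),  # get_first() returns a row tuple, so the expected value is a tuple
    )
```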
--------------------------------------------------------------------------------
/plugins/utils/constants.py:
--------------------------------------------------------------------------------
1 | class AirflowConnIds:
2 | S3 = 'aws_conn'
3 | REDSHIFT = 'capstoneuser'
4 |
5 |
6 | class S3Buckets:
7 | CAPSTONE = 'us-immigration'
8 |
9 |
10 | class General:
11 | SCHEMA = 'public'
12 | CSV_TABLES = ["airport_codes", "port_of_entry_codes", "nationality_codes",
13 | "port_of_issue_codes", "visa_codes",
14 | "us_cities_demographics",
15 | "i94cit_i94res", "i94port", "i94mode", "i94addr", "i94visa"]
16 | PARQUET_TABLES = ["immigration"]
17 | TABLES = CSV_TABLES + PARQUET_TABLES
18 |
19 |
20 | class SQLQueries:
21 | DROP_TABLE = """
22 | DROP TABLE IF EXISTS {schema}.{table}
23 | """ # noqa
24 |
25 | COPY_CSV_TABLE = """
26 | COPY {schema}.{table} FROM '{s3_uri}'
27 | CREDENTIALS 'aws_access_key_id={aws_access_key_id};aws_secret_access_key={aws_secret_access_key}'
28 | IGNOREHEADER 1
29 | COMPUPDATE OFF
30 | TRUNCATECOLUMNS
31 | CSV;
32 | """ # noqa
33 |
34 | COPY_PARQUET_TABLE = """
35 | COPY {schema}.{table} FROM '{s3_uri}'
36 | IAM_ROLE '{aws_iam_role}'
37 | FORMAT AS PARQUET;
38 | """ # noqa
39 |
40 | INCREMENTAL_APPEND = """
41 | ALTER TABLE {schema}.{table} APPEND FROM {schema}.{staged_table} FILLTARGET;
42 | """ # noqa
43 |
44 | GRANT_USAGE = """
45 | GRANT USAGE ON SCHEMA {schema} TO {redshift_user};
46 | """ # noqa
47 |
48 | GRANT_SELECT = """
49 | GRANT SELECT ON {schema}.{table} TO {redshift_user};
50 | """ # noqa
51 |
52 | CREATE = {}
53 | CREATE['immigration'] = """
54 | CREATE TABLE IF NOT EXISTS public.immigration (
55 | cicid FLOAT,
56 | i94yr FLOAT,
57 | i94mon FLOAT,
58 | i94cit FLOAT,
59 | i94res FLOAT,
60 | i94port VARCHAR,
61 | arrdate FLOAT,
62 | i94mode FLOAT,
63 | i94addr VARCHAR,
64 | depdate FLOAT,
65 | i94bir FLOAT,
66 | i94visa FLOAT,
67 | count FLOAT,
68 | dtadfile VARCHAR,
69 | visapost VARCHAR,
70 | occup VARCHAR,
71 | entdepa VARCHAR,
72 | entdepd VARCHAR,
73 | entdepu VARCHAR,
74 | matflag VARCHAR,
75 | biryear FLOAT,
76 | dtaddto VARCHAR,
77 | gender VARCHAR,
78 | insnum VARCHAR,
79 | airline VARCHAR,
80 | admnum FLOAT,
81 | fltno VARCHAR,
82 | visatype VARCHAR
83 | );
84 | """ # noqa
85 |
86 | CREATE['airport_codes'] = """
87 | CREATE TABLE IF NOT EXISTS public.airport_codes (
88 | ident VARCHAR,
89 | type VARCHAR,
90 | name VARCHAR,
91 | elevation_ft FLOAT,
92 | continent VARCHAR,
93 | iso_country VARCHAR,
94 | iso_region VARCHAR,
95 | municipality VARCHAR,
96 | gps_code VARCHAR,
97 | iata_code VARCHAR,
98 | local_code VARCHAR,
99 | coordinates VARCHAR,
100 | lat FLOAT,
101 | long FLOAT
102 | );
103 | """ # noqa
104 |
105 | CREATE['port_of_entry_codes'] = """
106 | CREATE TABLE IF NOT EXISTS public.port_of_entry_codes (
107 | code VARCHAR,
108 | location VARCHAR,
109 | city VARCHAR,
110 | state_or_country VARCHAR
111 | );
112 | """ # noqa
113 |
114 | CREATE['port_of_issue_codes'] = """
115 | CREATE TABLE IF NOT EXISTS public.port_of_issue_codes (
116 | port_of_issue VARCHAR,
117 | code VARCHAR
118 | );
119 | """ # noqa
120 | CREATE['visa_codes'] = """
121 | CREATE TABLE IF NOT EXISTS public.visa_codes (
122 | class_of_admission VARCHAR,
123 | ins_status_code VARCHAR,
124 | description VARCHAR,
125 | section_of_law VARCHAR
126 | );
127 | """ # noqa
128 |
129 | CREATE['nationality_codes'] = """
130 | CREATE TABLE IF NOT EXISTS public.nationality_codes (
131 | nationality VARCHAR,
132 | code VARCHAR
133 | );
134 | """ # noqa
135 |
136 | CREATE['us_cities_demographics'] = """
137 | CREATE TABLE IF NOT EXISTS public.us_cities_demographics (
138 | city VARCHAR,
139 | state VARCHAR,
140 | median_age FLOAT,
141 | male_population FLOAT,
142 | female_population FLOAT,
143 | total_population FLOAT,
144 | number_of_veterans FLOAT,
145 | foreign_born FLOAT,
146 | average_household_size FLOAT,
147 | state_code VARCHAR,
148 | race VARCHAR,
149 | count INT
150 | );
151 | """ # noqa
152 |
153 | CREATE['i94cit_i94res'] = """
154 | CREATE TABLE IF NOT EXISTS public.i94cit_i94res (
155 | code INT,
156 | country VARCHAR
157 | );
158 | """ # noqa
159 |
160 | CREATE['i94port'] = """
161 | CREATE TABLE IF NOT EXISTS public.i94port (
162 | code VARCHAR,
163 | port_of_entry VARCHAR,
164 | city VARCHAR,
165 | state_or_country VARCHAR
166 | );
167 | """ # noqa
168 |
169 | CREATE['i94mode'] = """
170 | CREATE TABLE IF NOT EXISTS public.i94mode (
171 | code INT,
172 | transportation VARCHAR
173 | );
174 | """ # noqa
175 |
176 | CREATE['i94addr'] = """
177 | CREATE TABLE IF NOT EXISTS public.i94addr (
178 | code VARCHAR,
179 | state VARCHAR
180 | );
181 | """ # noqa
182 |
183 | CREATE['i94visa'] = """
184 | CREATE TABLE IF NOT EXISTS public.i94visa (
185 | code INT,
186 | reason_for_travel VARCHAR
187 | );
188 | """ # noqa
189 |
--------------------------------------------------------------------------------
/dags/etl.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 | from datetime import datetime, timedelta
4 | import s3fs
5 | import logging
6 |
7 | from airflow import DAG
8 | from airflow.operators.dummy_operator import DummyOperator
9 | from airflow.operators.postgres_operator import PostgresOperator
10 | from airflow.operators.python_operator import PythonOperator
11 |
12 | from plugins.operators.data_quality import DataQualityOperator
13 | from plugins.utils.helper import get_extra_from_conn
14 | from plugins.utils import constants
15 |
16 | aws_conn = get_extra_from_conn(constants.AirflowConnIds.S3)
17 |
18 | PARAMS = {
19 | 'base_bucket': constants.S3Buckets.CAPSTONE,
20 | 'schema': constants.General.SCHEMA,
21 | 'redshift_user': constants.AirflowConnIds.REDSHIFT,
22 | 'aws_access_key_id': aws_conn.get('aws_access_key_id'),
23 | 'aws_secret_access_key': aws_conn.get('aws_secret_access_key'),
24 | 'aws_iam_role': aws_conn.get('aws_iam_role'),
25 | }
26 |
27 | default_args = {
28 |     'owner': 'danieldiamond',
29 |     'depends_on_past': False,
30 |     'start_date': datetime.now(),
31 |     'retries': 1,
32 |     'retry_delay': timedelta(minutes=5)
33 | }
34 |
35 | dag = DAG('etl_dag',
36 |           default_args=default_args,
37 |           description='Load and transform data in Redshift with Airflow',
38 |           schedule_interval=None,
39 |           catchup=False,  # catchup is a DAG argument, not a task-level default
40 |           )
41 |
42 | etl_begin = DummyOperator(task_id='etl_begin', dag=dag)
43 | etl_success = DummyOperator(task_id='etl_success', dag=dag)
44 |
45 |
46 | # write sas codes to s3
47 | def write_sas_codes_to_s3(*args, **kwargs):
48 | """
49 | Grabs the codes from SAS data and save to S3 as CSV files.
50 | """
51 | s3 = s3fs.S3FileSystem(anon=False,
52 | key=PARAMS['aws_access_key_id'],
53 | secret=PARAMS['aws_secret_access_key'])
54 |
55 | with s3.open(f"{PARAMS['base_bucket']}/sas_data/"
56 | "I94_SAS_Labels_Descriptions.SAS", "r") as f:
57 | file = f.read()
58 |
59 | sas_dict = {}
60 | temp_data = []
61 | for line in file.split("\n"):
62 | line = re.sub(r"\s+", " ", line)
63 | if "/*" in line and "-" in line:
64 | k, v = [i.strip(" ") for i in line.split("*")[1]
65 | .split("-", 1)]
66 | k = k.replace(' & ', '_').lower()
67 | sas_dict[k] = {'description': v}
68 | elif '=' in line and ';' not in line:
69 | temp_data.append([i.strip(' ').strip("'").title()
70 | for i in line.split('=')])
71 | elif len(temp_data) > 0:
72 | sas_dict[k]['data'] = temp_data
73 | temp_data = []
74 |
75 | sas_dict['i94cit_i94res']['df'] = pd.DataFrame(
76 | sas_dict['i94cit_i94res']['data'], columns=['code', 'country'])
77 |
78 | tempdf = pd.DataFrame(sas_dict['i94port']['data'],
79 | columns=['code', 'port_of_entry'])
80 | tempdf['code'] = tempdf['code'].str.upper()
81 |     tempdf[['city', 'state_or_country']] = (
82 |         tempdf['port_of_entry'].str.rsplit(',', n=1, expand=True)
83 |     )
84 | sas_dict['i94port']['df'] = tempdf
85 |
86 | sas_dict['i94mode']['df'] = pd.DataFrame(
87 | sas_dict['i94mode']['data'], columns=['code', 'transportation'])
88 |
89 | tempdf = pd.DataFrame(sas_dict['i94addr']['data'],
90 | columns=['code', 'state'])
91 | tempdf['code'] = tempdf['code'].str.upper()
92 | sas_dict['i94addr']['df'] = tempdf
93 |
94 | sas_dict['i94visa']['df'] = pd.DataFrame(
95 | sas_dict['i94visa']['data'], columns=['code', 'reason_for_travel'])
96 |
97 | for table in sas_dict.keys():
98 | if 'df' in sas_dict[table].keys():
99 | logging.info(f"Writing {table} to S3")
100 | with s3.open(f"{PARAMS['base_bucket']}/{table}.csv", "w") as f:
101 | sas_dict[table]['df'].to_csv(f, index=False)
102 |
103 |
104 | task_write_sas_codes_to_s3 = PythonOperator(
105 | task_id='write_sas_codes_to_s3',
106 | python_callable=write_sas_codes_to_s3,
107 | dag=dag
108 | )
109 |
110 | # Drop & Create Tables
111 | for table in constants.General.TABLES:
112 | logging.info(f"Drop & Create {table}")
113 | PARAMS['table'] = table
114 |     # s3_uri is set per table type (CSV vs Parquet) just before the COPY task
115 | drop_stmt = constants.SQLQueries.DROP_TABLE.format(**PARAMS)
116 | create_stmt = constants.SQLQueries.CREATE[table]
117 | grant_usage_stmt = constants.SQLQueries.GRANT_USAGE.format(**PARAMS)
118 | grant_select_stmt = constants.SQLQueries.GRANT_SELECT.format(**PARAMS)
119 |
120 | # Drop, Create, Grant Access Task
121 | task_create_table = PostgresOperator(
122 | task_id=f"create_{table}",
123 | postgres_conn_id="redshift",
124 | sql=[drop_stmt, create_stmt, grant_usage_stmt, grant_select_stmt],
125 | dag=dag
126 | )
127 |
128 | if table in constants.General.CSV_TABLES:
129 | PARAMS['s3_uri'] = ('s3://{base_bucket}/{table}.csv'.format(**PARAMS))
130 | copy_stmt = constants.SQLQueries.COPY_CSV_TABLE.format(**PARAMS)
131 | elif table in constants.General.PARQUET_TABLES:
132 | PARAMS['s3_uri'] = ('s3://{base_bucket}/parquet_data'.format(**PARAMS))
133 | copy_stmt = constants.SQLQueries.COPY_PARQUET_TABLE.format(**PARAMS)
134 | else:
135 |         logging.warning(f"Unable to COPY {table}: no COPY statement defined")
136 | continue
137 |
138 | # COPY task
139 | task_copy_table = PostgresOperator(
140 | task_id=f"copy_{table}",
141 | postgres_conn_id="redshift",
142 | sql=copy_stmt,
143 | dag=dag
144 | )
145 |     logging.info(f"Added COPY task for {table}")
146 |
147 | # Data Quality Check Task
148 | task_data_quality = DataQualityOperator(
149 | task_id=f"data_quality_check_on_{table}",
150 | redshift_conn_id="redshift",
151 | table=table,
152 | dag=dag
153 | )
154 |
155 | task_write_sas_codes_to_s3 >> task_create_table
156 | task_create_table >> task_copy_table
157 | task_copy_table >> task_data_quality
158 | task_data_quality >> etl_success
159 |
160 | etl_begin >> task_write_sas_codes_to_s3
161 |
--------------------------------------------------------------------------------