├── images
│   ├── SparfiyDAG.jpg
│   └── Pipelineview.jpg
├── plugins
│   ├── helpers
│   │   ├── __init__.py
│   │   └── sql_queries.py
│   ├── operators
│   │   ├── __init__.py
│   │   ├── load_dimension.py
│   │   ├── load_fact.py
│   │   ├── data_quality.py
│   │   └── stage_redshift.py
│   └── __init__.py
├── create_tables.py
├── dags
│   ├── sql_statements.py
│   └── Sparkify_Data_Pipeline_dag.py
└── README.md
/images/SparfiyDAG.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddgope/Data-Pipelines-with-Airflow/HEAD/images/SparfiyDAG.jpg -------------------------------------------------------------------------------- /images/Pipelineview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddgope/Data-Pipelines-with-Airflow/HEAD/images/Pipelineview.jpg -------------------------------------------------------------------------------- /plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | 3 | __all__ = [ 4 | 'SqlQueries', 5 | ] -------------------------------------------------------------------------------- /plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.stage_redshift import StageToRedshiftOperator 2 | from operators.load_fact import LoadFactOperator 3 | from operators.load_dimension import LoadDimensionOperator 4 | from operators.data_quality import DataQualityOperator 5 | 6 | __all__ = [ 7 | 'StageToRedshiftOperator', 8 | 'LoadFactOperator', 9 | 'LoadDimensionOperator', 10 | 'DataQualityOperator' 11 | ] 12 | -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | import helpers 7 | 8 | # Defining the plugin class 9 | class UdacityPlugin(AirflowPlugin): 10 | name = "udacity_plugin" 11 | operators = [ 12 | operators.StageToRedshiftOperator, 13 | operators.LoadFactOperator, 14 | operators.LoadDimensionOperator, 15 | operators.DataQualityOperator 16 | ] 17 | helpers = [ 18 | helpers.SqlQueries 19 | ] 20 | -------------------------------------------------------------------------------- /create_tables.py: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS artists ( 2 | artistid varchar(256) NOT NULL, 3 | name varchar(256), 4 | location varchar(256), 5 | lattitude numeric(18,0), 6 | longitude numeric(18,0) 7 | ); 8 | 9 | CREATE TABLE IF NOT EXISTS songplays ( 10 | playid varchar(32) NOT NULL, 11 | start_time timestamp NOT NULL, 12 | userid int4 NOT NULL, 13 | "level" varchar(256), 14 | songid varchar(256), 15 | artistid varchar(256), 16 | sessionid int4, 17 | location varchar(256), 18 | user_agent varchar(256), 19 | CONSTRAINT songplays_pkey PRIMARY KEY (playid) 20 | 21 | ); 22 | 23 | CREATE TABLE IF NOT EXISTS songs ( 24 | songid varchar(256) NOT NULL, 25 | title varchar(256), 26 | artistid varchar(256), 27 | "year" int4, 28 | duration numeric(18,0), 29 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 30 | ); 31 | 32 | CREATE TABLE IF NOT EXISTS users ( 33 | userid int4 NOT NULL, 34 | first_name varchar(256), 35 | last_name varchar(256), 36 | gender varchar(256), 37 | "level" varchar(256),
38 | CONSTRAINT users_pkey PRIMARY KEY (userid) 39 | ); 40 | 41 | 42 | CREATE TABLE IF NOT EXISTS time( 43 | start_time timestamp NOT NULL, 44 | hour integer, 45 | day integer, 46 | week integer, 47 | month integer, 48 | year integer, 49 | dayofweek integer) 50 | 51 | -------------------------------------------------------------------------------- /plugins/operators/load_dimension.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from airflow.hooks.postgres_hook import PostgresHook 3 | from airflow.models import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | class LoadDimensionOperator(BaseOperator): 7 | ui_color = '#80BD9E' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | redshift_conn_id="", 12 | table_name="", 13 | sql_statement="", 14 | append_data=False, 15 | *args, **kwargs): 16 | 17 | super(LoadDimensionOperator, self).__init__(*args, **kwargs) 18 | self.redshift_conn_id = redshift_conn_id 19 | self.table_name = table_name 20 | self.sql_statement = sql_statement 21 | self.append_data = append_data 22 | 23 | def execute(self, context): 24 | self.log.info('Loading dimension table {}'.format(self.table_name)) 25 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 26 | if self.append_data:  # append mode: add rows without clearing the table 27 | sql_statement = 'INSERT INTO %s %s' % (self.table_name, self.sql_statement) 28 | redshift.run(sql_statement) 29 | else:  # truncate-insert mode: empty the table before loading 30 | sql_statement = 'TRUNCATE TABLE %s;' % (self.table_name) 31 | sql_statement = sql_statement + 'INSERT INTO %s %s' % (self.table_name, self.sql_statement) 32 | redshift.run(sql_statement) 33 | -------------------------------------------------------------------------------- /plugins/helpers/sql_queries.py: -------------------------------------------------------------------------------- 1 | class SqlQueries: 2 | songplay_table_insert = (""" 3 | SELECT 4 | md5(events.sessionid || events.start_time) songplay_id, 5 | events.start_time, 6 | events.userid, 7 | events.level, 8 | songs.song_id, 9 | songs.artist_id, 10 | events.sessionid, 11 | events.location, 12 | events.useragent 13 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 14 | FROM staging_events 15 | WHERE page='NextSong') events 16 | LEFT JOIN staging_songs songs 17 | ON events.song = songs.title 18 | AND events.artist = songs.artist_name 19 | AND events.length = songs.duration 20 | """) 21 | 22 | user_table_insert = (""" 23 | SELECT distinct userid, firstname, lastname, gender, level 24 | FROM staging_events 25 | WHERE page='NextSong' 26 | """) 27 | 28 | song_table_insert = (""" 29 | SELECT distinct song_id, title, artist_id, year, duration 30 | FROM staging_songs 31 | """) 32 | 33 | artist_table_insert = (""" 34 | SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude 35 | FROM staging_songs 36 | """) 37 | 38 | time_table_insert = (""" 39 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 40 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time) 41 | FROM songplays 42 | """) -------------------------------------------------------------------------------- /plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadFactOperator(BaseOperator):
6 | ui_color = '#F98866' 7 | songplay_table_insert = (""" 8 | INSERT INTO songplays (playid,start_time,userid,level,songid,artistid,sessionid,location,user_agent) 9 | SELECT Distinct 10 | md5(events.ts) songplay_id, 11 | events.start_time, 12 | events.userid, 13 | events.level, 14 | songs.song_id, 15 | songs.artist_id, 16 | events.sessionid, 17 | events.location, 18 | events.useragent 19 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 20 | FROM staging_events 21 | WHERE page='NextSong') events 22 | LEFT JOIN staging_songs songs 23 | ON events.song = songs.title 24 | AND events.artist = songs.artist_name 25 | AND events.length = songs.duration 26 | WHERE (songs.song_id<>'' or songs.artist_id<>'') 27 | AND length(events.userid)>0 28 | """) 29 | 30 | @apply_defaults 31 | def __init__(self, 32 | redshift_conn_id="", 33 | *args, **kwargs): 34 | 35 | super(LoadFactOperator, self).__init__(*args, **kwargs) 36 | self.redshift_conn_id = redshift_conn_id 37 | 38 | def execute(self, context): 39 | self.log.info('Loading the songplays fact table') 40 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 41 | facts_sql = LoadFactOperator.songplay_table_insert 42 | redshift.run(facts_sql) 43 | -------------------------------------------------------------------------------- /plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from airflow.hooks.postgres_hook import PostgresHook 3 | from airflow.models import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | class DataQualityOperator(BaseOperator): 7 | ui_color = '#89DA59' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | # Parameters: the Redshift connection id, the check SQL to run, 12 | # the expected value, and a short description used in the logs. 13 | # The check compares the first cell of the query result with expected_value. 14 | redshift_conn_id = "", 15 | check_sql="", 16 | expected_value="", 17 | describe="", 18 | *args, **kwargs): 19 | self.redshift_conn_id = redshift_conn_id 20 | self.check_sql = check_sql 21 | self.expected_value = expected_value 22 | self.describe = describe 23 | 24 | super(DataQualityOperator, self).__init__(*args, **kwargs) 25 | # The hook is created and the check executed in execute(), so a 26 | # misconfigured connection or a failing check surfaces at task run 27 | # time rather than at DAG parse time. 28 | 29 | 30 | def execute(self, context): 31 | redshift_hook = PostgresHook(self.redshift_conn_id) 32 | 33 | self.log.info("\n".join([ 34 | 'DataQuality check', 35 | self.describe, 36 | 'expected value is {}'.format(self.expected_value), 37 | self.check_sql 38 | ])) 39 | 40 | records = redshift_hook.get_records(self.check_sql) 41 | if len(records) < 1 or len(records[0]) < 1:  # distinguish "no rows returned" from a zero count 42 | raise ValueError("Data quality check failed. The query returned no results") 43 | if int(self.expected_value) != records[0][0]: 44 | raise ValueError(f"Data quality check failed. \n expected: {self.expected_value} \n actual: {records[0][0]}")
45 | self.log.info(f"Data quality on \n {self.describe} \n check passed with \n expected: {self.expected_value} \n actual: {records[0][0]}") 46 | -------------------------------------------------------------------------------- /plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | from airflow.contrib.hooks.aws_hook import AwsHook 5 | 6 | ''' 7 | StageToRedshiftOperator: a custom operator that loads JSON-formatted files from S3 into Amazon Redshift. 8 | It builds and runs a SQL COPY statement based on the parameters provided. 9 | ''' 10 | 11 | class StageToRedshiftOperator(BaseOperator): 12 | ui_color = '#358140' 13 | template_fields = ("s3_key",) 14 | copy_sql = """ 15 | COPY {} 16 | FROM '{}' 17 | ACCESS_KEY_ID '{}' 18 | SECRET_ACCESS_KEY '{}' 19 | FORMAT AS JSON '{}' 20 | TIMEFORMAT AS 'epochmillisecs' 21 | region 'us-west-2' 22 | """ 23 | @apply_defaults 24 | def __init__(self, 25 | redshift_conn_id="", 26 | aws_credentials_id="", 27 | table="", 28 | s3_bucket="", 29 | s3_key="", 30 | delimiter=",", 31 | ignore_headers=1, 32 | JSONPaths="", 33 | *args, **kwargs): 34 | 35 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 36 | self.table = table 37 | self.redshift_conn_id = redshift_conn_id 38 | self.s3_bucket = s3_bucket 39 | self.s3_key = s3_key 40 | self.delimiter = delimiter 41 | self.ignore_headers = ignore_headers 42 | self.aws_credentials_id = aws_credentials_id 43 | self.JSONPaths = JSONPaths 44 | 45 | def execute(self, context): 46 | aws_hook = AwsHook(self.aws_credentials_id) 47 | credentials = aws_hook.get_credentials() 48 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 49 | 50 | self.log.info("Clearing data from destination Redshift table") 51 | redshift.run("DELETE FROM {}".format(self.table)) 52 | 53 | self.log.info("Copying data from S3 to Redshift") 54 | rendered_key = self.s3_key.format(**context) 55 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) 56 | json_path = "s3://{}/{}".format(self.s3_bucket, self.JSONPaths) 57 | if rendered_key=="song_data": # the song JSON files have no JSONPaths manifest, so COPY falls back to 'auto' 58 | json_path = self.JSONPaths 59 | formatted_sql = StageToRedshiftOperator.copy_sql.format( 60 | self.table, 61 | s3_path, 62 | credentials.access_key, 63 | credentials.secret_key, 64 | #self.ignore_headers, #this is used for CSV files 65 | #self.delimiter, #this is used for CSV files 66 | json_path 67 | ) 68 | redshift.run(formatted_sql) -------------------------------------------------------------------------------- /dags/sql_statements.py: -------------------------------------------------------------------------------- 1 | # Below are the tables that need to be created 2 | CREATE_staging_events_TABLE_SQL = """ 3 | CREATE TABLE IF NOT EXISTS staging_events ( 4 | artist varchar(256), 5 | auth varchar(256), 6 | firstname varchar(256), 7 | gender varchar(256), 8 | iteminsession int4, 9 | lastname varchar(256), 10 | length numeric(18,0), 11 | "level" varchar(256), 12 | location varchar(256), 13 | "method" varchar(256), 14 | page varchar(256), 15 | registration numeric(18,0), 16 | sessionid int4, 17 | song varchar(256), 18 | status int4, 19 | ts int8, 20 | useragent varchar(256), 21 | userid
int4) 22 | """ 23 | 24 | CREATE_staging_songs_TABLE_SQL = """ 25 | CREATE TABLE IF NOT EXISTS staging_songs ( 26 | num_songs int4, 27 | artist_id varchar(256), 28 | artist_name varchar(256), 29 | artist_latitude numeric(18,0), 30 | artist_longitude numeric(18,0), 31 | artist_location varchar(256), 32 | song_id varchar(256), 33 | title varchar(256), 34 | duration numeric(18,0), 35 | "year" int4) 36 | """ 37 | 38 | CREATE_artists_TABLE_SQL = """ 39 | CREATE TABLE IF NOT EXISTS artists ( 40 | artistid varchar(256) NOT NULL, 41 | name varchar(256), 42 | location varchar(256), 43 | lattitude numeric(18,0), 44 | longitude numeric(18,0) 45 | ) 46 | """ 47 | 48 | CREATE_songplays_TABLE_SQL = """ 49 | CREATE TABLE IF NOT EXISTS songplays ( 50 | playid varchar(32) NOT NULL, 51 | start_time timestamp NOT NULL, 52 | userid int4 NOT NULL, 53 | "level" varchar(256), 54 | songid varchar(256), 55 | artistid varchar(256), 56 | sessionid int4, 57 | location varchar(256), 58 | user_agent varchar(256), 59 | CONSTRAINT songplays_pkey PRIMARY KEY (playid) 60 | ); 61 | """ 62 | 63 | CREATE_songs_TABLE_SQL = """ 64 | CREATE TABLE IF NOT EXISTS songs ( 65 | songid varchar(256) NOT NULL, 66 | title varchar(256), 67 | artistid varchar(256), 68 | "year" int4, 69 | duration numeric(18,0), 70 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 71 | ); 72 | """ 73 | 74 | CREATE_users_TABLE_SQL = """ 75 | CREATE TABLE IF NOT EXISTS users ( 76 | userid int4 NOT NULL, 77 | first_name varchar(256), 78 | last_name varchar(256), 79 | gender varchar(256), 80 | "level" varchar(256), 81 | CONSTRAINT users_pkey PRIMARY KEY (userid) 82 | ); 83 | """ 84 | 85 | CREATE_time_TABLE_SQL = (""" 86 | CREATE TABLE IF NOT EXISTS time( 87 | start_time timestamp NOT NULL, 88 | hour integer, 89 | day integer, 90 | week integer, 91 | month integer, 92 | year integer, 93 | dayofweek integer) 94 | """) 95 | 96 | #Below are the insert statements 97 | songplay_table_insert = (""" 98 | SELECT Distinct 99 | md5(events.start_time) songplay_id, 100 | events.start_time, 101 | events.userid, 102 | events.level, 103 | songs.song_id, 104 | songs.artist_id, 105 | events.sessionid, 106 | events.location, 107 | events.useragent 108 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 109 | FROM staging_events 110 | WHERE page='NextSong') events 111 | LEFT JOIN staging_songs songs 112 | ON events.song = songs.title 113 | AND events.artist = songs.artist_name 114 | AND events.length = songs.duration 115 | WHERE (songs.song_id<>'' or songs.artist_id<>'') 116 | AND length(events.userid)>0 117 | """) 118 | 119 | user_table_insert = (""" 120 | SELECT distinct userid, firstname, lastname, gender, level 121 | FROM staging_events 122 | WHERE page='NextSong' 123 | """) 124 | 125 | song_table_insert = (""" 126 | SELECT distinct song_id, title, artist_id, year, duration 127 | FROM staging_songs 128 | """) 129 | 130 | artist_table_insert = (""" 131 | SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude 132 | FROM staging_songs 133 | """) 134 | 135 | time_table_insert = (""" 136 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 137 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time) 138 | FROM songplays 139 | """) -------------------------------------------------------------------------------- /dags/Sparkify_Data_Pipeline_dag.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.postgres_operator import PostgresOperator 6 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, 7 | LoadDimensionOperator, DataQualityOperator) 8 | import sql_statements 9 | 10 | # AWS_KEY = os.environ.get('AWS_KEY') 11 | # AWS_SECRET = os.environ.get('AWS_SECRET') 12 | 13 | default_args = { 14 | 'owner': 'udacity', 15 | 'depends_on_past': False, 16 | 'start_date': datetime(2019, 7, 26), 17 | #'end_date': datetime(2018, 11, 12), 18 | 'email': ['airflow@example.com'], 19 | 'email_on_failure': False, 20 | 'email_on_retry': False, 21 | 'retries': 1, 22 | 'retry_delay': timedelta(minutes=1), 23 | } 24 | 25 | 26 | dag = DAG('Sparkify_Data_Pipeline_dag', 27 | default_args=default_args, 28 | description='Load and transform data in Redshift with Airflow', 29 | schedule_interval='0 0 * * *', 30 | catchup=False  # do not backfill missed runs; catchup is a DAG argument, not a default_args key 31 | ) 32 | start_operator = DummyOperator(task_id='Begin_execution', dag=dag) 33 | 34 | create_staging_events_table = PostgresOperator( 35 | task_id="create_staging_events_table", 36 | dag=dag, 37 | postgres_conn_id="redshift", 38 | sql=sql_statements.CREATE_staging_events_TABLE_SQL 39 | ) 40 | 41 | stage_events_to_redshift = StageToRedshiftOperator( 42 | task_id="Stage_events_from_s3_to_redshift", 43 | dag=dag, 44 | table="staging_events", 45 | redshift_conn_id="redshift", 46 | aws_credentials_id="aws_credentials", 47 | s3_bucket="udacity-dend", 48 | #s3_key="log_data/2018/11/2018-11-01-events.json" 49 | # .strftime("%d-%m-%Y") 50 | #s3_key="log_data/{execution_date.year}/{execution_date.month}/{execution_date.year}-{execution_date.month}-{execution_date.day}-events.json" 51 | s3_key="log_data", 52 | JSONPaths="log_json_path.json" 53 | ) 54 | 55 | create_staging_songs_table = PostgresOperator( 56 | task_id="create_staging_songs_table", 57 | dag=dag, 58 | postgres_conn_id="redshift", 59 | sql=sql_statements.CREATE_staging_songs_TABLE_SQL 60 | ) 61 | 62 | stage_songs_to_redshift = StageToRedshiftOperator( 63 | task_id="Stage_songs_from_s3_to_redshift", 64 | dag=dag, 65 | table="staging_songs", 66 | redshift_conn_id="redshift", 67 | aws_credentials_id="aws_credentials", 68 | s3_bucket="udacity-dend", 69 | s3_key="song_data", 70 | JSONPaths="auto" 71 | ) 72 | 73 | load_songplays_table = LoadFactOperator( 74 | task_id='Load_songplays_fact_table', 75 | dag=dag, 76 | redshift_conn_id="redshift" 77 | ) 78 | 79 | load_user_dimension_table = LoadDimensionOperator( 80 | task_id='Load_user_dim_table', 81 | dag=dag, 82 | redshift_conn_id="redshift", 83 | table_name="users", 84 | sql_statement=sql_statements.user_table_insert, 85 | append_data=True 86 | ) 87 | 88 | load_song_dimension_table = LoadDimensionOperator( 89 | task_id='Load_song_dim_table', 90 | dag=dag, 91 | redshift_conn_id="redshift", 92 | table_name="songs", 93 | sql_statement=sql_statements.song_table_insert, 94 | append_data=True 95 | ) 96 | 97 | load_artist_dimension_table = LoadDimensionOperator( 98 | task_id='Load_artist_dim_table', 99 | dag=dag, 100 | redshift_conn_id="redshift", 101 | table_name="artists", 102 | sql_statement=sql_statements.artist_table_insert, 103 | append_data=True 104 | ) 105 | 106 | load_time_dimension_table = LoadDimensionOperator( 107 | task_id='Load_time_dim_table', 108 | dag=dag, 109 | redshift_conn_id="redshift",
110 | table_name="time", 111 | sql_statement=sql_statements.time_table_insert, 112 | append_data=True 113 | ) 114 | 115 | run_quality_checks = DataQualityOperator( 116 | task_id='Run_data_quality_checks', 117 | dag=dag, 118 | redshift_conn_id="redshift", 119 | check_sql="SELECT COUNT(*) FROM songplays", 120 | expected_value="320", 121 | describe="Fact table songplays - check that the table contains data" 122 | ) 123 | 124 | end_operator = DummyOperator(task_id='Stop_execution', dag=dag) 125 | 126 | start_operator >> create_staging_events_table 127 | start_operator >> create_staging_songs_table 128 | 129 | create_staging_events_table >> stage_events_to_redshift 130 | create_staging_songs_table >> stage_songs_to_redshift 131 | 132 | stage_events_to_redshift >> load_songplays_table 133 | stage_songs_to_redshift >> load_songplays_table 134 | 135 | load_songplays_table >> load_user_dimension_table 136 | load_songplays_table >> load_song_dimension_table 137 | load_songplays_table >> load_artist_dimension_table 138 | load_songplays_table >> load_time_dimension_table 139 | 140 | load_user_dimension_table >> run_quality_checks 141 | load_song_dimension_table >> run_quality_checks 142 | load_artist_dimension_table >> run_quality_checks 143 | load_time_dimension_table >> run_quality_checks 144 | 145 | run_quality_checks >> end_operator -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Nanodegree 2 | ## Project: Data Pipelines with Airflow 3 | ## Table of Contents 4 | * **Definition** 5 | * **Project Overview** : 6 | A music streaming company, Sparkify, has decided that it is time to introduce more automation and monitoring to their data warehouse ETL pipelines and has concluded that the best tool to achieve this is Apache Airflow. 7 | 8 | * **Problem Statement** : 9 | Sparkify wants to create high-grade data pipelines that are dynamic, built from reusable tasks, can be monitored, and allow easy backfills. They have also noted that data quality plays a big part when analyses are executed on top of the data warehouse, and they want to run tests against their datasets after the ETL steps have been executed to catch any discrepancies. 10 | The source data resides in S3 and needs to be processed in Sparkify's data warehouse in Amazon Redshift. The source datasets consist of JSON logs that describe user activity in the application and JSON metadata about the songs the users listen to. 11 | 12 | 13 | * **Design** 14 | * **ETL Design Principles** 15 | 1. Partition Data Tables: Data partitioning can be especially useful when dealing with large tables with a long history. When data is partitioned using datestamps, we can leverage dynamic partitions to parallelize backfilling. 16 | 1. Load Data Incrementally: This principle makes ETL more modular and manageable, especially when building dimension tables from the fact tables. In each run, we only need to append the new transactions to the dimension table from the previous date partition instead of scanning the entire fact history. 17 | 1. Enforce Idempotency: Many data scientists rely on point-in-time snapshots to perform historical analysis. This means the underlying source table should not be mutable as time progresses, otherwise we would get a different answer. Pipelines should be built so that the same query, when run against the same business logic and time range, returns the same result. 18 | 1. Parameterize Workflow: Just as templates simplify the organization of HTML pages, Jinja can be used in conjunction with SQL. One common use of a Jinja template is to incorporate backfilling logic into a typical query (a sketch follows this list). 19 | 1. Add Data Checks Early and Often: When processing data, it is useful to write data into a staging table, check the data quality, and only then exchange the staging table with the final production table. Checks in this 3-step paradigm are important defensive mechanisms: they can be simple checks such as counting whether the total number of records is greater than 0, or something as complex as an anomaly detection system that checks for unseen categories or outliers. 20 | 1. Build Useful Alerts & Monitoring System: Since ETL jobs can often take a long time to run, it is useful to add alerts and monitoring to them so we do not have to keep an eye on the progress of the DAG constantly. EmailOperator can be used to send alert emails for jobs missing SLAs. 21 | 
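To make principles 1-4 concrete, here is a minimal sketch of an idempotent, Jinja-parameterized daily load. It is illustrative only: the `daily_songplays` table, the DAG id, and the task name are assumptions, not objects defined in this repository.

```python
# A hedged sketch: delete-then-insert one date partition per run, so a rerun
# or a backfill of the same execution date always produces the same rows.
from datetime import datetime
from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator

# {{ ds }} is rendered by Airflow's Jinja templating into the run's execution date.
LOAD_DAILY_PARTITION_SQL = """
DELETE FROM daily_songplays WHERE DATE(start_time) = '{{ ds }}';
INSERT INTO daily_songplays
SELECT * FROM songplays WHERE DATE(start_time) = '{{ ds }}';
"""

example_dag = DAG(
    'idempotent_partition_load_example',
    start_date=datetime(2019, 7, 26),
    schedule_interval='@daily',
)

load_daily_partition = PostgresOperator(
    task_id='load_daily_partition',
    dag=example_dag,
    postgres_conn_id='redshift',
    sql=LOAD_DAILY_PARTITION_SQL,  # sql is a templated field, so Jinja fills in {{ ds }}
)
```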
22 | * **Building Pipeline** : 23 | It is often useful to visualize complex data flows using a graph. Visually, a node in a graph represents a task, and an arrow represents the dependency of one task on another. Given that data only needs to be computed once on a given task and the computation then carries forward, the graph is directed and acyclic. This is why Airflow jobs are commonly referred to as "DAGs" (Directed Acyclic Graphs). ![Sparkify DAG](/images/SparfiyDAG.jpg) 24 | The Airflow UI allows users to visualize the DAG in a graph view. The author of a data pipeline must define the structure of dependencies among tasks in order to visualize them. This specification is often written in a file called the DAG definition file, which lays out the anatomy of an Airflow job. 25 | While DAGs describe how to run a data pipeline, operators describe what to do in a data pipeline. Typically, there are three broad categories of operators: 26 | 1. Sensors: wait for a certain time, an external file, or an upstream data source 27 | 2. Operators: trigger a certain action (e.g. run a bash command, execute a Python function, or execute a Hive query) 28 | 3. Transfers: move data from one location to another 29 | 30 | For this project, I have built four different operators that will stage the data, transform the data, and run checks on data quality. 31 | * **StageToRedshift Operator:** The stage operator is expected to be able to load any JSON and CSV formatted files from S3 to Amazon Redshift. The operator creates and runs a SQL COPY statement based on the parameters provided. The operator's parameters should specify where in S3 the file is located and what the target table is. The parameters should also be used to distinguish between JSON and CSV files. Another important requirement of the stage operator is a templated field that allows it to load timestamped files from S3 based on the execution time and run backfills (see the sketch after this section). 32 | * **LoadFactOperator:** 33 | With the dimension and fact operators, you can utilize the provided SQL helper class to run data transformations. Most of the logic is within the SQL transformations, and the operator is expected to take as input a SQL statement and the target database to run the query against. You can also define a target table that will contain the results of the transformation. 34 | * **LoadDimensionOperator:** Dimension loads are often done with the truncate-insert pattern where the target table is emptied before the load. Thus, you could also have a parameter that allows switching between insert modes when loading dimensions. Fact tables are usually so massive that they should only allow append-type functionality. 35 | * **DataQualityOperator:** The final operator to create is the data quality operator, which is used to run checks on the data itself. The operator's main functionality is to receive one or more SQL-based test cases along with the expected results and execute the tests. For each test, the test result and expected result need to be checked, and if there is no match, the operator should raise an exception and the task should retry and eventually fail. For example, one test could be a SQL statement that checks whether a certain column contains NULL values by counting all the rows that have NULL in that column. We do not want any NULLs, so the expected result would be 0 and the test would compare the SQL statement's outcome to the expected result. 36 | ![Sparkify Pipeline View](/images/Pipelineview.jpg) 37 | 
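As a usage illustration for the operators described above, the sketch below stages one day's log files by templating `s3_key` (this is what enables execution-date-driven backfills) and adds a NULL-count quality check. It is a hedged sketch, not part of the repository: the zero-padded S3 key layout and the NULL check are assumptions, the connection ids mirror `dags/Sparkify_Data_Pipeline_dag.py`, and the zero-count check relies on `data_quality.py` treating an empty result set, rather than a zero count, as "no results".

```python
# Hypothetical task definitions in the style of dags/Sparkify_Data_Pipeline_dag.py;
# `dag` is assumed to be the DAG object defined in that file.
from airflow.operators import StageToRedshiftOperator, DataQualityOperator

stage_events_daily = StageToRedshiftOperator(
    task_id='Stage_daily_events_from_s3_to_redshift',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    # s3_key is a templated field: each scheduled or backfill run formats its
    # own execution date into the S3 prefix (zero-padded layout assumed here).
    s3_key='log_data/{execution_date.year}/{execution_date.month:02}/'
           '{execution_date.year}-{execution_date.month:02}-{execution_date.day:02}-events.json',
    JSONPaths='log_json_path.json',
)

check_userid_not_null = DataQualityOperator(
    task_id='Check_songplays_userid_not_null',
    dag=dag,
    redshift_conn_id='redshift',
    check_sql='SELECT COUNT(*) FROM songplays WHERE userid IS NULL',
    expected_value='0',  # no NULL user ids expected in the fact table
    describe='Fact table songplays - userid must not contain NULLs',
)
```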
38 | * **How to Run** : Open a terminal and run the steps below 39 | 1. create_cluster.ipynb 40 | 1. Open the dwh.cfg and provide the AWS access keys and secret 41 | 2. Launch a Redshift cluster using create_cluster.ipynb and create an IAM role that has read access to S3. 42 | 3. Add the Redshift database details (host, dbname, dbuser, password, port, etc.) and the IAM role ARN to dwh.cfg. 43 | 1. python create_tables.py 44 | 1. python etl.py 45 | 1. analysis.ipynb - run all your analyses 46 | 47 | * **Final Result / Analysis** : Now the Sparkify analytics team can run multiple queries using the data_analysis.ipynb notebook, or users can connect any tool such as Amazon QuickSight, Power BI, or Tableau to the Redshift cluster. They can do what-if analysis or slice and dice the data as per their requirements. 48 | 1. How many users are currently listening to songs? 49 | 1. How are the users distributed across geographies? 50 | 1. Which songs are they playing? 51 | 52 | * **Software Requirements** : This project uses the following software and Python libraries: 53 | 1. Python 3 54 | 1. psycopg2 55 | 1. Amazon Redshift 56 | 57 | You will also need to have software installed to run and execute a Jupyter Notebook. 58 | If you do not have Python installed yet, it is highly recommended that you install the Anaconda distribution of Python, which already has the above packages and more included. 59 | 60 | * **Acknowledgement** : Credit to Udacity for the project. You can't use this for your Udacity capstone project; otherwise, feel free to use the code here as you would like! 61 | 62 | * **Bonus** : Here are a few key concepts for Airflow (a minimal sketch follows the list): 63 | 1. DAG (Directed Acyclic Graph): a workflow which glues all the tasks together with inter-dependencies. 64 | 1. Operator: a template for a specific type of work to be executed. For example, BashOperator represents how to execute a bash script while PythonOperator represents how to execute a Python function, etc. 65 | 1. Sensor: a special type of operator which will only execute if a certain condition is met. 66 | 1. Task: a parameterized instance of an operator/sensor which represents a unit of actual work to be executed. 67 | 1. Plugin: an extension to allow users to easily extend Airflow with various custom hooks, operators, sensors, macros, and web views. 68 | 1. Pools: concurrency limit configuration for a set of Airflow tasks. 69 | --------------------------------------------------------------------------------
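To tie the Bonus concepts together, here is a minimal, self-contained toy DAG. It is not part of this repository and every name in it is illustrative; it simply shows a DAG object, two tasks instantiated from operator templates, and the dependency between them.

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

def say_hello():
    # the callable that the PythonOperator task will execute
    print('hello from a PythonOperator task')

toy_dag = DAG(
    'toy_concepts_dag',                 # the DAG: a workflow of inter-dependent tasks
    start_date=datetime(2019, 7, 26),
    schedule_interval='@daily',
)

# tasks are parameterized instances of operator templates
start = DummyOperator(task_id='start', dag=toy_dag)
hello = PythonOperator(task_id='say_hello', python_callable=say_hello, dag=toy_dag)

# the >> operator declares the dependency edge that makes this a DAG
start >> hello
```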