├── images
│   ├── SparfiyDAG.jpg
│   └── Pipelineview.jpg
├── plugins
│   ├── helpers
│   │   ├── __init__.py
│   │   └── sql_queries.py
│   ├── operators
│   │   ├── __init__.py
│   │   ├── load_dimension.py
│   │   ├── load_fact.py
│   │   ├── data_quality.py
│   │   └── stage_redshift.py
│   └── __init__.py
├── create_tables.py
├── dags
│   ├── sql_statements.py
│   └── Sparkify_Data_Pipeline_dag.py
└── README.md
/images/SparfiyDAG.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddgope/Data-Pipelines-with-Airflow/HEAD/images/SparfiyDAG.jpg -------------------------------------------------------------------------------- /images/Pipelineview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddgope/Data-Pipelines-with-Airflow/HEAD/images/Pipelineview.jpg -------------------------------------------------------------------------------- /plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | 3 | __all__ = [ 4 | 'SqlQueries', 5 | ] -------------------------------------------------------------------------------- /plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.stage_redshift import StageToRedshiftOperator 2 | from operators.load_fact import LoadFactOperator 3 | from operators.load_dimension import LoadDimensionOperator 4 | from operators.data_quality import DataQualityOperator 5 | 6 | __all__ = [ 7 | 'StageToRedshiftOperator', 8 | 'LoadFactOperator', 9 | 'LoadDimensionOperator', 10 | 'DataQualityOperator' 11 | ] 12 | -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | import helpers 7 | 8 | # Defining the plugin class 9 | class UdacityPlugin(AirflowPlugin): 10 | name = "udacity_plugin" 11 | operators = [ 12 | operators.StageToRedshiftOperator, 13 | operators.LoadFactOperator, 14 | operators.LoadDimensionOperator, 15 | operators.DataQualityOperator 16 | ] 17 | helpers = [ 18 | helpers.SqlQueries 19 | ] 20 | -------------------------------------------------------------------------------- /create_tables.py: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS artists ( 2 | artistid varchar(256) NOT NULL, 3 | name varchar(256), 4 | location varchar(256), 5 | lattitude numeric(18,0), 6 | longitude numeric(18,0) 7 | ); 8 | 9 | CREATE TABLE IF NOT EXISTS songplays ( 10 | playid varchar(32) NOT NULL, 11 | start_time timestamp NOT NULL, 12 | userid int4 NOT NULL, 13 | "level" varchar(256), 14 | songid varchar(256), 15 | artistid varchar(256), 16 | sessionid int4, 17 | location varchar(256), 18 | user_agent varchar(256), 19 | CONSTRAINT songplays_pkey PRIMARY KEY (playid) 20 | 21 | ); 22 | 23 | CREATE TABLE IF NOT EXISTS songs ( 24 | songid varchar(256) NOT NULL, 25 | title varchar(256), 26 | artistid varchar(256), 27 | "year" int4, 28 | duration numeric(18,0), 29 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 30 | ); 31 | 32 | CREATE TABLE IF NOT EXISTS users ( 33 | userid int4 NOT NULL, 34 | first_name varchar(256), 35 | last_name varchar(256), 36 | gender varchar(256), 37 | "level" varchar(256),
38 | CONSTRAINT users_pkey PRIMARY KEY (userid) 39 | ); 40 | 41 | 42 | CREATE TABLE IF NOT EXISTS time( 43 | start_time timestamp NOT NULL, 44 | hour integer, 45 | day integer, 46 | week integer, 47 | month integer, 48 | year integer, 49 | dayofweek integer) 50 | 51 | -------------------------------------------------------------------------------- /plugins/operators/load_dimension.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from airflow.hooks.postgres_hook import PostgresHook 3 | from airflow.models import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | class LoadDimensionOperator(BaseOperator): 7 | ui_color = '#80BD9E' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | redshift_conn_id="", 12 | table_name="", 13 | sql_statement="", 14 | append_data=False, 15 | *args, **kwargs): 16 | 17 | super(LoadDimensionOperator, self).__init__(*args, **kwargs) 18 | self.redshift_conn_id = redshift_conn_id 19 | self.table_name = table_name 20 | self.sql_statement = sql_statement 21 | self.append_data = append_data 22 | 23 | def execute(self, context): 24 | self.log.info('Loading dimension table {}'.format(self.table_name)) 25 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 26 | if self.append_data:  # append mode: add rows without clearing the table 27 | sql_statement = 'INSERT INTO %s %s' % (self.table_name, self.sql_statement) 28 | redshift.run(sql_statement) 29 | else:  # truncate-insert mode: empty the table before loading 30 | sql_statement = 'TRUNCATE TABLE %s;' % (self.table_name) 31 | sql_statement = sql_statement + 'INSERT INTO %s %s' % (self.table_name, self.sql_statement) 32 | redshift.run(sql_statement) 33 | -------------------------------------------------------------------------------- /plugins/helpers/sql_queries.py: -------------------------------------------------------------------------------- 1 | class SqlQueries: 2 | songplay_table_insert = (""" 3 | SELECT 4 | md5(events.sessionid || events.start_time) songplay_id, 5 | events.start_time, 6 | events.userid, 7 | events.level, 8 | songs.song_id, 9 | songs.artist_id, 10 | events.sessionid, 11 | events.location, 12 | events.useragent 13 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 14 | FROM staging_events 15 | WHERE page='NextSong') events 16 | LEFT JOIN staging_songs songs 17 | ON events.song = songs.title 18 | AND events.artist = songs.artist_name 19 | AND events.length = songs.duration 20 | """) 21 | 22 | user_table_insert = (""" 23 | SELECT distinct userid, firstname, lastname, gender, level 24 | FROM staging_events 25 | WHERE page='NextSong' 26 | """) 27 | 28 | song_table_insert = (""" 29 | SELECT distinct song_id, title, artist_id, year, duration 30 | FROM staging_songs 31 | """) 32 | 33 | artist_table_insert = (""" 34 | SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude 35 | FROM staging_songs 36 | """) 37 | 38 | time_table_insert = (""" 39 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 40 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time) 41 | FROM songplays 42 | """) -------------------------------------------------------------------------------- /plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadFactOperator(BaseOperator):
6 | ui_color = '#F98866' 7 | songplay_table_insert = (""" 8 | INSERT INTO songplays (playid,start_time,userid,level,songid,artistid,sessionid,location,user_agent) 9 | SELECT Distinct 10 | md5(events.ts) songplay_id, 11 | events.start_time, 12 | events.userid, 13 | events.level, 14 | songs.song_id, 15 | songs.artist_id, 16 | events.sessionid, 17 | events.location, 18 | events.useragent 19 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 20 | FROM staging_events 21 | WHERE page='NextSong') events 22 | LEFT JOIN staging_songs songs 23 | ON events.song = songs.title 24 | AND events.artist = songs.artist_name 25 | AND events.length = songs.duration 26 | WHERE (songs.song_id<>'' or songs.artist_id<>'') 27 | AND length(events.userid)>0 28 | """) 29 | 30 | @apply_defaults 31 | def __init__(self, 32 | redshift_conn_id="", 33 | *args, **kwargs): 34 | 35 | super(LoadFactOperator, self).__init__(*args, **kwargs) 36 | self.redshift_conn_id = redshift_conn_id 37 | 38 | def execute(self, context): 39 | self.log.info('Loading the songplays fact table') 40 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 41 | facts_sql = LoadFactOperator.songplay_table_insert 42 | redshift.run(facts_sql) 43 | -------------------------------------------------------------------------------- /plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from airflow.hooks.postgres_hook import PostgresHook 3 | from airflow.models import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | class DataQualityOperator(BaseOperator): 7 | ui_color = '#89DA59' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | # Parameters: the Redshift connection id, the check SQL to run, 12 | # the expected value, and a short description used in the logs. 13 | # The check compares the first cell of the query result with expected_value. 14 | redshift_conn_id = "", 15 | check_sql="", 16 | expected_value="", 17 | describe="", 18 | *args, **kwargs): 19 | self.redshift_conn_id = redshift_conn_id 20 | self.check_sql = check_sql 21 | self.expected_value = expected_value 22 | self.describe = describe 23 | 24 | super(DataQualityOperator, self).__init__(*args, **kwargs) 25 | # The hook is created and the check executed in execute(), so a 26 | # misconfigured connection or a failing check surfaces at task run 27 | # time rather than at DAG parse time. 28 | 29 | 30 | def execute(self, context): 31 | redshift_hook = PostgresHook(self.redshift_conn_id) 32 | 33 | self.log.info("\n".join([ 34 | 'DataQuality check', 35 | self.describe, 36 | 'expected value is {}'.format(self.expected_value), 37 | self.check_sql 38 | ])) 39 | 40 | records = redshift_hook.get_records(self.check_sql) 41 | if len(records) < 1 or len(records[0]) < 1:  # distinguish "no rows returned" from a zero count 42 | raise ValueError("Data quality check failed. The query returned no results") 43 | if int(self.expected_value) != records[0][0]: 44 | raise ValueError(f"Data quality check failed. \n expected: {self.expected_value} \n actual: {records[0][0]}")
45 | self.log.info(f"Data quality on \n {self.describe} \n check passed with \n expected: {self.expected_value} \n actual: {records[0][0]}") 46 | -------------------------------------------------------------------------------- /plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | from airflow.contrib.hooks.aws_hook import AwsHook 5 | 6 | ''' 7 | StageToRedshiftOperator: a custom operator that loads JSON-formatted files from S3 into Amazon Redshift. 8 | It builds and runs a SQL COPY statement based on the parameters provided. 9 | ''' 10 | 11 | class StageToRedshiftOperator(BaseOperator): 12 | ui_color = '#358140' 13 | template_fields = ("s3_key",) 14 | copy_sql = """ 15 | COPY {} 16 | FROM '{}' 17 | ACCESS_KEY_ID '{}' 18 | SECRET_ACCESS_KEY '{}' 19 | FORMAT AS JSON '{}' 20 | TIMEFORMAT AS 'epochmillisecs' 21 | region 'us-west-2' 22 | """ 23 | @apply_defaults 24 | def __init__(self, 25 | redshift_conn_id="", 26 | aws_credentials_id="", 27 | table="", 28 | s3_bucket="", 29 | s3_key="", 30 | delimiter=",", 31 | ignore_headers=1, 32 | JSONPaths="", 33 | *args, **kwargs): 34 | 35 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 36 | self.table = table 37 | self.redshift_conn_id = redshift_conn_id 38 | self.s3_bucket = s3_bucket 39 | self.s3_key = s3_key 40 | self.delimiter = delimiter 41 | self.ignore_headers = ignore_headers 42 | self.aws_credentials_id = aws_credentials_id 43 | self.JSONPaths = JSONPaths 44 | 45 | def execute(self, context): 46 | aws_hook = AwsHook(self.aws_credentials_id) 47 | credentials = aws_hook.get_credentials() 48 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 49 | 50 | self.log.info("Clearing data from destination Redshift table") 51 | redshift.run("DELETE FROM {}".format(self.table)) 52 | 53 | self.log.info("Copying data from S3 to Redshift") 54 | rendered_key = self.s3_key.format(**context) 55 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) 56 | json_path = "s3://{}/{}".format(self.s3_bucket, self.JSONPaths) 57 | if rendered_key=="song_data": # the song JSON files have no JSONPaths manifest, so COPY falls back to 'auto' 58 | json_path = self.JSONPaths 59 | formatted_sql = StageToRedshiftOperator.copy_sql.format( 60 | self.table, 61 | s3_path, 62 | credentials.access_key, 63 | credentials.secret_key, 64 | #self.ignore_headers, #this is used for CSV files 65 | #self.delimiter, #this is used for CSV files 66 | json_path 67 | ) 68 | redshift.run(formatted_sql) -------------------------------------------------------------------------------- /dags/sql_statements.py: -------------------------------------------------------------------------------- 1 | # Below are the tables that need to be created 2 | CREATE_staging_events_TABLE_SQL = """ 3 | CREATE TABLE IF NOT EXISTS staging_events ( 4 | artist varchar(256), 5 | auth varchar(256), 6 | firstname varchar(256), 7 | gender varchar(256), 8 | iteminsession int4, 9 | lastname varchar(256), 10 | length numeric(18,0), 11 | "level" varchar(256), 12 | location varchar(256), 13 | "method" varchar(256), 14 | page varchar(256), 15 | registration numeric(18,0), 16 | sessionid int4, 17 | song varchar(256), 18 | status int4, 19 | ts int8, 20 | useragent varchar(256), 21 | userid
int4) 22 | """ 23 | 24 | CREATE_staging_songs_TABLE_SQL = """ 25 | CREATE TABLE IF NOT EXISTS staging_songs ( 26 | num_songs int4, 27 | artist_id varchar(256), 28 | artist_name varchar(256), 29 | artist_latitude numeric(18,0), 30 | artist_longitude numeric(18,0), 31 | artist_location varchar(256), 32 | song_id varchar(256), 33 | title varchar(256), 34 | duration numeric(18,0), 35 | "year" int4) 36 | """ 37 | 38 | CREATE_artists_TABLE_SQL = """ 39 | CREATE TABLE IF NOT EXISTS artists ( 40 | artistid varchar(256) NOT NULL, 41 | name varchar(256), 42 | location varchar(256), 43 | lattitude numeric(18,0), 44 | longitude numeric(18,0) 45 | ) 46 | """ 47 | 48 | CREATE_songplays_TABLE_SQL = """ 49 | CREATE TABLE IF NOT EXISTS songplays ( 50 | playid varchar(32) NOT NULL, 51 | start_time timestamp NOT NULL, 52 | userid int4 NOT NULL, 53 | "level" varchar(256), 54 | songid varchar(256), 55 | artistid varchar(256), 56 | sessionid int4, 57 | location varchar(256), 58 | user_agent varchar(256), 59 | CONSTRAINT songplays_pkey PRIMARY KEY (playid) 60 | ); 61 | """ 62 | 63 | CREATE_songs_TABLE_SQL = """ 64 | CREATE TABLE IF NOT EXISTS songs ( 65 | songid varchar(256) NOT NULL, 66 | title varchar(256), 67 | artistid varchar(256), 68 | "year" int4, 69 | duration numeric(18,0), 70 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 71 | ); 72 | """ 73 | 74 | CREATE_users_TABLE_SQL = """ 75 | CREATE TABLE IF NOT EXISTS users ( 76 | userid int4 NOT NULL, 77 | first_name varchar(256), 78 | last_name varchar(256), 79 | gender varchar(256), 80 | "level" varchar(256), 81 | CONSTRAINT users_pkey PRIMARY KEY (userid) 82 | ); 83 | """ 84 | 85 | CREATE_time_TABLE_SQL = (""" 86 | CREATE TABLE IF NOT EXISTS time( 87 | start_time timestamp NOT NULL, 88 | hour integer, 89 | day integer, 90 | week integer, 91 | month integer, 92 | year integer, 93 | dayofweek integer) 94 | """) 95 | 96 | #Below are the insert statements 97 | songplay_table_insert = (""" 98 | SELECT Distinct 99 | md5(events.start_time) songplay_id, 100 | events.start_time, 101 | events.userid, 102 | events.level, 103 | songs.song_id, 104 | songs.artist_id, 105 | events.sessionid, 106 | events.location, 107 | events.useragent 108 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 109 | FROM staging_events 110 | WHERE page='NextSong') events 111 | LEFT JOIN staging_songs songs 112 | ON events.song = songs.title 113 | AND events.artist = songs.artist_name 114 | AND events.length = songs.duration 115 | WHERE (songs.song_id<>'' or songs.artist_id<>'') 116 | AND length(events.userid)>0 117 | """) 118 | 119 | user_table_insert = (""" 120 | SELECT distinct userid, firstname, lastname, gender, level 121 | FROM staging_events 122 | WHERE page='NextSong' 123 | """) 124 | 125 | song_table_insert = (""" 126 | SELECT distinct song_id, title, artist_id, year, duration 127 | FROM staging_songs 128 | """) 129 | 130 | artist_table_insert = (""" 131 | SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude 132 | FROM staging_songs 133 | """) 134 | 135 | time_table_insert = (""" 136 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 137 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time) 138 | FROM songplays 139 | """) -------------------------------------------------------------------------------- /dags/Sparkify_Data_Pipeline_dag.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.postgres_operator import PostgresOperator 6 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, 7 | LoadDimensionOperator, DataQualityOperator) 8 | import sql_statements 9 | 10 | # AWS_KEY = os.environ.get('AWS_KEY') 11 | # AWS_SECRET = os.environ.get('AWS_SECRET') 12 | 13 | default_args = { 14 | 'owner': 'udacity', 15 | 'depends_on_past': False, 16 | 'start_date': datetime(2019, 7, 26), 17 | #'end_date': datetime(2018, 11, 12), 18 | 'email': ['airflow@example.com'], 19 | 'email_on_failure': False, 20 | 'email_on_retry': False, 21 | 'retries': 1, 22 | 'retry_delay': timedelta(minutes=1), 23 | } 24 | 25 | 26 | dag = DAG('Sparkify_Data_Pipeline_dag', 27 | default_args=default_args, 28 | description='Load and transform data in Redshift with Airflow', 29 | schedule_interval='0 0 * * *', 30 | catchup=False  # do not backfill missed runs; catchup is a DAG argument, not a default_args key 31 | ) 32 | start_operator = DummyOperator(task_id='Begin_execution', dag=dag) 33 | 34 | create_staging_events_table = PostgresOperator( 35 | task_id="create_staging_events_table", 36 | dag=dag, 37 | postgres_conn_id="redshift", 38 | sql=sql_statements.CREATE_staging_events_TABLE_SQL 39 | ) 40 | 41 | stage_events_to_redshift = StageToRedshiftOperator( 42 | task_id="Stage_events_from_s3_to_redshift", 43 | dag=dag, 44 | table="staging_events", 45 | redshift_conn_id="redshift", 46 | aws_credentials_id="aws_credentials", 47 | s3_bucket="udacity-dend", 48 | #s3_key="log_data/2018/11/2018-11-01-events.json" 49 | # .strftime("%d-%m-%Y") 50 | #s3_key="log_data/{execution_date.year}/{execution_date.month}/{execution_date.year}-{execution_date.month}-{execution_date.day}-events.json" 51 | s3_key="log_data", 52 | JSONPaths="log_json_path.json" 53 | ) 54 | 55 | create_staging_songs_table = PostgresOperator( 56 | task_id="create_staging_songs_table", 57 | dag=dag, 58 | postgres_conn_id="redshift", 59 | sql=sql_statements.CREATE_staging_songs_TABLE_SQL 60 | ) 61 | 62 | stage_songs_to_redshift = StageToRedshiftOperator( 63 | task_id="Stage_songs_from_s3_to_redshift", 64 | dag=dag, 65 | table="staging_songs", 66 | redshift_conn_id="redshift", 67 | aws_credentials_id="aws_credentials", 68 | s3_bucket="udacity-dend", 69 | s3_key="song_data", 70 | JSONPaths="auto" 71 | ) 72 | 73 | load_songplays_table = LoadFactOperator( 74 | task_id='Load_songplays_fact_table', 75 | dag=dag, 76 | redshift_conn_id="redshift" 77 | ) 78 | 79 | load_user_dimension_table = LoadDimensionOperator( 80 | task_id='Load_user_dim_table', 81 | dag=dag, 82 | redshift_conn_id="redshift", 83 | table_name="users", 84 | sql_statement=sql_statements.user_table_insert, 85 | append_data=True 86 | ) 87 | 88 | load_song_dimension_table = LoadDimensionOperator( 89 | task_id='Load_song_dim_table', 90 | dag=dag, 91 | redshift_conn_id="redshift", 92 | table_name="songs", 93 | sql_statement=sql_statements.song_table_insert, 94 | append_data=True 95 | ) 96 | 97 | load_artist_dimension_table = LoadDimensionOperator( 98 | task_id='Load_artist_dim_table', 99 | dag=dag, 100 | redshift_conn_id="redshift", 101 | table_name="artists", 102 | sql_statement=sql_statements.artist_table_insert, 103 | append_data=True 104 | ) 105 | 106 | load_time_dimension_table = LoadDimensionOperator( 107 | task_id='Load_time_dim_table', 108 | dag=dag, 109 | redshift_conn_id="redshift",
110 | table_name="time", 111 | sql_statement=sql_statements.time_table_insert, 112 | append_data=True 113 | ) 114 | 115 | run_quality_checks = DataQualityOperator( 116 | task_id='Run_data_quality_checks', 117 | dag=dag, 118 | redshift_conn_id="redshift", 119 | check_sql="SELECT COUNT(*) FROM songplays", 120 | expected_value="320", 121 | describe="Fact table songplays - check that the table contains data" 122 | ) 123 | 124 | end_operator = DummyOperator(task_id='Stop_execution', dag=dag) 125 | 126 | start_operator >> create_staging_events_table 127 | start_operator >> create_staging_songs_table 128 | 129 | create_staging_events_table >> stage_events_to_redshift 130 | create_staging_songs_table >> stage_songs_to_redshift 131 | 132 | stage_events_to_redshift >> load_songplays_table 133 | stage_songs_to_redshift >> load_songplays_table 134 | 135 | load_songplays_table >> load_user_dimension_table 136 | load_songplays_table >> load_song_dimension_table 137 | load_songplays_table >> load_artist_dimension_table 138 | load_songplays_table >> load_time_dimension_table 139 | 140 | load_user_dimension_table >> run_quality_checks 141 | load_song_dimension_table >> run_quality_checks 142 | load_artist_dimension_table >> run_quality_checks 143 | load_time_dimension_table >> run_quality_checks 144 | 145 | run_quality_checks >> end_operator -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Nanodegree 2 | ## Project: Data Pipelines with Airflow 3 | ## Table of Contents 4 | * **Definition** 5 | * **Project Overview** : 6 | A music streaming company, Sparkify, has decided that it is time to introduce more automation and monitoring to their data warehouse ETL pipelines and has concluded that the best tool to achieve this is Apache Airflow. 7 | 8 | * **Problem Statement** : 9 | Sparkify wants to create high-grade data pipelines that are dynamic, built from reusable tasks, can be monitored, and allow easy backfills. They have also noted that data quality plays a big part when analyses are executed on top of the data warehouse, and they want to run tests against their datasets after the ETL steps have been executed to catch any discrepancies. 10 | The source data resides in S3 and needs to be processed in Sparkify's data warehouse in Amazon Redshift. The source datasets consist of JSON logs that describe user activity in the application and JSON metadata about the songs the users listen to. 11 | 12 | 13 | * **Design** 14 | * **ETL Design Principles** 15 | 1. Partition Data Tables: Data partitioning can be especially useful when dealing with large tables with a long history. When data is partitioned using datestamps, we can leverage dynamic partitions to parallelize backfilling. 16 | 1. Load Data Incrementally: This principle makes ETL more modular and manageable, especially when building dimension tables from the fact tables. In each run, we only need to append the new transactions to the dimension table from the previous date partition instead of scanning the entire fact history. 17 | 1. Enforce Idempotency: Many data scientists rely on point-in-time snapshots to perform historical analysis. This means the underlying source table should not be mutable as time progresses, otherwise we would get a different answer. Pipelines should be built so that the same query, when run against the same business logic and time range, returns the same result. 18 | 1. Parameterize Workflow: Just as templates simplify the organization of HTML pages, Jinja can be used in conjunction with SQL. One common use of a Jinja template is to incorporate backfilling logic into a typical query (a sketch follows this list). 19 | 1. Add Data Checks Early and Often: When processing data, it is useful to write data into a staging table, check the data quality, and only then exchange the staging table with the final production table. Checks in this 3-step paradigm are important defensive mechanisms: they can be simple checks such as counting whether the total number of records is greater than 0, or something as complex as an anomaly detection system that checks for unseen categories or outliers. 20 | 1. Build Useful Alerts & Monitoring System: Since ETL jobs can often take a long time to run, it is useful to add alerts and monitoring to them so we do not have to keep an eye on the progress of the DAG constantly. EmailOperator can be used to send alert emails for jobs missing SLAs. 21 | 
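To make principles 1-4 concrete, here is a minimal sketch of an idempotent, Jinja-parameterized daily load. It is illustrative only: the `daily_songplays` table, the DAG id, and the task name are assumptions, not objects defined in this repository.

```python
# A hedged sketch: delete-then-insert one date partition per run, so a rerun
# or a backfill of the same execution date always produces the same rows.
from datetime import datetime
from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator

# {{ ds }} is rendered by Airflow's Jinja templating into the run's execution date.
LOAD_DAILY_PARTITION_SQL = """
DELETE FROM daily_songplays WHERE DATE(start_time) = '{{ ds }}';
INSERT INTO daily_songplays
SELECT * FROM songplays WHERE DATE(start_time) = '{{ ds }}';
"""

example_dag = DAG(
    'idempotent_partition_load_example',
    start_date=datetime(2019, 7, 26),
    schedule_interval='@daily',
)

load_daily_partition = PostgresOperator(
    task_id='load_daily_partition',
    dag=example_dag,
    postgres_conn_id='redshift',
    sql=LOAD_DAILY_PARTITION_SQL,  # sql is a templated field, so Jinja fills in {{ ds }}
)
```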
22 | * **Building Pipeline** : 23 | It is often useful to visualize complex data flows using a graph. Visually, a node in a graph represents a task, and an arrow represents the dependency of one task on another. Given that data only needs to be computed once on a given task and the computation then carries forward, the graph is directed and acyclic. This is why Airflow jobs are commonly referred to as "DAGs" (Directed Acyclic Graphs). ![Sparkify DAG](/images/SparfiyDAG.jpg) 24 | The Airflow UI allows users to visualize the DAG in a graph view. The author of a data pipeline must define the structure of dependencies among tasks in order to visualize them. This specification is often written in a file called the DAG definition file, which lays out the anatomy of an Airflow job. 25 | While DAGs describe how to run a data pipeline, operators describe what to do in a data pipeline. Typically, there are three broad categories of operators: 26 | 1. Sensors: wait for a certain time, an external file, or an upstream data source 27 | 2. Operators: trigger a certain action (e.g. run a bash command, execute a Python function, or execute a Hive query) 28 | 3. Transfers: move data from one location to another 29 | 30 | For this project, I have built four different operators that will stage the data, transform the data, and run checks on data quality. 31 | * **StageToRedshift Operator:** The stage operator is expected to be able to load any JSON and CSV formatted files from S3 to Amazon Redshift. The operator creates and runs a SQL COPY statement based on the parameters provided. The operator's parameters should specify where in S3 the file is located and what the target table is. The parameters should also be used to distinguish between JSON and CSV files. Another important requirement of the stage operator is a templated field that allows it to load timestamped files from S3 based on the execution time and run backfills (see the sketch after this section). 32 | * **LoadFactOperator:** 33 | With the dimension and fact operators, you can utilize the provided SQL helper class to run data transformations. Most of the logic is within the SQL transformations, and the operator is expected to take as input a SQL statement and the target database to run the query against. You can also define a target table that will contain the results of the transformation. 34 | * **LoadDimensionOperator:** Dimension loads are often done with the truncate-insert pattern where the target table is emptied before the load. Thus, you could also have a parameter that allows switching between insert modes when loading dimensions. Fact tables are usually so massive that they should only allow append-type functionality. 35 | * **DataQualityOperator:** The final operator to create is the data quality operator, which is used to run checks on the data itself. The operator's main functionality is to receive one or more SQL-based test cases along with the expected results and execute the tests. For each test, the test result and expected result need to be checked, and if there is no match, the operator should raise an exception and the task should retry and eventually fail. For example, one test could be a SQL statement that checks whether a certain column contains NULL values by counting all the rows that have NULL in that column. We do not want any NULLs, so the expected result would be 0 and the test would compare the SQL statement's outcome to the expected result. 36 | ![Sparkify Pipeline View](/images/Pipelineview.jpg) 37 | 
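As a usage illustration for the operators described above, the sketch below stages one day's log files by templating `s3_key` (this is what enables execution-date-driven backfills) and adds a NULL-count quality check. It is a hedged sketch, not part of the repository: the zero-padded S3 key layout and the NULL check are assumptions, the connection ids mirror `dags/Sparkify_Data_Pipeline_dag.py`, and the zero-count check relies on `data_quality.py` treating an empty result set, rather than a zero count, as "no results".

```python
# Hypothetical task definitions in the style of dags/Sparkify_Data_Pipeline_dag.py;
# `dag` is assumed to be the DAG object defined in that file.
from airflow.operators import StageToRedshiftOperator, DataQualityOperator

stage_events_daily = StageToRedshiftOperator(
    task_id='Stage_daily_events_from_s3_to_redshift',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    # s3_key is a templated field: each scheduled or backfill run formats its
    # own execution date into the S3 prefix (zero-padded layout assumed here).
    s3_key='log_data/{execution_date.year}/{execution_date.month:02}/'
           '{execution_date.year}-{execution_date.month:02}-{execution_date.day:02}-events.json',
    JSONPaths='log_json_path.json',
)

check_userid_not_null = DataQualityOperator(
    task_id='Check_songplays_userid_not_null',
    dag=dag,
    redshift_conn_id='redshift',
    check_sql='SELECT COUNT(*) FROM songplays WHERE userid IS NULL',
    expected_value='0',  # no NULL user ids expected in the fact table
    describe='Fact table songplays - userid must not contain NULLs',
)
```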
38 | * **How to Run** : Open a terminal and run the steps below 39 | 1. create_cluster.ipynb 40 | 1. Open the dwh.cfg and provide the AWS access keys and secret 41 | 2. Launch a Redshift cluster using create_cluster.ipynb and create an IAM role that has read access to S3. 42 | 3. Add the Redshift database details (host, dbname, dbuser, password, port, etc.) and the IAM role ARN to dwh.cfg. 43 | 1. python create_tables.py 44 | 1. python etl.py 45 | 1. analysis.ipynb - run all your analyses 46 | 47 | * **Final Result / Analysis** : Now the Sparkify analytics team can run multiple queries using the data_analysis.ipynb notebook, or users can connect any tool such as Amazon QuickSight, Power BI, or Tableau to the Redshift cluster. They can do what-if analysis or slice and dice the data as per their requirements. 48 | 1. How many users are currently listening to songs? 49 | 1. How are the users distributed across geographies? 50 | 1. Which songs are they playing? 51 | 52 | * **Software Requirements** : This project uses the following software and Python libraries: 53 | 1. Python 3 54 | 1. psycopg2 55 | 1. Amazon Redshift 56 | 57 | You will also need to have software installed to run and execute a Jupyter Notebook. 58 | If you do not have Python installed yet, it is highly recommended that you install the Anaconda distribution of Python, which already has the above packages and more included. 59 | 60 | * **Acknowledgement** : Credit to Udacity for the project. You can't use this for your Udacity capstone project; otherwise, feel free to use the code here as you would like! 61 | 62 | * **Bonus** : Here are a few key concepts for Airflow (a minimal sketch follows the list): 63 | 1. DAG (Directed Acyclic Graph): a workflow which glues all the tasks together with inter-dependencies. 64 | 1. Operator: a template for a specific type of work to be executed. For example, BashOperator represents how to execute a bash script while PythonOperator represents how to execute a Python function, etc. 65 | 1. Sensor: a special type of operator which will only execute if a certain condition is met. 66 | 1. Task: a parameterized instance of an operator/sensor which represents a unit of actual work to be executed. 67 | 1. Plugin: an extension to allow users to easily extend Airflow with various custom hooks, operators, sensors, macros, and web views. 68 | 1. Pools: concurrency limit configuration for a set of Airflow tasks. 69 | --------------------------------------------------------------------------------
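To tie the Bonus concepts together, here is a minimal, self-contained toy DAG. It is not part of this repository and every name in it is illustrative; it simply shows a DAG object, two tasks instantiated from operator templates, and the dependency between them.

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

def say_hello():
    # the callable that the PythonOperator task will execute
    print('hello from a PythonOperator task')

toy_dag = DAG(
    'toy_concepts_dag',                 # the DAG: a workflow of inter-dependent tasks
    start_date=datetime(2019, 7, 26),
    schedule_interval='@daily',
)

# tasks are parameterized instances of operator templates
start = DummyOperator(task_id='start', dag=toy_dag)
hello = PythonOperator(task_id='say_hello', python_callable=say_hello, dag=toy_dag)

# the >> operator declares the dependency edge that makes this a DAG
start >> hello
```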