├── .gitignore ├── README.md ├── Song_ERD.png ├── admin-connections.png ├── create-connection.png ├── create_tables.sql ├── dags └── airflow_dag.py ├── example-dag.png ├── plugins ├── __init__.py ├── helpers │ ├── __init__.py │ └── sql_queries.py └── operators │ ├── __init__.py │ ├── data_quality.py │ ├── load_dimension.py │ ├── load_fact.py │ └── stage_redshift.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Perso 2 | .DS_Store 3 | .idea/ 4 | *.cfg 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DEND-Data_Pipeline_Airflow 2 | ## Loading files from S3 into Redshift with an Airflow ETL pipeline 3 | 4 | The purpose of this project is to load JSON data from S3 and wrangle it into a star schema (see the ERD below), with the whole pipeline written as Python code using [AirFlow](https://airflow.apache.org/). 5 | 6 | ### Prerequisite 7 | 8 | 1. Install [Docker](https://www.docker.com/get-started). 9 | 10 | 2. Run the project with **Docker**: 11 | ``` 12 | docker run -d -p 8080:8080 -v /path/to/project/dags:/usr/local/airflow/dags -v /path/to/project/plugins:/usr/local/airflow/plugins -v /path/to/project/requirements.txt:/requirements.txt --name airflow puckel/docker-airflow webserver 13 | ``` 14 | 15 | Everything is now set up to launch Airflow. 16 | 17 | 3. You also need to configure your AWS credentials in the Airflow UI: 18 | 19 | We'll use Airflow's UI to configure your AWS credentials and the connection to Redshift. 20 | 21 | 1. Go to the Airflow UI: 22 | ![AirFlow Admin Panel](./admin-connections.png) 23 | 24 | 2. Under Connections, select Create. 25 | ![AirFlow Connection Panel](./create-connection.png) 26 | 27 | 3. On the create connection page, enter the following values: 28 | 29 | * **Conn Id**: Enter `aws_credentials`. 30 | * **Conn Type**: Enter `Amazon Web Services`. 31 | * **Login**: Enter your **Access key ID** from the IAM User credentials you downloaded. 32 | * **Password**: Enter your **Secret access key** from the IAM User credentials you downloaded. 33 | 34 | Once you've entered these values, select Save and Add Another, then add a second connection with **Conn Id** `redshift` and **Conn Type** `Postgres` pointing at your cluster, since the DAG looks up both connections by id (`aws_credentials` and `redshift`). A scripted alternative is sketched below.
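If you prefer to script this step instead of clicking through the UI, a minimal sketch along these lines should work with the puckel/docker-airflow image used above (Airflow 1.x). The connection ids (`aws_credentials`, `redshift`) are the ones the DAG expects; the script name, keys, endpoint, database, user and password are placeholders you must replace with your own values.

```
# create_connections.py (hypothetical helper; run it inside the container, e.g.
#   docker exec -it airflow python create_connections.py)
from airflow import settings
from airflow.models import Connection

# Placeholder values: swap in your own IAM keys and Redshift cluster endpoint.
aws_credentials = Connection(
    conn_id="aws_credentials",
    conn_type="aws",
    login="YOUR_ACCESS_KEY_ID",
    password="YOUR_SECRET_ACCESS_KEY",
)
redshift = Connection(
    conn_id="redshift",
    conn_type="postgres",
    host="your-cluster.xxxxxxxxxxxx.us-west-2.redshift.amazonaws.com",
    schema="dev",
    login="awsuser",
    password="YOUR_DB_PASSWORD",
    port=5439,
)

# Persist both connections in Airflow's metadata database.
session = settings.Session()
session.add(aws_credentials)
session.add(redshift)
session.commit()
```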
35 | 36 | 37 | 38 | ### Main Goal 39 | The company Sparkify needs to analyze its data to better understand how its users (free and paid) use its services. This data pipeline makes that ETL easier to build, schedule and monitor. 40 | 41 | ### Data Pipeline 42 | ![Dag](./example-dag.png) 43 | 44 | This data pipeline is easy to read and understand, even for a newcomer to Airflow. 45 | 46 | ### Data Model 47 | 48 | The pipeline ultimately builds the star schema below to make data analysis easier. 49 | 50 | ![ERD](./Song_ERD.png) 51 | 52 | ### Run it 53 | Just a few steps: 54 | 55 | A few seconds after starting your container, go to the [AirFlow UI](http://localhost:8080/admin/) and switch on the `redshift_ETL_dag` DAG. 56 | -------------------------------------------------------------------------------- /Song_ERD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gfelot/DEND-Data_Pipeline_Airflow/046b717363e0d02c3667a456a1cf399bc65d738d/Song_ERD.png -------------------------------------------------------------------------------- /admin-connections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gfelot/DEND-Data_Pipeline_Airflow/046b717363e0d02c3667a456a1cf399bc65d738d/admin-connections.png -------------------------------------------------------------------------------- /create-connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gfelot/DEND-Data_Pipeline_Airflow/046b717363e0d02c3667a456a1cf399bc65d738d/create-connection.png -------------------------------------------------------------------------------- /create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE public.artists ( 2 | artistid varchar(256) NOT NULL, 3 | name varchar(256), 4 | location varchar(256), 5 | latitude numeric(18,0), 6 | longitude numeric(18,0) 7 | ); 8 | 9 | CREATE TABLE public.songplays ( 10 | playid varchar(32) NOT NULL, 11 | start_time timestamp NOT NULL, 12 | userid int4 NOT NULL, 13 | "level" varchar(256), 14 | songid varchar(256), 15 | artistid varchar(256), 16 | sessionid int4, 17 | location varchar(256), 18 | user_agent varchar(256), 19 | CONSTRAINT songplays_pkey PRIMARY KEY (playid) 20 | ); 21 | 22 | CREATE TABLE public.songs ( 23 | songid varchar(256) NOT NULL, 24 | title varchar(256), 25 | artistid varchar(256), 26 | "year" int4, 27 | duration numeric(18,0), 28 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 29 | ); 30 | 31 | CREATE TABLE public.staging_events ( 32 | artist varchar(256), 33 | auth varchar(256), 34 | firstname varchar(256), 35 | gender varchar(256), 36 | iteminsession int4, 37 | lastname varchar(256), 38 | length numeric(18,0), 39 | "level" varchar(256), 40 | location varchar(256), 41 | "method" varchar(256), 42 | page varchar(256), 43 | registration numeric(18,0), 44 | sessionid int4, 45 | song varchar(256), 46 | status int4, 47 | ts
int8, 48 | useragent varchar(256), 49 | userid int4 50 | ); 51 | 52 | CREATE TABLE public.staging_songs ( 53 | num_songs int4, 54 | artist_id varchar(256), 55 | artist_name varchar(256), 56 | artist_latitude numeric(18,0), 57 | artist_longitude numeric(18,0), 58 | artist_location varchar(256), 59 | song_id varchar(256), 60 | title varchar(256), 61 | duration numeric(18,0), 62 | "year" int4 63 | ); 64 | 65 | CREATE TABLE public."time" ( 66 | start_time timestamp NOT NULL, 67 | "hour" int4, 68 | "day" int4, 69 | week int4, 70 | "month" varchar(256), 71 | "year" int4, 72 | weekday varchar(256), 73 | CONSTRAINT time_pkey PRIMARY KEY (start_time) 74 | ) ; 75 | 76 | CREATE TABLE public.users ( 77 | userid int4 NOT NULL, 78 | first_name varchar(256), 79 | last_name varchar(256), 80 | gender varchar(256), 81 | "level" varchar(256), 82 | CONSTRAINT users_pkey PRIMARY KEY (userid) 83 | ); 84 | -------------------------------------------------------------------------------- /dags/airflow_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | # import os 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, 6 | LoadDimensionOperator, DataQualityOperator) 7 | from helpers import SqlQueries 8 | 9 | # AWS_KEY = os.environ.get('AWS_KEY') 10 | # AWS_SECRET = os.environ.get('AWS_SECRET') 11 | 12 | default_args = { 13 | 'owner': 'gil', 14 | 'start_date': datetime(2019, 1, 12), 15 | 'depends_on_past': False, 16 | 'retries': 3, 17 | 'retry_delay': timedelta(minutes=5), 18 | 'email_on_retry': False 19 | } 20 | 21 | # Using the context manager allows me not to duplicate the dag parameter in each operator 22 | with DAG('redshift_ETL_dag', 23 | default_args=default_args, 24 | description='Load and transform data in Redshift with Airflow', 25 | catchup=False, 26 | schedule_interval='0 * * * *' 27 | ) as dag: 28 | 29 | start_operator = DummyOperator(task_id='Begin_execution') 30 | 31 | stage_events_to_redshift = StageToRedshiftOperator( 32 | task_id='Stage_events', 33 | table="staging_events", 34 | redshift_conn_id="redshift", 35 | aws_credentials_id="aws_credentials", 36 | file_typ="json", 37 | s3_bucket="udacity-dend", 38 | s3_key="log_data", 39 | sql=SqlQueries.create_staging_events_table 40 | ) 41 | 42 | stage_songs_to_redshift = StageToRedshiftOperator( 43 | task_id='Stage_songs', 44 | table="staging_songs", 45 | redshift_conn_id="redshift", 46 | aws_credentials_id="aws_credentials", 47 | file_typ="json", 48 | s3_bucket="udacity-dend", 49 | s3_key="song_data", 50 | sql=SqlQueries.create_staging_songs_table 51 | ) 52 | 53 | load_songplays_table = LoadFactOperator( 54 | task_id='Load_songplays_fact_table', 55 | redshift_conn_id="redshift", 56 | create_table_sql=SqlQueries.create_songplays_table, 57 | insert_table_sql=SqlQueries.insert_songplay_table, 58 | mode="append", 59 | target_table="songplays" 60 | ) 61 | 62 | load_user_dimension_table = LoadDimensionOperator( 63 | task_id='Load_user_dim_table', 64 | redshift_conn_id="redshift", 65 | aws_credentials_id="aws_credentials", 66 | create_table_sql=SqlQueries.create_users_table, 67 | insert_table_sql=SqlQueries.insert_user_table, 68 | mode="overwrite", 69 | target_table="users" 70 | ) 71 | 72 | load_song_dimension_table = LoadDimensionOperator( 73 | task_id='Load_song_dim_table', 74 | redshift_conn_id="redshift", 75 | aws_credentials_id="aws_credentials", 76 | 
create_table_sql=SqlQueries.create_songs_table, 77 | insert_table_sql=SqlQueries.insert_song_table, 78 | mode="overwrite", 79 | target_table="songs" 80 | ) 81 | 82 | load_artist_dimension_table = LoadDimensionOperator( 83 | task_id='Load_artist_dim_table', 84 | redshift_conn_id="redshift", 85 | aws_credentials_id="aws_credentials", 86 | create_table_sql=SqlQueries.create_artists_table, 87 | insert_table_sql=SqlQueries.insert_artist_table, 88 | mode="overwrite", 89 | target_table="artists" 90 | ) 91 | 92 | load_time_dimension_table = LoadDimensionOperator( 93 | task_id='Load_time_dim_table', 94 | redshift_conn_id="redshift", 95 | aws_credentials_id="aws_credentials", 96 | create_table_sql=SqlQueries.create_times_table, 97 | insert_table_sql=SqlQueries.insert_time_table, 98 | mode="overwrite", 99 | target_table="time" 100 | ) 101 | 102 | run_quality_checks = DataQualityOperator( 103 | task_id='Run_data_quality_checks', 104 | redshift_conn_id="redshift", 105 | table_name="songplays" 106 | ) 107 | 108 | end_operator = DummyOperator(task_id='Stop_execution') 109 | 110 | # Make graph 111 | start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] 112 | [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table 113 | load_songplays_table >> [load_artist_dimension_table, load_song_dimension_table, load_time_dimension_table, 114 | load_user_dimension_table] 115 | [load_artist_dimension_table, load_song_dimension_table, load_time_dimension_table, 116 | load_user_dimension_table] >> run_quality_checks 117 | run_quality_checks >> end_operator 118 | -------------------------------------------------------------------------------- /example-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gfelot/DEND-Data_Pipeline_Airflow/046b717363e0d02c3667a456a1cf399bc65d738d/example-dag.png -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | import helpers 7 | 8 | # Defining the plugin class 9 | class UdacityPlugin(AirflowPlugin): 10 | name = "udacity_plugin" 11 | operators = [ 12 | operators.StageToRedshiftOperator, 13 | operators.LoadFactOperator, 14 | operators.LoadDimensionOperator, 15 | operators.DataQualityOperator 16 | ] 17 | helpers = [ 18 | helpers.SqlQueries 19 | ] 20 | -------------------------------------------------------------------------------- /plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | 3 | __all__ = [ 4 | 'SqlQueries', 5 | ] -------------------------------------------------------------------------------- /plugins/helpers/sql_queries.py: -------------------------------------------------------------------------------- 1 | class SqlQueries: 2 | create_staging_events_table = (""" 3 | DROP TABLE IF EXISTS staging_events; 4 | CREATE TABLE IF NOT EXISTS staging_events ( 5 | artist varchar(256), 6 | auth varchar(256), 7 | firstName varchar(256), 8 | gender varchar(256), 9 | item_in_session int4, 10 | lastName varchar(256), 11 | length numeric(18,0), 12 | level varchar(256), 13 | location varchar(256), 14 | method varchar(256), 15 | page varchar(256), 16 | registration numeric(18,0), 17 | sessionId int4, 18 | 
song varchar(256), 19 | status int4, 20 | ts int8, 21 | userAgent varchar(256), 22 | userId int4 23 | ); 24 | """) 25 | 26 | create_staging_songs_table = (""" 27 | DROP TABLE IF EXISTS staging_songs; 28 | CREATE TABLE IF NOT EXISTS staging_songs ( 29 | num_songs int4, 30 | artist_id varchar(256), 31 | artist_name varchar(256), 32 | artist_latitude numeric(18,0), 33 | artist_longitude numeric(18,0), 34 | artist_location varchar(256), 35 | song_id varchar(256), 36 | title varchar(256), 37 | duration numeric(18,0), 38 | year int4 39 | ); 40 | """) 41 | 42 | create_songplays_table = (""" 43 | DROP TABLE IF EXISTS songplays; 44 | CREATE TABLE IF NOT EXISTS songplays ( 45 | play_id varchar(32) NOT NULL, 46 | start_time timestamp NOT NULL, 47 | user_id int4 NOT NULL, 48 | level varchar(256), 49 | song_id varchar(256), 50 | artist_id varchar(256), 51 | session_id int4, 52 | location varchar(256), 53 | user_agent varchar(256), 54 | CONSTRAINT songplays_pkey PRIMARY KEY (play_id) 55 | ); 56 | """) 57 | 58 | insert_songplay_table = (""" 59 | SELECT md5(events.sessionid || events.start_time) songplay_id, 60 | events.start_time, 61 | events.userid, 62 | events.level, 63 | songs.song_id, 64 | songs.artist_id, 65 | events.sessionid, 66 | events.location, 67 | events.useragent 68 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 69 | FROM staging_events 70 | WHERE page='NextSong') events 71 | LEFT JOIN staging_songs songs 72 | ON events.song = songs.title 73 | AND events.artist = songs.artist_name 74 | AND events.length = songs.duration 75 | """) 76 | 77 | create_users_table = (""" 78 | DROP TABLE IF EXISTS users; 79 | CREATE TABLE IF NOT EXISTS users ( 80 | user_id int4 NOT NULL, 81 | first_name varchar(256), 82 | last_name varchar(256), 83 | gender varchar(256), 84 | level varchar(256), 85 | CONSTRAINT users_pkey PRIMARY KEY (user_id) 86 | ); 87 | """) 88 | 89 | insert_user_table = (""" 90 | SELECT distinct userid, firstName, lastName, gender, level 91 | FROM staging_events 92 | WHERE page='NextSong' 93 | """) 94 | 95 | create_songs_table = (""" 96 | DROP TABLE IF EXISTS songs; 97 | CREATE TABLE IF NOT EXISTS songs ( 98 | song_id varchar(256) NOT NULL, 99 | title varchar(256), 100 | artist_id varchar(256), 101 | year int4, 102 | duration numeric(18,0), 103 | CONSTRAINT songs_pkey PRIMARY KEY (song_id) 104 | ); 105 | """) 106 | 107 | insert_song_table = (""" 108 | SELECT distinct song_id, title, artist_id, year, duration 109 | FROM staging_songs 110 | """) 111 | 112 | create_artists_table = (""" 113 | DROP TABLE IF EXISTS artists; 114 | CREATE TABLE IF NOT EXISTS artists ( 115 | artist_id varchar(256) NOT NULL, 116 | name varchar(256), 117 | location varchar(256), 118 | latitude numeric(18,0), 119 | longitude numeric(18,0) 120 | ); 121 | """) 122 | 123 | insert_artist_table = (""" 124 | SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude 125 | FROM staging_songs 126 | """) 127 | 128 | create_times_table = (""" 129 | DROP TABLE IF EXISTS time; 130 | CREATE TABLE IF NOT EXISTS time ( 131 | start_time timestamp NOT NULL, 132 | hour int4, 133 | day int4 NOT NULL, 134 | week int4, 135 | month int4 NOT NULL, 136 | year int4 NOT NULL, 137 | weekday int4, 138 | PRIMARY KEY(start_time, day, month, year)); 139 | """) 140 | 141 | insert_time_table = (""" 142 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 143 | extract(month from start_time), extract(year from start_time),
extract(dayofweek from start_time) 144 | FROM songplays 145 | """) 146 | -------------------------------------------------------------------------------- /plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.stage_redshift import StageToRedshiftOperator 2 | from operators.load_fact import LoadFactOperator 3 | from operators.load_dimension import LoadDimensionOperator 4 | from operators.data_quality import DataQualityOperator 5 | 6 | __all__ = [ 7 | 'StageToRedshiftOperator', 8 | 'LoadFactOperator', 9 | 'LoadDimensionOperator', 10 | 'DataQualityOperator' 11 | ] 12 | -------------------------------------------------------------------------------- /plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class DataQualityOperator(BaseOperator): 6 | 7 | ui_color = '#89DA59' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | redshift_conn_id="", 12 | table_name="", 13 | *args, **kwargs): 14 | 15 | super(DataQualityOperator, self).__init__(*args, **kwargs) 16 | self.redshift_conn_id = redshift_conn_id 17 | self.table_name = table_name 18 | 19 | def execute(self, context): 20 | redshift_hook = PostgresHook(self.redshift_conn_id) 21 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {self.table_name}") 22 | if len(records) < 1 or len(records[0]) < 1: 23 | raise ValueError(f"Data quality check failed. {self.table_name} has no values") 24 | num_records = records[0][0] 25 | if num_records < 1: 26 | raise ValueError(f"Data quality check failed. {self.table_name} contained 0 rows") 27 | self.log.info(f"Data quality check on table {self.table_name} passed with {num_records} records") 28 | -------------------------------------------------------------------------------- /plugins/operators/load_dimension.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | 6 | class LoadDimensionOperator(BaseOperator): 7 | ui_color = '#80BD9E' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | redshift_conn_id="", 12 | aws_credentials_id="", 13 | create_table_sql="", 14 | insert_table_sql="", 15 | mode="", 16 | target_table="", 17 | *args, **kwargs): 18 | 19 | super(LoadDimensionOperator, self).__init__(*args, **kwargs) 20 | self.redshift_conn_id = redshift_conn_id 21 | self.aws_credentials_id = aws_credentials_id 22 | self.create_table_sql = create_table_sql 23 | self.insert_table_sql = insert_table_sql 24 | self.mode = mode 25 | self.target_table = target_table 26 | 27 | 28 | def execute(self, context): 29 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 30 | 31 | self.log.info("Creating the respective dimension table in redshift before insert") 32 | redshift.run(self.create_table_sql) 33 | 34 | self.log.info('Insert to the respective dimension table') 35 | 36 | if self.mode == "append": 37 | insert_sql = f"INSERT INTO {self.target_table} {self.insert_table_sql}" 38 | else: 39 | insert_sql = f"DELETE FROM {self.target_table}; INSERT INTO {self.target_table} {self.insert_table_sql}" 40 | self.log.info("Command is " + insert_sql) 41 | redshift.run(insert_sql) 42 |
-------------------------------------------------------------------------------- /plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadFactOperator(BaseOperator): 6 | 7 | ui_color = '#F98866' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | redshift_conn_id="", 12 | create_table_sql="", 13 | insert_table_sql="", 14 | mode="", 15 | target_table="", 16 | *args, **kwargs): 17 | 18 | super(LoadFactOperator, self).__init__(*args, **kwargs) 19 | self.redshift_conn_id = redshift_conn_id 20 | self.create_table_sql = create_table_sql 21 | self.insert_table_sql = insert_table_sql 22 | self.mode = mode 23 | self.target_table = target_table 24 | 25 | def execute(self, context): 26 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 27 | 28 | self.log.info("Create fact table") 29 | redshift.run(self.create_table_sql) 30 | 31 | self.log.info('Insert fact table') 32 | 33 | if self.mode == "append": 34 | insert_sql = f"INSERT INTO {self.target_table} {self.insert_table_sql}" 35 | else: 36 | insert_sql = f"DELETE FROM {self.target_table}; INSERT INTO {self.target_table} {self.insert_table_sql}" 37 | self.log.info("Command is " + insert_sql) 38 | redshift.run(insert_sql) 39 | -------------------------------------------------------------------------------- /plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | from airflow.contrib.hooks.aws_hook import AwsHook 5 | 6 | class StageToRedshiftOperator(BaseOperator): 7 | ui_color = '#358140' 8 | 9 | template_fields = ("s3_key",) 10 | 11 | copy_sql = """ 12 | COPY {} 13 | FROM '{}' 14 | ACCESS_KEY_ID '{}' 15 | SECRET_ACCESS_KEY '{}' 16 | IGNOREHEADER {} 17 | TIMEFORMAT as 'epochmillisecs' 18 | TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL 19 | REGION 'us-west-2' 20 | DELIMITER '{}' 21 | """ 22 | 23 | copy_songsql_json = """ 24 | COPY {} 25 | FROM '{}' 26 | ACCESS_KEY_ID '{}' 27 | SECRET_ACCESS_KEY '{}' 28 | TIMEFORMAT as 'epochmillisecs' 29 | REGION 'us-west-2' 30 | FORMAT AS JSON 'auto' 31 | """ 32 | 33 | copy_eventsql_json = """ 34 | COPY {} 35 | FROM '{}' 36 | ACCESS_KEY_ID '{}' 37 | SECRET_ACCESS_KEY '{}' 38 | TIMEFORMAT as 'epochmillisecs' 39 | TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL 40 | REGION 'us-west-2' 41 | FORMAT AS JSON 's3://udacity-dend/log_json_path.json' 42 | """ 43 | 44 | @apply_defaults 45 | def __init__(self, 46 | redshift_conn_id="", 47 | aws_credentials_id="", 48 | table="", 49 | s3_bucket="", 50 | s3_key="", 51 | delimiter=",", 52 | ignore_headers=1, 53 | file_typ="", 54 | sql="", 55 | *args, **kwargs): 56 | 57 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 58 | self.table = table 59 | self.redshift_conn_id = redshift_conn_id 60 | self.s3_bucket = s3_bucket 61 | self.s3_key = s3_key 62 | self.delimiter = delimiter 63 | self.ignore_headers = ignore_headers 64 | self.aws_credentials_id = aws_credentials_id 65 | self.file_typ = file_typ 66 | self.sql = sql 67 | 68 | def execute(self, context): 69 | aws_hook = AwsHook(self.aws_credentials_id) 70 | credentials = aws_hook.get_credentials() 71 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 72 | 73 |
self.log.info("Creating the table in redshift before load") 74 | redshift.run(format(self.sql)) 75 | 76 | self.log.info("Clearing data from destination Redshift table") 77 | redshift.run("DELETE FROM {}".format(self.table)) 78 | 79 | self.log.info("Copying data from S3 to Redshift") 80 | rendered_key = self.s3_key.format(**context) 81 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) 82 | if self.file_typ == "csv": 83 | formatted_sql = StageToRedshiftOperator.copy_sql.format( 84 | self.table, 85 | s3_path, 86 | credentials.access_key, 87 | credentials.secret_key, 88 | self.ignore_headers, 89 | self.delimiter 90 | ) 91 | else: 92 | if self.table == "staging_songs": 93 | formatted_sql = StageToRedshiftOperator.copy_songsql_json.format( 94 | self.table, 95 | s3_path, 96 | credentials.access_key, 97 | credentials.secret_key 98 | ) 99 | else: 100 | formatted_sql = StageToRedshiftOperator.copy_eventsql_json.format( 101 | self.table, 102 | s3_path, 103 | credentials.access_key, 104 | credentials.secret_key 105 | ) 106 | redshift.run(formatted_sql) 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3===1.9.191 --------------------------------------------------------------------------------