├── Batch-Data-Pipelines ├── assets │ └── images │ │ ├── metabase.png │ │ ├── de_proj_obj.png │ │ ├── de_proj_design.png │ │ └── de_proj_infra.png ├── infra_variables.txt ├── test │ └── dag │ │ └── test_dag_validity.py ├── dags │ ├── scripts │ │ ├── sql │ │ │ ├── unload_user_purchase.sql │ │ │ └── generate_user_behavior_metric.sql │ │ ├── emr │ │ │ └── clean_movie_review.json │ │ └── spark │ │ │ └── random_text_classification.py │ ├── utils.py │ └── user_behaviour.py ├── pgsetup │ └── create_user_purchase.sql ├── Makefile ├── setup_ubuntu_docker.txt ├── redshiftsetup │ └── setup.sql ├── tear_down_infra.sh ├── docker-compose.yaml └── setup_infra.sh ├── LICENSE └── README.md /Batch-Data-Pipelines/assets/images/metabase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/metabase.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/assets/images/de_proj_obj.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/de_proj_obj.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/assets/images/de_proj_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/de_proj_design.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/assets/images/de_proj_infra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/de_proj_infra.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/infra_variables.txt: -------------------------------------------------------------------------------- 1 | AWS_REGION="us-east-1" 2 | AWS_EC2_INSTANCE="t2.large" 3 | EC2_IAM_ROLE="sde-ec2-s3-emr-rs-access" 4 | EC2_IMAGE_ID="ami-09d56f8956ab235b3" 5 | EC2_SECURITY_GROUP="sde-security-group" 6 | SERVICE_NAME="sde-batch-de-project" 7 | IAM_ROLE_NAME="sde-spectrum-redshift" 8 | REDSHIFT_USER="sde_user" 9 | REDSHIFT_PASSWORD="sdeP0ssword0987" 10 | REDSHIFT_PORT="5439" 11 | EMR_NODE_TYPE="m4.xlarge" -------------------------------------------------------------------------------- /Batch-Data-Pipelines/test/dag/test_dag_validity.py: -------------------------------------------------------------------------------- 1 | from airflow.models import DagBag 2 | 3 | 4 | def test_no_import_errors(monkeypatch): 5 | 6 | # Set variables 7 | monkeypatch.setenv("AIRFLOW_VAR_BUCKET", "test-bucket") 8 | monkeypatch.setenv("AIRFLOW_VAR_EMR_ID", "test-emr-id") 9 | 10 | dag_bag = DagBag() 11 | assert len(dag_bag.import_errors) == 0, "No Import Failures" 12 | assert dag_bag.size() == 1 13 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/sql/unload_user_purchase.sql: -------------------------------------------------------------------------------- 1 | COPY ( 2 | select invoice_number, 3 | stock_code, 4 | detail, 5 | quantity, 6 | invoice_date, 7 | unit_price, 8 | customer_id, 9 | country 10 | from retail.user_purchase -- we should have a 
date filter here to pull only required data 11 | ) TO '{{ params.user_purchase }}' WITH (FORMAT CSV, HEADER); 12 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/pgsetup/create_user_purchase.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA retail; 2 | 3 | CREATE TABLE retail.user_purchase ( 4 | invoice_number varchar(10), 5 | stock_code varchar(20), 6 | detail varchar(1000), 7 | quantity int, 8 | invoice_date timestamp, 9 | unit_price Numeric(8,3), 10 | customer_id int, 11 | country varchar(20) 12 | ); 13 | 14 | COPY retail.user_purchase(invoice_number,stock_code,detail,quantity,invoice_date,unit_price,customer_id,country) 15 | FROM '/input_data/OnlineRetail.csv' DELIMITER ',' CSV HEADER; 16 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/Makefile: -------------------------------------------------------------------------------- 1 | up: 2 | docker compose up airflow-init && docker compose up --build -d 3 | 4 | down: 5 | docker compose down 6 | 7 | sh: 8 | docker exec -ti webserver bash 9 | 10 | pytest: 11 | docker exec -ti webserver pytest -p no:warnings -v /opt/airflow/test 12 | 13 | format: 14 | docker exec -ti webserver python -m black -S --line-length 79 . 15 | 16 | isort: 17 | docker exec -ti webserver isort . 18 | 19 | type: 20 | docker exec -ti webserver mypy --ignore-missing-imports /opt/airflow 21 | 22 | lint: 23 | docker exec -ti webserver flake8 /opt/airflow/dags 24 | 25 | ci: isort format type lint pytest 26 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/setup_ubuntu_docker.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get -y update 4 | 5 | sudo apt-get -y install \ 6 | ca-certificates \ 7 | curl \ 8 | gnupg \ 9 | lsb-release 10 | 11 | sudo apt -y install unzip 12 | 13 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg 14 | 15 | echo \ 16 | "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ 17 | $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 18 | 19 | sudo apt-get -y update 20 | sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin 21 | sudo chmod 666 /var/run/docker.sock -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import psycopg2 4 | 5 | from airflow.hooks.postgres_hook import PostgresHook 6 | from airflow.hooks.S3_hook import S3Hook 7 | 8 | 9 | def _local_to_s3( 10 | bucket_name: str, key: str, file_name: str, remove_local: bool = False 11 | ) -> None: 12 | s3 = S3Hook() 13 | s3.load_file( 14 | filename=file_name, bucket_name=bucket_name, replace=True, key=key 15 | ) 16 | if remove_local: 17 | if os.path.isfile(file_name): 18 | os.remove(file_name) 19 | 20 | 21 | def run_redshift_external_query(qry: str) -> None: 22 | rs_hook = PostgresHook(postgres_conn_id="redshift") 23 | rs_conn = rs_hook.get_conn() 24 | rs_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) 25 | rs_cursor = rs_conn.cursor() 26 | rs_cursor.execute(qry) 27 | rs_cursor.close() 28 | 
rs_conn.commit() 29 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/sql/generate_user_behavior_metric.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM public.user_behavior_metric 2 | WHERE insert_date = '{{ ds }}'; 3 | INSERT INTO public.user_behavior_metric ( 4 | customerid, 5 | amount_spent, 6 | review_score, 7 | review_count, 8 | insert_date 9 | ) 10 | SELECT ups.customerid, 11 | CAST( 12 | SUM(ups.Quantity * ups.UnitPrice) AS DECIMAL(18, 5) 13 | ) AS amount_spent, 14 | SUM(mrcs.positive_review) AS review_score, 15 | count(mrcs.cid) AS review_count, 16 | '{{ ds }}' 17 | FROM spectrum.user_purchase_staging ups 18 | JOIN ( 19 | SELECT cid, 20 | CASE 21 | WHEN positive_review IS True THEN 1 22 | ELSE 0 23 | END AS positive_review 24 | FROM spectrum.classified_movie_review 25 | WHERE insert_date = '{{ ds }}' 26 | ) mrcs ON ups.customerid = mrcs.cid 27 | WHERE ups.insert_date = '{{ ds }}' 28 | GROUP BY ups.customerid; 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ramesh chinnaraj 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/emr/clean_movie_review.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Name": "Move raw data from S3 to HDFS", 4 | "ActionOnFailure": "CANCEL_AND_WAIT", 5 | "HadoopJarStep": { 6 | "Jar": "command-runner.jar", 7 | "Args": [ 8 | "s3-dist-cp", 9 | "--src=s3://{{ params.BUCKET_NAME }}/{{ params.raw_movie_review }}/{{ ds }}/", 10 | "--dest=/movie/{{ ds }}" 11 | ] 12 | } 13 | }, 14 | { 15 | "Name": "Classify movie reviews", 16 | "ActionOnFailure": "CANCEL_AND_WAIT", 17 | "HadoopJarStep": { 18 | "Jar": "command-runner.jar", 19 | "Args": [ 20 | "spark-submit", 21 | "s3://{{ params.BUCKET_NAME }}/scripts/random_text_classification.py", 22 | "--input=/movie/{{ ds }}", 23 | "--run-id={{ ds }}" 24 | ] 25 | } 26 | }, 27 | { 28 | "Name": "Move classified data from HDFS to S3", 29 | "ActionOnFailure": "CANCEL_AND_WAIT", 30 | "HadoopJarStep": { 31 | "Jar": "command-runner.jar", 32 | "Args": [ 33 | "s3-dist-cp", 34 | "--src=/output", 35 | "--dest=s3://{{ params.BUCKET_NAME }}/{{ params.stage_movie_review }}/{{ ds }}" 36 | ] 37 | } 38 | } 39 | ] -------------------------------------------------------------------------------- /Batch-Data-Pipelines/redshiftsetup/setup.sql: -------------------------------------------------------------------------------- 1 | -- This is run as part of the setup_infra.sh script 2 | CREATE EXTERNAL SCHEMA spectrum 3 | FROM DATA CATALOG DATABASE 'spectrumdb' iam_role 'arn:aws:iam::"$AWS_ID":role/"$IAM_ROLE_NAME"' CREATE EXTERNAL DATABASE IF NOT EXISTS; 4 | DROP TABLE IF EXISTS spectrum.user_purchase_staging; 5 | CREATE EXTERNAL TABLE spectrum.user_purchase_staging ( 6 | InvoiceNo VARCHAR(10), 7 | StockCode VARCHAR(20), 8 | detail VARCHAR(1000), 9 | Quantity INTEGER, 10 | InvoiceDate TIMESTAMP, 11 | UnitPrice DECIMAL(8, 3), 12 | customerid INTEGER, 13 | Country VARCHAR(20) 14 | ) PARTITIONED BY (insert_date DATE) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS textfile LOCATION 's3://"$1"/stage/user_purchase/' TABLE PROPERTIES ('skip.header.line.count' = '1'); 15 | DROP TABLE IF EXISTS spectrum.classified_movie_review; 16 | CREATE EXTERNAL TABLE spectrum.classified_movie_review ( 17 | cid VARCHAR(100), 18 | positive_review boolean, 19 | insert_date VARCHAR(12) 20 | ) STORED AS PARQUET LOCATION 's3://"$1"/stage/movie_review/'; 21 | DROP TABLE IF EXISTS public.user_behavior_metric; 22 | CREATE TABLE public.user_behavior_metric ( 23 | customerid INTEGER, 24 | amount_spent DECIMAL(18, 5), 25 | review_score INTEGER, 26 | review_count INTEGER, 27 | insert_date DATE 28 | ); 29 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/spark/random_text_classification.py: -------------------------------------------------------------------------------- 1 | # pyspark 2 | import argparse 3 | 4 | from pyspark.ml.feature import StopWordsRemover, Tokenizer 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.functions import array_contains, lit 7 | 8 | 9 | def random_text_classifier( 10 | input_loc: str, output_loc: str, run_id: str 11 | ) -> None: 12 | """ 13 | This is a dummy function to show how to use spark, It is supposed to mock 14 | the following steps 15 | 1. clean input data 16 | 2. use a pre-trained model to make prediction 17 | 3. 
write predictions to an HDFS output 18 | 19 | Since this is meant as an example, we are going to skip building a model; 20 | instead, we are naively going to mark reviews having the text "good" as 21 | positive and the rest as negative 22 | """ 23 | 24 | # read input 25 | df_raw = spark.read.option("header", True).csv(input_loc) 26 | # perform text cleaning 27 | 28 | # Tokenize text 29 | tokenizer = Tokenizer(inputCol="review_str", outputCol="review_token") 30 | df_tokens = tokenizer.transform(df_raw).select("cid", "review_token") 31 | 32 | # Remove stop words 33 | remover = StopWordsRemover( 34 | inputCol="review_token", outputCol="review_clean" 35 | ) 36 | df_clean = remover.transform(df_tokens).select("cid", "review_clean") 37 | 38 | # flag reviews that contain the word "good" as positive 39 | df_out = df_clean.select( 40 | "cid", 41 | array_contains(df_clean.review_clean, "good").alias("positive_review"), 42 | ) 43 | df_fin = df_out.withColumn("insert_date", lit(run_id)) 44 | # Parquet is a popular columnar storage format; we use it here 45 | df_fin.write.mode("overwrite").parquet(output_loc) 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( 51 | "--input", type=str, help="HDFS input", default="/movie" 52 | ) 53 | parser.add_argument( 54 | "--output", type=str, help="HDFS output", default="/output" 55 | ) 56 | parser.add_argument("--run-id", type=str, help="run id") 57 | args = parser.parse_args() 58 | spark = SparkSession.builder.appName( 59 | "Random Text Classifier" 60 | ).getOrCreate() 61 | random_text_classifier( 62 | input_loc=args.input, output_loc=args.output, run_id=args.run_id 63 | ) 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Batch-Data-Pipelines 2 | 3 | 4 | Design 5 | Setup 6 | Prerequisite 7 | Local run 8 | Deploy to AWS 9 | Stop 10 | Contributing 11 | Design 12 | 13 | We will be using Airflow to orchestrate the following: 14 | 15 | Classifying movie reviews with Apache Spark. 16 | Loading the classified movie reviews into the data warehouse. 17 | Extracting user purchase data from an OLTP database and loading it into the data warehouse. 18 | Joining the classified movie review data and user purchase data to get user behavior metric data. 19 | 20 | 21 | Data pipeline design 22 | 23 | Setup 24 | Prerequisite 25 | Docker with at least 4GB of RAM and Docker Compose v1.27.0 or later 26 | psql 27 | AWS account 28 | AWS CLI installed and configured 29 | Clone and cd into the project directory. 30 | 31 | git clone https://github.com/josephmachado/beginner_de_project.git 32 | cd beginner_de_project 33 | Local run 34 | When running locally, you can use the make command to manage infrastructure. We use the following Docker containers: 35 | 36 | ![image](https://user-images.githubusercontent.com/110036451/184506899-c59f4298-bfc1-410b-9be3-48618ec8c7c4.png) 37 | 38 | 39 | Airflow 40 | Postgres DB (as the Airflow metadata DB) 41 | Metabase for data visualization 42 | You can start the local containers as shown below. 43 | 44 | make up # start all containers 45 | make ci # runs format checks, type checks, static checks, and tests 46 | make down # stops the containers 47 | 48 | Since we cannot replicate AWS components locally, we have not set them up here. 49 | 50 | We have a DAG validity test defined in test/dag/test_dag_validity.py. 51 | 52 | Deploy to AWS 53 | To set up the AWS infrastructure, we have a script called setup_infra.sh.
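Besides provisioning the AWS resources, setup_infra.sh copies the project to the EC2 instance, starts the Airflow containers, and registers the Airflow connections and variables the DAG needs (BUCKET, EMR_ID, the redshift and postgres_default connections, and the aws_default region). If you ever need to recreate these by hand, the commands below are a rough sketch of what the script runs inside the webserver container; substitute your own bucket name, EMR cluster id, Redshift endpoint, and password.

docker exec webserver airflow variables set BUCKET your-bucket-name
docker exec webserver airflow variables set EMR_ID your-emr-cluster-id
docker exec webserver airflow connections add 'redshift' --conn-type 'Postgres' --conn-login sde_user --conn-password your-password --conn-host your-redshift-endpoint --conn-port 5439 --conn-schema 'dev'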
54 | 55 | Note: We run all of our infrastructure on AWS us-east-1. If you want to change this, please change the corresponding variables in infra_variables.txt. 56 | 57 | Setup can be run as shown below. 58 | 59 | make down # since our AWS infra will be port-forwarded to 8080 and 3000, which are used by the local Airflow and Metabase respectively 60 | ./setup_infra.sh {your-bucket-name} # e.g. ./setup_infra.sh my-test-bucket 61 | When prompted, enter yes to accept the SSH connection. 62 | 63 | This sets up the following components: 64 | 65 | 1 AWS EC2 instance, running Airflow and Metabase 66 | 1 AWS EMR cluster 67 | 1 AWS Redshift cluster 68 | 1 AWS S3 bucket 69 | The command will also open the Airflow UI running on the EC2 instance. You can also check out: 70 | 71 | Airflow at http://localhost:8080 (username and password are both airflow) 72 | Metabase at http://localhost:3000 73 | The first time you log in, create a username and password. To establish a connection to your Redshift cluster, you will need the Redshift host, which you can get using the command 74 | 75 | aws redshift describe-clusters --cluster-identifier sde-batch-de-project --query 'Clusters[0].Endpoint.Address' --output text 76 | The port, username, and password are in infra_variables.txt, and the database is dev. 77 | 78 | You can create dashboards in Metabase, as seen below. 79 | 80 | ![image](https://user-images.githubusercontent.com/110036451/184506920-8d19fd4d-d036-4074-bc4d-802c15bc55b7.png) 81 | 82 | 83 | Data pipeline design 84 | 85 | Stop 86 | When you are done, do not forget to turn off your AWS instances. In your terminal, run 87 | 88 | ./tear_down_infra.sh {your-bucket-name} # e.g. ./tear_down_infra.sh my-test-bucket 89 | This will tear down all the AWS services. Please double-check this in the AWS console (S3, EC2, EMR, & Redshift) or with the AWS CLI, as shown below.
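A quick CLI sanity check is sketched below (adjust --region if you changed it from us-east-1 in infra_variables.txt):

aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" --region us-east-1
aws emr list-clusters --active --region us-east-1
aws redshift describe-clusters --region us-east-1 --query 'Clusters[].ClusterIdentifier'
aws s3 ls

If the EC2 instance, EMR cluster, Redshift cluster, or your bucket still shows up after a few minutes, remove it from the console.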
90 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/tear_down_infra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -eq 0 ]] ; then 4 | echo 'Please enter your bucket name as ./tear_down_infra.sh your-bucket' 5 | exit 0 6 | fi 7 | 8 | AWS_ID=$(aws sts get-caller-identity --query Account --output text | cat) 9 | 10 | echo "Reading infrastructure variables from infra_variables.txt" 11 | source infra_variables.txt 12 | 13 | echo "Reading state values from state.log" 14 | source state.log 15 | 16 | echo "Deleting bucket "$1" and its contents" 17 | aws s3 rm s3://$1 --recursive --output text >> tear_down.log 18 | aws s3api delete-bucket --bucket $1 --output text >> tear_down.log 19 | 20 | echo "Terminating EC2 instance" 21 | aws ec2 terminate-instances --instance-ids $EC2_ID --region $AWS_REGION >> tear_down.log 22 | 23 | MY_IP=$(curl -s http://whatismyip.akamai.com/) 24 | 25 | echo "Delete EC2 security group ingress" 26 | aws ec2 revoke-security-group-ingress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 22 --cidr $MY_IP/24 --output text >> tear_down.log 27 | 28 | echo "Delete EC2 security group ingress" 29 | aws ec2 revoke-security-group-egress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 8080 --cidr $MY_IP/32 --output text >> tear_down.log 30 | 31 | echo "Terminating EMR cluster "$SERVICE_NAME"" 32 | EMR_CLUSTER_ID=$(aws emr list-clusters --active --query 'Clusters[?Name==`'$SERVICE_NAME'`].Id' --output text) 33 | aws emr terminate-clusters --cluster-ids $EMR_CLUSTER_ID >> tear_down.log 34 | 35 | echo "Deleting EC2 security group" 36 | sleep 60 37 | aws ec2 delete-security-group --group-id $EC2_SECURITY_GROUP_ID --output text >> tear_down.log 38 | 39 | echo "Terminating Redshift cluster "$SERVICE_NAME"" 40 | aws redshift delete-cluster --skip-final-cluster-snapshot --cluster-identifier $SERVICE_NAME --output text >> tear_down.log 41 | 42 | echo "Dissociating AmazonS3ReadOnlyAccess policy from "$IAM_ROLE_NAME" role" 43 | aws iam detach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess --output text >> tear_down.log 44 | echo "Dissociating AWSGlueConsoleFullAccess policy from "$IAM_ROLE_NAME" role" 45 | aws iam detach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess --output text >> tear_down.log 46 | echo "Deleting role "$IAM_ROLE_NAME"" 47 | aws iam delete-role --role-name $IAM_ROLE_NAME --output text >> tear_down.log 48 | 49 | EC2_IAM_ROLE=sde-ec2-s3-emr-rs-access 50 | echo "Remove role from instance profile" 51 | aws iam remove-role-from-instance-profile --instance-profile-name $EC2_IAM_ROLE-instance-profile --role-name $EC2_IAM_ROLE --output text >> tear_down.log 52 | 53 | echo "Deleting role instance profile "$EC2_IAM_ROLE"-instance-profile" 54 | aws iam delete-instance-profile --instance-profile-name $EC2_IAM_ROLE-instance-profile --output text >> tear_down.log 55 | 56 | echo "Dissociating AmazonS3FullAccess policy from "$EC2_IAM_ROLE" role" 57 | aws iam detach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --output text >> tear_down.log 58 | 59 | echo "Dissociating AmazonEMRFullAccessPolicy_v2 policy from "$EC2_IAM_ROLE" role" 60 | aws iam detach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonEMRFullAccessPolicy_v2 --output text >> tear_down.log 61 | 62 | echo 
"Dissociating AmazonRedshiftAllCommandsFullAccess policy from "$EC2_IAM_ROLE" role" 63 | aws iam detach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonRedshiftAllCommandsFullAccess --output text >> tear_down.log 64 | 65 | echo "Deleting role "$EC2_IAM_ROLE"" 66 | aws iam delete-role --role-name $EC2_IAM_ROLE --output text >> tear_down.log 67 | 68 | echo "Deleting SSH key" 69 | aws ec2 delete-key-pair --key-name sde-key --region $AWS_REGION >> setup.log 70 | rm -f sde-key.pem 71 | 72 | rm -f tear_down.log setup.log state.log trust-policy.json 73 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:master-python3.8 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_GID - Group ID in Airflow containers 31 | # Default: 50000 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. 35 | # Default: airflow 36 | # 37 | # Feel free to modify this file to suit your needs. 38 | --- 39 | version: '3' 40 | x-airflow-common: 41 | &airflow-common 42 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.0} 43 | environment: 44 | &airflow-common-env 45 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 46 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 47 | AIRFLOW__CORE__FERNET_KEY: '' 48 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 49 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 50 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 51 | AIRFLOW_CONN_POSTGRES_DEFAULT: postgres://airflow:airflow@postgres:5432/airflow 52 | # AWS_SHARED_CREDENTIALS_FILE: /opt/airflow/.aws/credentials 53 | # NOT RECOMMENDED FOR PRODUCTION!!! 
54 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- black flake8 mypy isort moto[all] pytest pytest-mock apache-airflow-client} 55 | volumes: 56 | - ./dags:/opt/airflow/dags 57 | - ./logs:/opt/airflow/logs 58 | - ./plugins:/opt/airflow/plugins 59 | - ./test:/opt/airflow/test 60 | - ./data:/opt/airflow/data 61 | - ./temp:/opt/airflow/temp 62 | user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" 63 | depends_on: 64 | postgres: 65 | condition: service_healthy 66 | 67 | services: 68 | postgres: 69 | container_name: postgres 70 | image: postgres:13 71 | environment: 72 | POSTGRES_USER: airflow 73 | POSTGRES_PASSWORD: airflow 74 | POSTGRES_DB: airflow 75 | volumes: 76 | - ./data:/input_data 77 | - ./temp:/temp 78 | - ./pgsetup:/docker-entrypoint-initdb.d 79 | healthcheck: 80 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 81 | interval: 5s 82 | retries: 5 83 | restart: always 84 | ports: 85 | - "5432:5432" 86 | 87 | airflow-webserver: 88 | <<: *airflow-common 89 | container_name: webserver 90 | command: webserver 91 | ports: 92 | - 8080:8080 93 | healthcheck: 94 | test: 95 | [ 96 | "CMD", 97 | "curl", 98 | "--fail", 99 | "http://localhost:8080/health" 100 | ] 101 | interval: 10s 102 | timeout: 10s 103 | retries: 5 104 | restart: always 105 | 106 | airflow-scheduler: 107 | <<: *airflow-common 108 | container_name: scheduler 109 | command: scheduler 110 | healthcheck: 111 | test: 112 | [ 113 | "CMD-SHELL", 114 | 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' 115 | ] 116 | interval: 10s 117 | timeout: 10s 118 | retries: 5 119 | restart: always 120 | 121 | airflow-init: 122 | <<: *airflow-common 123 | command: version 124 | environment: 125 | <<: *airflow-common-env 126 | _AIRFLOW_DB_UPGRADE: 'true' 127 | _AIRFLOW_WWW_USER_CREATE: 'true' 128 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 129 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 130 | 131 | dashboard: 132 | image: metabase/metabase 133 | container_name: dashboard 134 | ports: 135 | - "3000:3000" 136 | 137 | volumes: 138 | postgres-db-volume: 139 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/user_behaviour.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | 4 | from utils import _local_to_s3, run_redshift_external_query 5 | 6 | from airflow import DAG 7 | from airflow.contrib.operators.emr_add_steps_operator import ( 8 | EmrAddStepsOperator, 9 | ) 10 | from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor 11 | from airflow.models import Variable 12 | from airflow.operators.dummy_operator import DummyOperator 13 | from airflow.operators.postgres_operator import PostgresOperator 14 | from airflow.operators.python import PythonOperator 15 | 16 | # Config 17 | BUCKET_NAME = Variable.get("BUCKET") 18 | EMR_ID = Variable.get("EMR_ID") 19 | EMR_STEPS = {} 20 | with open("./dags/scripts/emr/clean_movie_review.json") as json_file: 21 | EMR_STEPS = json.load(json_file) 22 | 23 | # DAG definition 24 | default_args = { 25 | "owner": "airflow", 26 | "depends_on_past": True, 27 | "wait_for_downstream": True, 28 | "start_date": datetime(2021, 5, 23), 29 | "email": ["airflow@airflow.com"], 30 | "email_on_failure": False, 31 | "email_on_retry": False, 32 | "retries": 2, 33 | "retry_delay": timedelta(minutes=1), 34 | } 35 | 36 | dag = DAG( 37 | "user_behaviour", 38 | default_args=default_args, 39 | 
schedule_interval="0 0 * * *", 40 | max_active_runs=1, 41 | ) 42 | 43 | extract_user_purchase_data = PostgresOperator( 44 | dag=dag, 45 | task_id="extract_user_purchase_data", 46 | sql="./scripts/sql/unload_user_purchase.sql", 47 | postgres_conn_id="postgres_default", 48 | params={"user_purchase": "/temp/user_purchase.csv"}, 49 | depends_on_past=True, 50 | wait_for_downstream=True, 51 | ) 52 | 53 | user_purchase_to_stage_data_lake = PythonOperator( 54 | dag=dag, 55 | task_id="user_purchase_to_stage_data_lake", 56 | python_callable=_local_to_s3, 57 | op_kwargs={ 58 | "file_name": "/opt/airflow/temp/user_purchase.csv", 59 | "key": "stage/user_purchase/{{ ds }}/user_purchase.csv", 60 | "bucket_name": BUCKET_NAME, 61 | "remove_local": "true", 62 | }, 63 | ) 64 | 65 | user_purchase_stage_data_lake_to_stage_tbl = PythonOperator( 66 | dag=dag, 67 | task_id="user_purchase_stage_data_lake_to_stage_tbl", 68 | python_callable=run_redshift_external_query, 69 | op_kwargs={ 70 | "qry": "alter table spectrum.user_purchase_staging add \ 71 | if not exists partition(insert_date='{{ ds }}') \ 72 | location 's3://" 73 | + BUCKET_NAME 74 | + "/stage/user_purchase/{{ ds }}'", 75 | }, 76 | ) 77 | 78 | movie_review_to_raw_data_lake = PythonOperator( 79 | dag=dag, 80 | task_id="movie_review_to_raw_data_lake", 81 | python_callable=_local_to_s3, 82 | op_kwargs={ 83 | "file_name": "/opt/airflow/data/movie_review.csv", 84 | "key": "raw/movie_review/{{ ds }}/movie.csv", 85 | "bucket_name": BUCKET_NAME, 86 | }, 87 | ) 88 | 89 | spark_script_to_s3 = PythonOperator( 90 | dag=dag, 91 | task_id="spark_script_to_s3", 92 | python_callable=_local_to_s3, 93 | op_kwargs={ 94 | "file_name": "./dags/scripts/spark/random_text_classification.py", 95 | "key": "scripts/random_text_classification.py", 96 | "bucket_name": BUCKET_NAME, 97 | }, 98 | ) 99 | 100 | start_emr_movie_classification_script = EmrAddStepsOperator( 101 | dag=dag, 102 | task_id="start_emr_movie_classification_script", 103 | job_flow_id=EMR_ID, 104 | aws_conn_id="aws_default", 105 | steps=EMR_STEPS, 106 | params={ 107 | "BUCKET_NAME": BUCKET_NAME, 108 | "raw_movie_review": "raw/movie_review", 109 | "text_classifier_script": "scripts/random_text_classifier.py", 110 | "stage_movie_review": "stage/movie_review", 111 | }, 112 | depends_on_past=True, 113 | ) 114 | 115 | last_step = len(EMR_STEPS) - 1 116 | 117 | wait_for_movie_classification_transformation = EmrStepSensor( 118 | dag=dag, 119 | task_id="wait_for_movie_classification_transformation", 120 | job_flow_id=EMR_ID, 121 | step_id='{{ task_instance.xcom_pull\ 122 | ("start_emr_movie_classification_script", key="return_value")[' 123 | + str(last_step) 124 | + "] }}", 125 | depends_on_past=True, 126 | ) 127 | 128 | generate_user_behavior_metric = PostgresOperator( 129 | dag=dag, 130 | task_id="generate_user_behavior_metric", 131 | sql="scripts/sql/generate_user_behavior_metric.sql", 132 | postgres_conn_id="redshift", 133 | ) 134 | 135 | end_of_data_pipeline = DummyOperator(task_id="end_of_data_pipeline", dag=dag) 136 | 137 | ( 138 | extract_user_purchase_data 139 | >> user_purchase_to_stage_data_lake 140 | >> user_purchase_stage_data_lake_to_stage_tbl 141 | ) 142 | ( 143 | [ 144 | movie_review_to_raw_data_lake, 145 | spark_script_to_s3, 146 | ] 147 | >> start_emr_movie_classification_script 148 | >> wait_for_movie_classification_transformation 149 | ) 150 | ( 151 | [ 152 | user_purchase_stage_data_lake_to_stage_tbl, 153 | wait_for_movie_classification_transformation, 154 | ] 155 | >> generate_user_behavior_metric 
156 | >> end_of_data_pipeline 157 | ) 158 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/setup_infra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -eq 0 ]] ; then 4 | echo 'Please enter your bucket name as ./setup_infra.sh your-bucket' 5 | exit 0 6 | fi 7 | 8 | # check if AWS is installed and configured 9 | # check if psql is installed 10 | 11 | AWS_ID=$(aws sts get-caller-identity --query Account --output text | cat) 12 | AWS_EC2_INSTANCE_NAME=sde-airflow-pg-$(openssl rand -base64 12) 13 | 14 | echo "Reading infrastructure variables from infra_variables.txt" 15 | source infra_variables.txt 16 | 17 | echo "Creating bucket "$1"" 18 | aws s3api create-bucket --acl public-read-write --region $AWS_REGION --bucket $1 --output text >> setup.log 19 | 20 | echo '{ 21 | "Version": "2012-10-17", 22 | "Statement": [ 23 | { 24 | "Effect": "Allow", 25 | "Principal": { 26 | "Service": "ec2.amazonaws.com" 27 | }, 28 | "Action": "sts:AssumeRole" 29 | } 30 | ] 31 | }' > ./trust-policy.json 32 | 33 | 34 | echo "Creating AWS IAM role for EC2 S3 access" 35 | aws iam create-role --role-name $EC2_IAM_ROLE --assume-role-policy-document file://trust-policy.json --description 'EC2 access to S3' --output text >> setup.log 36 | 37 | echo "Attaching AmazonS3FullAccess Policy to the previous IAM role" 38 | aws iam attach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --output text >> setup.log 39 | 40 | echo "Attaching AmazonEMRFullAccessPolicy_v2 Policy to the previous IAM role" 41 | aws iam attach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonEMRFullAccessPolicy_v2 --output text >> setup.log 42 | 43 | echo "Attaching AmazonRedshiftAllCommandsFullAccess Policy to the previous IAM role" 44 | aws iam attach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonRedshiftAllCommandsFullAccess --output text >> setup.log 45 | 46 | echo 'Creating IAM instance profile to add to EC2' 47 | aws iam create-instance-profile --instance-profile-name $EC2_IAM_ROLE-instance-profile --output text >> setup.log 48 | aws iam add-role-to-instance-profile --role-name $EC2_IAM_ROLE --instance-profile-name $EC2_IAM_ROLE-instance-profile --output text >> setup.log 49 | 50 | echo "Creating ssh key to connect to EC2 instance" 51 | aws ec2 create-key-pair --key-name sde-key --query "KeyMaterial" --output text --region $AWS_REGION > sde-key.pem 52 | chmod 400 sde-key.pem 53 | 54 | MY_IP=$(curl -s http://whatismyip.akamai.com/) 55 | 56 | echo "Creating EC2 security group to only allow access from your IP $MY_IP" 57 | EC2_SECURITY_GROUP_ID=$(aws ec2 create-security-group --description "Security group to allow inbound SCP connection" --group-name $EC2_SECURITY_GROUP --output text) 58 | echo 'EC2_SECURITY_GROUP_ID="'$EC2_SECURITY_GROUP_ID'"' >> state.log 59 | 60 | echo "Add inbound rule to allow ssh from IP $MY_IP" 61 | aws ec2 authorize-security-group-ingress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 22 --cidr $MY_IP/24 --output text >> setup.log 62 | 63 | echo "Add outbound rule to allow our IP $MY_IP to connect to EC2's 8080 port" 64 | aws ec2 authorize-security-group-egress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 8080 --cidr $MY_IP/32 --output text >> setup.log 65 | 66 | echo "Creating EC2 instance" 67 | sleep 5 68 | aws ec2 run-instances --image-id $EC2_IMAGE_ID --instance-type 
$AWS_EC2_INSTANCE --count 1 --key-name sde-key --user-data file://setup_ubuntu_docker.txt --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='$AWS_EC2_INSTANCE_NAME'}]' --region $AWS_REGION >> setup.log 69 | 70 | echo "Get EC2 ID" 71 | sleep 20 72 | EC2_ID=$(aws --region $AWS_REGION ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=$AWS_EC2_INSTANCE_NAME" --query 'Reservations[*].Instances[*].[InstanceId]' --output text) 73 | echo "EC2 ID is $EC2_ID" 74 | echo 'EC2_ID="'$EC2_ID'"' >> state.log 75 | 76 | echo "Add security group to EC2" 77 | aws ec2 modify-instance-attribute --instance-id $EC2_ID --groups $EC2_SECURITY_GROUP_ID --output text >> setup.log 78 | 79 | while : 80 | do 81 | echo "Waiting for EC2 instance to start, sleeping for 60s before next check" 82 | sleep 60 83 | EC2_STATUS=$(aws ec2 describe-instance-status --instance-ids $EC2_ID --query 'InstanceStatuses[0].InstanceState.Name' --output text) 84 | if [[ "$EC2_STATUS" == "running" ]] 85 | then 86 | break 87 | fi 88 | done 89 | 90 | echo "Attach "$EC2_IAM_ROLE"-instance-profile to EC2 instance" 91 | aws ec2 associate-iam-instance-profile --instance-id $EC2_ID --iam-instance-profile Name=$EC2_IAM_ROLE-instance-profile --output text >> setup.log 92 | 93 | echo "Get EC2 IPV4" 94 | sleep 20 95 | EC2_IPV4=$(aws --region $AWS_REGION ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=instance-id,Values=$EC2_ID" --query 'Reservations[*].Instances[*].[PublicDnsName]' --output text) 96 | echo "EC2 IPV4 is $EC2_IPV4" 97 | 98 | echo "SCP to copy code to remote server" 99 | cd ../ 100 | scp -o "IdentitiesOnly yes" -i ./beginner_de_project/sde-key.pem -r ./beginner_de_project ubuntu@$EC2_IPV4:/home/ubuntu/beginner_de_project 101 | cd beginner_de_project 102 | 103 | echo "Clean up stale data" 104 | sleep 10 105 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && rm -f data.zip && rm -rf data' 106 | 107 | echo "Download data" 108 | sleep 10 109 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && wget https://start-data-engg.s3.amazonaws.com/data.zip && sudo unzip data.zip && sudo chmod 755 data' 110 | 111 | echo "Recreate logs and temp dir" 112 | sleep 10 113 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && rm -rf logs && mkdir logs && rm -rf temp && mkdir temp && chmod 777 temp' 114 | 115 | echo "Creating an AWS EMR Cluster named "$SERVICE_NAME"" 116 | aws emr create-default-roles >> setup.log 117 | aws emr create-cluster --applications Name=Hadoop Name=Spark --release-label emr-6.2.0 --name $SERVICE_NAME --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --service-role EMR_DefaultRole --instance-groups '[ 118 | { 119 | "InstanceCount": 1, 120 | "EbsConfiguration": { 121 | "EbsBlockDeviceConfigs": [ 122 | { 123 | "VolumeSpecification": { 124 | "SizeInGB": 32, 125 | "VolumeType": "gp2" 126 | }, 127 | "VolumesPerInstance": 2 128 | } 129 | ] 130 | }, 131 | "InstanceGroupType": "MASTER", 132 | "InstanceType": "'$EMR_NODE_TYPE'", 133 | "Name": "Master - 1" 134 | }, 135 | { 136 | "InstanceCount": 2, 137 | "BidPrice": "OnDemandPrice", 138 | "EbsConfiguration": { 139 | "EbsBlockDeviceConfigs": [ 140 | { 141 | "VolumeSpecification": { 142 | "SizeInGB": 32, 143 | "VolumeType": "gp2" 144 | }, 145 | "VolumesPerInstance": 2 146 | } 147 | ] 148 | }, 149 | "InstanceGroupType": "CORE", 150 | "InstanceType": "'$EMR_NODE_TYPE'", 151 | "Name": "Core - 
2" 152 | } 153 | ]' >> setup.log 154 | 155 | echo '{ 156 | "Version": "2012-10-17", 157 | "Statement": [ 158 | { 159 | "Effect": "Allow", 160 | "Principal": { 161 | "Service": "redshift.amazonaws.com" 162 | }, 163 | "Action": "sts:AssumeRole" 164 | } 165 | ] 166 | }' > ./trust-policy.json 167 | 168 | 169 | echo "Creating AWS IAM role for redshift spectrum S3 access" 170 | aws iam create-role --role-name $IAM_ROLE_NAME --assume-role-policy-document file://trust-policy.json --description 'spectrum access for redshift' >> setup.log 171 | 172 | echo "Attaching AmazonS3ReadOnlyAccess Policy to our IAM role" 173 | aws iam attach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess --output text >> setup.log 174 | echo "Attaching AWSGlueConsoleFullAccess Policy to our IAM role" 175 | aws iam attach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess --output text >> setup.log 176 | 177 | echo "Creating an AWS Redshift Cluster named "$SERVICE_NAME"" 178 | aws redshift create-cluster --cluster-identifier $SERVICE_NAME --node-type dc2.large --master-username $REDSHIFT_USER --master-user-password $REDSHIFT_PASSWORD --cluster-type single-node --publicly-accessible --iam-roles "arn:aws:iam::"$AWS_ID":role/"$IAM_ROLE_NAME"" >> setup.log 179 | 180 | while : 181 | do 182 | echo "Waiting for Redshift cluster "$SERVICE_NAME" to start, sleeping for 60s before next check" 183 | sleep 60 184 | REDSHIFT_CLUSTER_STATUS=$(aws redshift describe-clusters --cluster-identifier $SERVICE_NAME --query 'Clusters[0].ClusterStatus' --output text) 185 | if [[ "$REDSHIFT_CLUSTER_STATUS" == "available" ]] 186 | then 187 | break 188 | fi 189 | done 190 | 191 | REDSHIFT_HOST=$(aws redshift describe-clusters --cluster-identifier $SERVICE_NAME --query 'Clusters[0].Endpoint.Address' --output text) 192 | 193 | # TODO read the script from sql file 194 | echo "Running setup script on redshift" 195 | echo "CREATE EXTERNAL SCHEMA spectrum 196 | FROM DATA CATALOG DATABASE 'spectrumdb' iam_role 'arn:aws:iam::"$AWS_ID":role/"$IAM_ROLE_NAME"' CREATE EXTERNAL DATABASE IF NOT EXISTS; 197 | DROP TABLE IF EXISTS spectrum.user_purchase_staging; 198 | CREATE EXTERNAL TABLE spectrum.user_purchase_staging ( 199 | InvoiceNo VARCHAR(10), 200 | StockCode VARCHAR(20), 201 | detail VARCHAR(1000), 202 | Quantity INTEGER, 203 | InvoiceDate TIMESTAMP, 204 | UnitPrice DECIMAL(8, 3), 205 | customerid INTEGER, 206 | Country VARCHAR(20) 207 | ) PARTITIONED BY (insert_date DATE) 208 | ROW FORMAT DELIMITED 209 | FIELDS TERMINATED BY ',' 210 | STORED AS textfile 211 | LOCATION 's3://"$1"/stage/user_purchase/' 212 | TABLE PROPERTIES ('skip.header.line.count' = '1'); 213 | DROP TABLE IF EXISTS spectrum.classified_movie_review; 214 | CREATE EXTERNAL TABLE spectrum.classified_movie_review ( 215 | cid VARCHAR(100), 216 | positive_review boolean, 217 | insert_date VARCHAR(12) 218 | ) STORED AS PARQUET LOCATION 's3://"$1"/stage/movie_review/'; 219 | DROP TABLE IF EXISTS public.user_behavior_metric; 220 | CREATE TABLE public.user_behavior_metric ( 221 | customerid INTEGER, 222 | amount_spent DECIMAL(18, 5), 223 | review_score INTEGER, 224 | review_count INTEGER, 225 | insert_date DATE 226 | );" > ./redshift_setup.sql 227 | 228 | psql -f ./redshift_setup.sql postgres://$REDSHIFT_USER:$REDSHIFT_PASSWORD@$REDSHIFT_HOST:$REDSHIFT_PORT/dev 229 | rm ./redshift_setup.sql 230 | 231 | echo "Spinning up remote Airflow docker containers" 232 | sleep 60 233 | ssh -o "IdentitiesOnly yes" 
-i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && echo -e "AIRFLOW_UID=$(id -u)\nAIRFLOW_GID=0" > .env && docker compose up airflow-init && docker compose up --build -d' 234 | 235 | echo "Sleeping 5 Minutes to let Airflow containers reach a healthy state" 236 | sleep 300 237 | 238 | echo "adding redshift connections to Airflow connection param" 239 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow connections add 'redshift' --conn-type 'Postgres' --conn-login $REDSHIFT_USER --conn-password $REDSHIFT_PASSWORD --conn-host $REDSHIFT_HOST --conn-port $REDSHIFT_PORT --conn-schema 'dev'" 240 | 241 | echo "adding postgres connections to Airflow connection param" 242 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow connections add 'postgres_default' --conn-type 'Postgres' --conn-login 'airflow' --conn-password 'airflow' --conn-host 'localhost' --conn-port 5432 --conn-schema 'airflow'" 243 | 244 | echo "adding S3 bucket name to Airflow variables" 245 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow variables set BUCKET $1" 246 | 247 | echo "adding EMR ID to Airflow variables" 248 | EMR_CLUSTER_ID=$(aws emr list-clusters --active --query 'Clusters[?Name==`'$SERVICE_NAME'`].Id' --output text) 249 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow variables set EMR_ID $EMR_CLUSTER_ID" 250 | 251 | echo "set Airflow AWS region to "$AWS_REGION"" 252 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow connections add 'aws_default' --conn-type 'aws' --conn-extra '{\"region_name\":\"'$AWS_REGION'\"}'" 253 | 254 | echo "Successfully setup local Airflow containers, S3 bucket "$1", EMR Cluster "$SERVICE_NAME", redshift cluster "$SERVICE_NAME", and added config to Airflow connections and variables" 255 | 256 | echo "Forwardin Metabase port to http://localhost:3000" 257 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 -N -f -L 3000:$EC2_IPV4:3000 258 | 259 | echo "Opening Airflow UI ..." 260 | sleep 60 261 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 -N -f -L 8080:$EC2_IPV4:8080 262 | open http://localhost:8080 263 | --------------------------------------------------------------------------------