├── Batch-Data-Pipelines ├── assets │ └── images │ │ ├── metabase.png │ │ ├── de_proj_obj.png │ │ ├── de_proj_design.png │ │ └── de_proj_infra.png ├── infra_variables.txt ├── test │ └── dag │ │ └── test_dag_validity.py ├── dags │ ├── scripts │ │ ├── sql │ │ │ ├── unload_user_purchase.sql │ │ │ └── generate_user_behavior_metric.sql │ │ ├── emr │ │ │ └── clean_movie_review.json │ │ └── spark │ │ │ └── random_text_classification.py │ ├── utils.py │ └── user_behaviour.py ├── pgsetup │ └── create_user_purchase.sql ├── Makefile ├── setup_ubuntu_docker.txt ├── redshiftsetup │ └── setup.sql ├── tear_down_infra.sh ├── docker-compose.yaml └── setup_infra.sh ├── LICENSE └── README.md /Batch-Data-Pipelines/assets/images/metabase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/metabase.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/assets/images/de_proj_obj.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/de_proj_obj.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/assets/images/de_proj_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/de_proj_design.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/assets/images/de_proj_infra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rameshei87/Batch-Data-Pipelines/HEAD/Batch-Data-Pipelines/assets/images/de_proj_infra.png -------------------------------------------------------------------------------- /Batch-Data-Pipelines/infra_variables.txt: -------------------------------------------------------------------------------- 1 | AWS_REGION="us-east-1" 2 | AWS_EC2_INSTANCE="t2.large" 3 | EC2_IAM_ROLE="sde-ec2-s3-emr-rs-access" 4 | EC2_IMAGE_ID="ami-09d56f8956ab235b3" 5 | EC2_SECURITY_GROUP="sde-security-group" 6 | SERVICE_NAME="sde-batch-de-project" 7 | IAM_ROLE_NAME="sde-spectrum-redshift" 8 | REDSHIFT_USER="sde_user" 9 | REDSHIFT_PASSWORD="sdeP0ssword0987" 10 | REDSHIFT_PORT="5439" 11 | EMR_NODE_TYPE="m4.xlarge" -------------------------------------------------------------------------------- /Batch-Data-Pipelines/test/dag/test_dag_validity.py: -------------------------------------------------------------------------------- 1 | from airflow.models import DagBag 2 | 3 | 4 | def test_no_import_errors(monkeypatch): 5 | 6 | # Set variables 7 | monkeypatch.setenv("AIRFLOW_VAR_BUCKET", "test-bucket") 8 | monkeypatch.setenv("AIRFLOW_VAR_EMR_ID", "test-emr-id") 9 | 10 | dag_bag = DagBag() 11 | assert len(dag_bag.import_errors) == 0, "No Import Failures" 12 | assert dag_bag.size() == 1 13 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/sql/unload_user_purchase.sql: -------------------------------------------------------------------------------- 1 | COPY ( 2 | select invoice_number, 3 | stock_code, 4 | detail, 5 | quantity, 6 | invoice_date, 7 | unit_price, 8 | customer_id, 9 | country 10 | from retail.user_purchase -- we should have a 
date filter here to pull only required data 11 | ) TO '{{ params.user_purchase }}' WITH (FORMAT CSV, HEADER); 12 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/pgsetup/create_user_purchase.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA retail; 2 | 3 | CREATE TABLE retail.user_purchase ( 4 | invoice_number varchar(10), 5 | stock_code varchar(20), 6 | detail varchar(1000), 7 | quantity int, 8 | invoice_date timestamp, 9 | unit_price Numeric(8,3), 10 | customer_id int, 11 | country varchar(20) 12 | ); 13 | 14 | COPY retail.user_purchase(invoice_number,stock_code,detail,quantity,invoice_date,unit_price,customer_id,country) 15 | FROM '/input_data/OnlineRetail.csv' DELIMITER ',' CSV HEADER; 16 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/Makefile: -------------------------------------------------------------------------------- 1 | up: 2 | docker compose up airflow-init && docker compose up --build -d 3 | 4 | down: 5 | docker compose down 6 | 7 | sh: 8 | docker exec -ti webserver bash 9 | 10 | pytest: 11 | docker exec -ti webserver pytest -p no:warnings -v /opt/airflow/test 12 | 13 | format: 14 | docker exec -ti webserver python -m black -S --line-length 79 . 15 | 16 | isort: 17 | docker exec -ti webserver isort . 18 | 19 | type: 20 | docker exec -ti webserver mypy --ignore-missing-imports /opt/airflow 21 | 22 | lint: 23 | docker exec -ti webserver flake8 /opt/airflow/dags 24 | 25 | ci: isort format type lint pytest 26 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/setup_ubuntu_docker.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get -y update 4 | 5 | sudo apt-get -y install \ 6 | ca-certificates \ 7 | curl \ 8 | gnupg \ 9 | lsb-release 10 | 11 | sudo apt -y install unzip 12 | 13 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg 14 | 15 | echo \ 16 | "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ 17 | $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 18 | 19 | sudo apt-get -y update 20 | sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin 21 | sudo chmod 666 /var/run/docker.sock -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import psycopg2 4 | 5 | from airflow.hooks.postgres_hook import PostgresHook 6 | from airflow.hooks.S3_hook import S3Hook 7 | 8 | 9 | def _local_to_s3( 10 | bucket_name: str, key: str, file_name: str, remove_local: bool = False 11 | ) -> None: 12 | s3 = S3Hook() 13 | s3.load_file( 14 | filename=file_name, bucket_name=bucket_name, replace=True, key=key 15 | ) 16 | if remove_local: 17 | if os.path.isfile(file_name): 18 | os.remove(file_name) 19 | 20 | 21 | def run_redshift_external_query(qry: str) -> None: 22 | rs_hook = PostgresHook(postgres_conn_id="redshift") 23 | rs_conn = rs_hook.get_conn() 24 | rs_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) 25 | rs_cursor = rs_conn.cursor() 26 | rs_cursor.execute(qry) 27 | rs_cursor.close() 28 | 
rs_conn.commit() 29 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/sql/generate_user_behavior_metric.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM public.user_behavior_metric 2 | WHERE insert_date = '{{ ds }}'; 3 | INSERT INTO public.user_behavior_metric ( 4 | customerid, 5 | amount_spent, 6 | review_score, 7 | review_count, 8 | insert_date 9 | ) 10 | SELECT ups.customerid, 11 | CAST( 12 | SUM(ups.Quantity * ups.UnitPrice) AS DECIMAL(18, 5) 13 | ) AS amount_spent, 14 | SUM(mrcs.positive_review) AS review_score, 15 | count(mrcs.cid) AS review_count, 16 | '{{ ds }}' 17 | FROM spectrum.user_purchase_staging ups 18 | JOIN ( 19 | SELECT cid, 20 | CASE 21 | WHEN positive_review IS True THEN 1 22 | ELSE 0 23 | END AS positive_review 24 | FROM spectrum.classified_movie_review 25 | WHERE insert_date = '{{ ds }}' 26 | ) mrcs ON ups.customerid = mrcs.cid 27 | WHERE ups.insert_date = '{{ ds }}' 28 | GROUP BY ups.customerid; 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ramesh chinnaraj 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/emr/clean_movie_review.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Name": "Move raw data from S3 to HDFS", 4 | "ActionOnFailure": "CANCEL_AND_WAIT", 5 | "HadoopJarStep": { 6 | "Jar": "command-runner.jar", 7 | "Args": [ 8 | "s3-dist-cp", 9 | "--src=s3://{{ params.BUCKET_NAME }}/{{ params.raw_movie_review }}/{{ ds }}/", 10 | "--dest=/movie/{{ ds }}" 11 | ] 12 | } 13 | }, 14 | { 15 | "Name": "Classify movie reviews", 16 | "ActionOnFailure": "CANCEL_AND_WAIT", 17 | "HadoopJarStep": { 18 | "Jar": "command-runner.jar", 19 | "Args": [ 20 | "spark-submit", 21 | "s3://{{ params.BUCKET_NAME }}/scripts/random_text_classification.py", 22 | "--input=/movie/{{ ds }}", 23 | "--run-id={{ ds }}" 24 | ] 25 | } 26 | }, 27 | { 28 | "Name": "Move classified data from HDFS to S3", 29 | "ActionOnFailure": "CANCEL_AND_WAIT", 30 | "HadoopJarStep": { 31 | "Jar": "command-runner.jar", 32 | "Args": [ 33 | "s3-dist-cp", 34 | "--src=/output", 35 | "--dest=s3://{{ params.BUCKET_NAME }}/{{ params.stage_movie_review }}/{{ ds }}" 36 | ] 37 | } 38 | } 39 | ] -------------------------------------------------------------------------------- /Batch-Data-Pipelines/redshiftsetup/setup.sql: -------------------------------------------------------------------------------- 1 | -- This is run as part of the setup_infra.sh script 2 | CREATE EXTERNAL SCHEMA spectrum 3 | FROM DATA CATALOG DATABASE 'spectrumdb' iam_role 'arn:aws:iam::"$AWS_ID":role/"$IAM_ROLE_NAME"' CREATE EXTERNAL DATABASE IF NOT EXISTS; 4 | DROP TABLE IF EXISTS spectrum.user_purchase_staging; 5 | CREATE EXTERNAL TABLE spectrum.user_purchase_staging ( 6 | InvoiceNo VARCHAR(10), 7 | StockCode VARCHAR(20), 8 | detail VARCHAR(1000), 9 | Quantity INTEGER, 10 | InvoiceDate TIMESTAMP, 11 | UnitPrice DECIMAL(8, 3), 12 | customerid INTEGER, 13 | Country VARCHAR(20) 14 | ) PARTITIONED BY (insert_date DATE) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS textfile LOCATION 's3://"$1"/stage/user_purchase/' TABLE PROPERTIES ('skip.header.line.count' = '1'); 15 | DROP TABLE IF EXISTS spectrum.classified_movie_review; 16 | CREATE EXTERNAL TABLE spectrum.classified_movie_review ( 17 | cid VARCHAR(100), 18 | positive_review boolean, 19 | insert_date VARCHAR(12) 20 | ) STORED AS PARQUET LOCATION 's3://"$1"/stage/movie_review/'; 21 | DROP TABLE IF EXISTS public.user_behavior_metric; 22 | CREATE TABLE public.user_behavior_metric ( 23 | customerid INTEGER, 24 | amount_spent DECIMAL(18, 5), 25 | review_score INTEGER, 26 | review_count INTEGER, 27 | insert_date DATE 28 | ); 29 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/scripts/spark/random_text_classification.py: -------------------------------------------------------------------------------- 1 | # pyspark 2 | import argparse 3 | 4 | from pyspark.ml.feature import StopWordsRemover, Tokenizer 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.functions import array_contains, lit 7 | 8 | 9 | def random_text_classifier( 10 | input_loc: str, output_loc: str, run_id: str 11 | ) -> None: 12 | """ 13 | This is a dummy function to show how to use spark, It is supposed to mock 14 | the following steps 15 | 1. clean input data 16 | 2. use a pre-trained model to make prediction 17 | 3. 
write predictions to an HDFS output 18 | 19 | Since this is meant as an example, we are going to skip building a model; 20 | instead, we are naively going to mark reviews having the text "good" as 21 | positive and the rest as negative 22 | """ 23 | 24 | # read input 25 | df_raw = spark.read.option("header", True).csv(input_loc) 26 | # perform text cleaning 27 | 28 | # Tokenize text 29 | tokenizer = Tokenizer(inputCol="review_str", outputCol="review_token") 30 | df_tokens = tokenizer.transform(df_raw).select("cid", "review_token") 31 | 32 | # Remove stop words 33 | remover = StopWordsRemover( 34 | inputCol="review_token", outputCol="review_clean" 35 | ) 36 | df_clean = remover.transform(df_tokens).select("cid", "review_clean") 37 | 38 | # flag reviews that contain the word "good" as positive 39 | df_out = df_clean.select( 40 | "cid", 41 | array_contains(df_clean.review_clean, "good").alias("positive_review"), 42 | ) 43 | df_fin = df_out.withColumn("insert_date", lit(run_id)) 44 | # Parquet is a popular columnar storage format; we use it here 45 | df_fin.write.mode("overwrite").parquet(output_loc) 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( 51 | "--input", type=str, help="HDFS input", default="/movie" 52 | ) 53 | parser.add_argument( 54 | "--output", type=str, help="HDFS output", default="/output" 55 | ) 56 | parser.add_argument("--run-id", type=str, help="run id") 57 | args = parser.parse_args() 58 | spark = SparkSession.builder.appName( 59 | "Random Text Classifier" 60 | ).getOrCreate() 61 | random_text_classifier( 62 | input_loc=args.input, output_loc=args.output, run_id=args.run_id 63 | ) 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Batch-Data-Pipelines 2 | 3 | 4 | Design 5 | Setup 6 | Prerequisite 7 | Local run 8 | Deploy to AWS 9 | Stop 10 | Contributing 11 | Design 12 | 13 | We will be using Airflow to orchestrate the following: 14 | 15 | Classifying movie reviews with Apache Spark. 16 | Loading the classified movie reviews into the data warehouse. 17 | Extracting user purchase data from an OLTP database and loading it into the data warehouse. 18 | Joining the classified movie review data and user purchase data to get user behavior metric data. 19 | 20 | 21 | Data pipeline design 22 | 23 | Setup 24 | Prerequisite 25 | Docker with at least 4GB of RAM and Docker Compose v1.27.0 or later 26 | psql 27 | AWS account 28 | AWS CLI installed and configured 29 | Clone and cd into the project directory. 30 | 31 | git clone https://github.com/josephmachado/beginner_de_project.git 32 | cd beginner_de_project 33 | Local run 34 | When running locally, you can use the make command to manage infrastructure. We use the following Docker containers: 35 | 36 | ![image](https://user-images.githubusercontent.com/110036451/184506899-c59f4298-bfc1-410b-9be3-48618ec8c7c4.png) 37 | 38 | 39 | Airflow 40 | Postgres DB (as the Airflow metadata DB) 41 | Metabase for data visualization 42 | You can start the local containers as shown below. 43 | 44 | make up # start all containers 45 | make ci # runs format checks, type checks, static checks, and tests 46 | make down # stops the containers 47 | 48 | Since we cannot replicate AWS components locally, we have not set them up here. 49 | 50 | We have a DAG validity test defined in test/dag/test_dag_validity.py. 51 | 52 | Deploy to AWS 53 | To set up the AWS infrastructure, we have a script called setup_infra.sh.
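Besides provisioning the AWS resources, setup_infra.sh copies the project to the EC2 instance, starts the Airflow containers, and registers the Airflow connections and variables the DAG needs (BUCKET, EMR_ID, the redshift and postgres_default connections, and the aws_default region). If you ever need to recreate these by hand, the commands below are a rough sketch of what the script runs inside the webserver container; substitute your own bucket name, EMR cluster id, Redshift endpoint, and password.

docker exec webserver airflow variables set BUCKET your-bucket-name
docker exec webserver airflow variables set EMR_ID your-emr-cluster-id
docker exec webserver airflow connections add 'redshift' --conn-type 'Postgres' --conn-login sde_user --conn-password your-password --conn-host your-redshift-endpoint --conn-port 5439 --conn-schema 'dev'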
54 | 55 | Note: We run all of our infrastructure on AWS us-east-1. If you want to change this, please change the corresponding variables in infra_variables.txt. 56 | 57 | Setup can be run as shown below. 58 | 59 | make down # since our AWS infra will be port-forwarded to 8080 and 3000, which are used by the local Airflow and Metabase respectively 60 | ./setup_infra.sh {your-bucket-name} # e.g. ./setup_infra.sh my-test-bucket 61 | When prompted, enter yes to accept the SSH connection. 62 | 63 | This sets up the following components: 64 | 65 | 1 AWS EC2 instance, running Airflow and Metabase 66 | 1 AWS EMR cluster 67 | 1 AWS Redshift cluster 68 | 1 AWS S3 bucket 69 | The command will also open the Airflow UI running on the EC2 instance. You can also check out: 70 | 71 | Airflow at http://localhost:8080 (username and password are both airflow) 72 | Metabase at http://localhost:3000 73 | The first time you log in, create a username and password. To establish a connection to your Redshift cluster, you will need the Redshift host, which you can get using the command 74 | 75 | aws redshift describe-clusters --cluster-identifier sde-batch-de-project --query 'Clusters[0].Endpoint.Address' --output text 76 | The port, username, and password are in infra_variables.txt, and the database is dev. 77 | 78 | You can create dashboards in Metabase, as seen below. 79 | 80 | ![image](https://user-images.githubusercontent.com/110036451/184506920-8d19fd4d-d036-4074-bc4d-802c15bc55b7.png) 81 | 82 | 83 | Data pipeline design 84 | 85 | Stop 86 | When you are done, do not forget to turn off your AWS instances. In your terminal, run 87 | 88 | ./tear_down_infra.sh {your-bucket-name} # e.g. ./tear_down_infra.sh my-test-bucket 89 | This will tear down all the AWS services. Please double-check this in the AWS console (S3, EC2, EMR, & Redshift) or with the AWS CLI, as shown below.
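A quick CLI sanity check is sketched below (adjust --region if you changed it from us-east-1 in infra_variables.txt):

aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" --region us-east-1
aws emr list-clusters --active --region us-east-1
aws redshift describe-clusters --region us-east-1 --query 'Clusters[].ClusterIdentifier'
aws s3 ls

If the EC2 instance, EMR cluster, Redshift cluster, or your bucket still shows up after a few minutes, remove it from the console.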
90 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/tear_down_infra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -eq 0 ]] ; then 4 | echo 'Please enter your bucket name as ./tear_down_infra.sh your-bucket' 5 | exit 0 6 | fi 7 | 8 | AWS_ID=$(aws sts get-caller-identity --query Account --output text | cat) 9 | 10 | echo "Reading infrastructure variables from infra_variables.txt" 11 | source infra_variables.txt 12 | 13 | echo "Reading state values from state.log" 14 | source state.log 15 | 16 | echo "Deleting bucket "$1" and its contents" 17 | aws s3 rm s3://$1 --recursive --output text >> tear_down.log 18 | aws s3api delete-bucket --bucket $1 --output text >> tear_down.log 19 | 20 | echo "Terminating EC2 instance" 21 | aws ec2 terminate-instances --instance-ids $EC2_ID --region $AWS_REGION >> tear_down.log 22 | 23 | MY_IP=$(curl -s http://whatismyip.akamai.com/) 24 | 25 | echo "Delete EC2 security group ingress" 26 | aws ec2 revoke-security-group-ingress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 22 --cidr $MY_IP/24 --output text >> tear_down.log 27 | 28 | echo "Delete EC2 security group ingress" 29 | aws ec2 revoke-security-group-egress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 8080 --cidr $MY_IP/32 --output text >> tear_down.log 30 | 31 | echo "Terminating EMR cluster "$SERVICE_NAME"" 32 | EMR_CLUSTER_ID=$(aws emr list-clusters --active --query 'Clusters[?Name==`'$SERVICE_NAME'`].Id' --output text) 33 | aws emr terminate-clusters --cluster-ids $EMR_CLUSTER_ID >> tear_down.log 34 | 35 | echo "Deleting EC2 security group" 36 | sleep 60 37 | aws ec2 delete-security-group --group-id $EC2_SECURITY_GROUP_ID --output text >> tear_down.log 38 | 39 | echo "Terminating Redshift cluster "$SERVICE_NAME"" 40 | aws redshift delete-cluster --skip-final-cluster-snapshot --cluster-identifier $SERVICE_NAME --output text >> tear_down.log 41 | 42 | echo "Dissociating AmazonS3ReadOnlyAccess policy from "$IAM_ROLE_NAME" role" 43 | aws iam detach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess --output text >> tear_down.log 44 | echo "Dissociating AWSGlueConsoleFullAccess policy from "$IAM_ROLE_NAME" role" 45 | aws iam detach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess --output text >> tear_down.log 46 | echo "Deleting role "$IAM_ROLE_NAME"" 47 | aws iam delete-role --role-name $IAM_ROLE_NAME --output text >> tear_down.log 48 | 49 | EC2_IAM_ROLE=sde-ec2-s3-emr-rs-access 50 | echo "Remove role from instance profile" 51 | aws iam remove-role-from-instance-profile --instance-profile-name $EC2_IAM_ROLE-instance-profile --role-name $EC2_IAM_ROLE --output text >> tear_down.log 52 | 53 | echo "Deleting role instance profile "$EC2_IAM_ROLE"-instance-profile" 54 | aws iam delete-instance-profile --instance-profile-name $EC2_IAM_ROLE-instance-profile --output text >> tear_down.log 55 | 56 | echo "Dissociating AmazonS3FullAccess policy from "$EC2_IAM_ROLE" role" 57 | aws iam detach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --output text >> tear_down.log 58 | 59 | echo "Dissociating AmazonEMRFullAccessPolicy_v2 policy from "$EC2_IAM_ROLE" role" 60 | aws iam detach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonEMRFullAccessPolicy_v2 --output text >> tear_down.log 61 | 62 | echo 
"Dissociating AmazonRedshiftAllCommandsFullAccess policy from "$EC2_IAM_ROLE" role" 63 | aws iam detach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonRedshiftAllCommandsFullAccess --output text >> tear_down.log 64 | 65 | echo "Deleting role "$EC2_IAM_ROLE"" 66 | aws iam delete-role --role-name $EC2_IAM_ROLE --output text >> tear_down.log 67 | 68 | echo "Deleting SSH key" 69 | aws ec2 delete-key-pair --key-name sde-key --region $AWS_REGION >> setup.log 70 | rm -f sde-key.pem 71 | 72 | rm -f tear_down.log setup.log state.log trust-policy.json 73 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:master-python3.8 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_GID - Group ID in Airflow containers 31 | # Default: 50000 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. 35 | # Default: airflow 36 | # 37 | # Feel free to modify this file to suit your needs. 38 | --- 39 | version: '3' 40 | x-airflow-common: 41 | &airflow-common 42 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.0} 43 | environment: 44 | &airflow-common-env 45 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 46 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 47 | AIRFLOW__CORE__FERNET_KEY: '' 48 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 49 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 50 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 51 | AIRFLOW_CONN_POSTGRES_DEFAULT: postgres://airflow:airflow@postgres:5432/airflow 52 | # AWS_SHARED_CREDENTIALS_FILE: /opt/airflow/.aws/credentials 53 | # NOT RECOMMENDED FOR PRODUCTION!!! 
54 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- black flake8 mypy isort moto[all] pytest pytest-mock apache-airflow-client} 55 | volumes: 56 | - ./dags:/opt/airflow/dags 57 | - ./logs:/opt/airflow/logs 58 | - ./plugins:/opt/airflow/plugins 59 | - ./test:/opt/airflow/test 60 | - ./data:/opt/airflow/data 61 | - ./temp:/opt/airflow/temp 62 | user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" 63 | depends_on: 64 | postgres: 65 | condition: service_healthy 66 | 67 | services: 68 | postgres: 69 | container_name: postgres 70 | image: postgres:13 71 | environment: 72 | POSTGRES_USER: airflow 73 | POSTGRES_PASSWORD: airflow 74 | POSTGRES_DB: airflow 75 | volumes: 76 | - ./data:/input_data 77 | - ./temp:/temp 78 | - ./pgsetup:/docker-entrypoint-initdb.d 79 | healthcheck: 80 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 81 | interval: 5s 82 | retries: 5 83 | restart: always 84 | ports: 85 | - "5432:5432" 86 | 87 | airflow-webserver: 88 | <<: *airflow-common 89 | container_name: webserver 90 | command: webserver 91 | ports: 92 | - 8080:8080 93 | healthcheck: 94 | test: 95 | [ 96 | "CMD", 97 | "curl", 98 | "--fail", 99 | "http://localhost:8080/health" 100 | ] 101 | interval: 10s 102 | timeout: 10s 103 | retries: 5 104 | restart: always 105 | 106 | airflow-scheduler: 107 | <<: *airflow-common 108 | container_name: scheduler 109 | command: scheduler 110 | healthcheck: 111 | test: 112 | [ 113 | "CMD-SHELL", 114 | 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' 115 | ] 116 | interval: 10s 117 | timeout: 10s 118 | retries: 5 119 | restart: always 120 | 121 | airflow-init: 122 | <<: *airflow-common 123 | command: version 124 | environment: 125 | <<: *airflow-common-env 126 | _AIRFLOW_DB_UPGRADE: 'true' 127 | _AIRFLOW_WWW_USER_CREATE: 'true' 128 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 129 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 130 | 131 | dashboard: 132 | image: metabase/metabase 133 | container_name: dashboard 134 | ports: 135 | - "3000:3000" 136 | 137 | volumes: 138 | postgres-db-volume: 139 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/dags/user_behaviour.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | 4 | from utils import _local_to_s3, run_redshift_external_query 5 | 6 | from airflow import DAG 7 | from airflow.contrib.operators.emr_add_steps_operator import ( 8 | EmrAddStepsOperator, 9 | ) 10 | from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor 11 | from airflow.models import Variable 12 | from airflow.operators.dummy_operator import DummyOperator 13 | from airflow.operators.postgres_operator import PostgresOperator 14 | from airflow.operators.python import PythonOperator 15 | 16 | # Config 17 | BUCKET_NAME = Variable.get("BUCKET") 18 | EMR_ID = Variable.get("EMR_ID") 19 | EMR_STEPS = {} 20 | with open("./dags/scripts/emr/clean_movie_review.json") as json_file: 21 | EMR_STEPS = json.load(json_file) 22 | 23 | # DAG definition 24 | default_args = { 25 | "owner": "airflow", 26 | "depends_on_past": True, 27 | "wait_for_downstream": True, 28 | "start_date": datetime(2021, 5, 23), 29 | "email": ["airflow@airflow.com"], 30 | "email_on_failure": False, 31 | "email_on_retry": False, 32 | "retries": 2, 33 | "retry_delay": timedelta(minutes=1), 34 | } 35 | 36 | dag = DAG( 37 | "user_behaviour", 38 | default_args=default_args, 39 | 
schedule_interval="0 0 * * *", 40 | max_active_runs=1, 41 | ) 42 | 43 | extract_user_purchase_data = PostgresOperator( 44 | dag=dag, 45 | task_id="extract_user_purchase_data", 46 | sql="./scripts/sql/unload_user_purchase.sql", 47 | postgres_conn_id="postgres_default", 48 | params={"user_purchase": "/temp/user_purchase.csv"}, 49 | depends_on_past=True, 50 | wait_for_downstream=True, 51 | ) 52 | 53 | user_purchase_to_stage_data_lake = PythonOperator( 54 | dag=dag, 55 | task_id="user_purchase_to_stage_data_lake", 56 | python_callable=_local_to_s3, 57 | op_kwargs={ 58 | "file_name": "/opt/airflow/temp/user_purchase.csv", 59 | "key": "stage/user_purchase/{{ ds }}/user_purchase.csv", 60 | "bucket_name": BUCKET_NAME, 61 | "remove_local": "true", 62 | }, 63 | ) 64 | 65 | user_purchase_stage_data_lake_to_stage_tbl = PythonOperator( 66 | dag=dag, 67 | task_id="user_purchase_stage_data_lake_to_stage_tbl", 68 | python_callable=run_redshift_external_query, 69 | op_kwargs={ 70 | "qry": "alter table spectrum.user_purchase_staging add \ 71 | if not exists partition(insert_date='{{ ds }}') \ 72 | location 's3://" 73 | + BUCKET_NAME 74 | + "/stage/user_purchase/{{ ds }}'", 75 | }, 76 | ) 77 | 78 | movie_review_to_raw_data_lake = PythonOperator( 79 | dag=dag, 80 | task_id="movie_review_to_raw_data_lake", 81 | python_callable=_local_to_s3, 82 | op_kwargs={ 83 | "file_name": "/opt/airflow/data/movie_review.csv", 84 | "key": "raw/movie_review/{{ ds }}/movie.csv", 85 | "bucket_name": BUCKET_NAME, 86 | }, 87 | ) 88 | 89 | spark_script_to_s3 = PythonOperator( 90 | dag=dag, 91 | task_id="spark_script_to_s3", 92 | python_callable=_local_to_s3, 93 | op_kwargs={ 94 | "file_name": "./dags/scripts/spark/random_text_classification.py", 95 | "key": "scripts/random_text_classification.py", 96 | "bucket_name": BUCKET_NAME, 97 | }, 98 | ) 99 | 100 | start_emr_movie_classification_script = EmrAddStepsOperator( 101 | dag=dag, 102 | task_id="start_emr_movie_classification_script", 103 | job_flow_id=EMR_ID, 104 | aws_conn_id="aws_default", 105 | steps=EMR_STEPS, 106 | params={ 107 | "BUCKET_NAME": BUCKET_NAME, 108 | "raw_movie_review": "raw/movie_review", 109 | "text_classifier_script": "scripts/random_text_classifier.py", 110 | "stage_movie_review": "stage/movie_review", 111 | }, 112 | depends_on_past=True, 113 | ) 114 | 115 | last_step = len(EMR_STEPS) - 1 116 | 117 | wait_for_movie_classification_transformation = EmrStepSensor( 118 | dag=dag, 119 | task_id="wait_for_movie_classification_transformation", 120 | job_flow_id=EMR_ID, 121 | step_id='{{ task_instance.xcom_pull\ 122 | ("start_emr_movie_classification_script", key="return_value")[' 123 | + str(last_step) 124 | + "] }}", 125 | depends_on_past=True, 126 | ) 127 | 128 | generate_user_behavior_metric = PostgresOperator( 129 | dag=dag, 130 | task_id="generate_user_behavior_metric", 131 | sql="scripts/sql/generate_user_behavior_metric.sql", 132 | postgres_conn_id="redshift", 133 | ) 134 | 135 | end_of_data_pipeline = DummyOperator(task_id="end_of_data_pipeline", dag=dag) 136 | 137 | ( 138 | extract_user_purchase_data 139 | >> user_purchase_to_stage_data_lake 140 | >> user_purchase_stage_data_lake_to_stage_tbl 141 | ) 142 | ( 143 | [ 144 | movie_review_to_raw_data_lake, 145 | spark_script_to_s3, 146 | ] 147 | >> start_emr_movie_classification_script 148 | >> wait_for_movie_classification_transformation 149 | ) 150 | ( 151 | [ 152 | user_purchase_stage_data_lake_to_stage_tbl, 153 | wait_for_movie_classification_transformation, 154 | ] 155 | >> generate_user_behavior_metric 
156 | >> end_of_data_pipeline 157 | ) 158 | -------------------------------------------------------------------------------- /Batch-Data-Pipelines/setup_infra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -eq 0 ]] ; then 4 | echo 'Please enter your bucket name as ./setup_infra.sh your-bucket' 5 | exit 0 6 | fi 7 | 8 | # check if AWS is installed and configured 9 | # check if psql is installed 10 | 11 | AWS_ID=$(aws sts get-caller-identity --query Account --output text | cat) 12 | AWS_EC2_INSTANCE_NAME=sde-airflow-pg-$(openssl rand -base64 12) 13 | 14 | echo "Reading infrastructure variables from infra_variables.txt" 15 | source infra_variables.txt 16 | 17 | echo "Creating bucket "$1"" 18 | aws s3api create-bucket --acl public-read-write --region $AWS_REGION --bucket $1 --output text >> setup.log 19 | 20 | echo '{ 21 | "Version": "2012-10-17", 22 | "Statement": [ 23 | { 24 | "Effect": "Allow", 25 | "Principal": { 26 | "Service": "ec2.amazonaws.com" 27 | }, 28 | "Action": "sts:AssumeRole" 29 | } 30 | ] 31 | }' > ./trust-policy.json 32 | 33 | 34 | echo "Creating AWS IAM role for EC2 S3 access" 35 | aws iam create-role --role-name $EC2_IAM_ROLE --assume-role-policy-document file://trust-policy.json --description 'EC2 access to S3' --output text >> setup.log 36 | 37 | echo "Attaching AmazonS3FullAccess Policy to the previous IAM role" 38 | aws iam attach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --output text >> setup.log 39 | 40 | echo "Attaching AmazonEMRFullAccessPolicy_v2 Policy to the previous IAM role" 41 | aws iam attach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonEMRFullAccessPolicy_v2 --output text >> setup.log 42 | 43 | echo "Attaching AmazonRedshiftAllCommandsFullAccess Policy to the previous IAM role" 44 | aws iam attach-role-policy --role-name $EC2_IAM_ROLE --policy-arn arn:aws:iam::aws:policy/AmazonRedshiftAllCommandsFullAccess --output text >> setup.log 45 | 46 | echo 'Creating IAM instance profile to add to EC2' 47 | aws iam create-instance-profile --instance-profile-name $EC2_IAM_ROLE-instance-profile --output text >> setup.log 48 | aws iam add-role-to-instance-profile --role-name $EC2_IAM_ROLE --instance-profile-name $EC2_IAM_ROLE-instance-profile --output text >> setup.log 49 | 50 | echo "Creating ssh key to connect to EC2 instance" 51 | aws ec2 create-key-pair --key-name sde-key --query "KeyMaterial" --output text --region $AWS_REGION > sde-key.pem 52 | chmod 400 sde-key.pem 53 | 54 | MY_IP=$(curl -s http://whatismyip.akamai.com/) 55 | 56 | echo "Creating EC2 security group to only allow access from your IP $MY_IP" 57 | EC2_SECURITY_GROUP_ID=$(aws ec2 create-security-group --description "Security group to allow inbound SCP connection" --group-name $EC2_SECURITY_GROUP --output text) 58 | echo 'EC2_SECURITY_GROUP_ID="'$EC2_SECURITY_GROUP_ID'"' >> state.log 59 | 60 | echo "Add inbound rule to allow ssh from IP $MY_IP" 61 | aws ec2 authorize-security-group-ingress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 22 --cidr $MY_IP/24 --output text >> setup.log 62 | 63 | echo "Add outbound rule to allow our IP $MY_IP to connect to EC2's 8080 port" 64 | aws ec2 authorize-security-group-egress --group-id $EC2_SECURITY_GROUP_ID --protocol tcp --port 8080 --cidr $MY_IP/32 --output text >> setup.log 65 | 66 | echo "Creating EC2 instance" 67 | sleep 5 68 | aws ec2 run-instances --image-id $EC2_IMAGE_ID --instance-type 
$AWS_EC2_INSTANCE --count 1 --key-name sde-key --user-data file://setup_ubuntu_docker.txt --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='$AWS_EC2_INSTANCE_NAME'}]' --region $AWS_REGION >> setup.log 69 | 70 | echo "Get EC2 ID" 71 | sleep 20 72 | EC2_ID=$(aws --region $AWS_REGION ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=$AWS_EC2_INSTANCE_NAME" --query 'Reservations[*].Instances[*].[InstanceId]' --output text) 73 | echo "EC2 ID is $EC2_ID" 74 | echo 'EC2_ID="'$EC2_ID'"' >> state.log 75 | 76 | echo "Add security group to EC2" 77 | aws ec2 modify-instance-attribute --instance-id $EC2_ID --groups $EC2_SECURITY_GROUP_ID --output text >> setup.log 78 | 79 | while : 80 | do 81 | echo "Waiting for EC2 instance to start, sleeping for 60s before next check" 82 | sleep 60 83 | EC2_STATUS=$(aws ec2 describe-instance-status --instance-ids $EC2_ID --query 'InstanceStatuses[0].InstanceState.Name' --output text) 84 | if [[ "$EC2_STATUS" == "running" ]] 85 | then 86 | break 87 | fi 88 | done 89 | 90 | echo "Attach "$EC2_IAM_ROLE"-instance-profile to EC2 instance" 91 | aws ec2 associate-iam-instance-profile --instance-id $EC2_ID --iam-instance-profile Name=$EC2_IAM_ROLE-instance-profile --output text >> setup.log 92 | 93 | echo "Get EC2 IPV4" 94 | sleep 20 95 | EC2_IPV4=$(aws --region $AWS_REGION ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=instance-id,Values=$EC2_ID" --query 'Reservations[*].Instances[*].[PublicDnsName]' --output text) 96 | echo "EC2 IPV4 is $EC2_IPV4" 97 | 98 | echo "SCP to copy code to remote server" 99 | cd ../ 100 | scp -o "IdentitiesOnly yes" -i ./beginner_de_project/sde-key.pem -r ./beginner_de_project ubuntu@$EC2_IPV4:/home/ubuntu/beginner_de_project 101 | cd beginner_de_project 102 | 103 | echo "Clean up stale data" 104 | sleep 10 105 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && rm -f data.zip && rm -rf data' 106 | 107 | echo "Download data" 108 | sleep 10 109 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && wget https://start-data-engg.s3.amazonaws.com/data.zip && sudo unzip data.zip && sudo chmod 755 data' 110 | 111 | echo "Recreate logs and temp dir" 112 | sleep 10 113 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && rm -rf logs && mkdir logs && rm -rf temp && mkdir temp && chmod 777 temp' 114 | 115 | echo "Creating an AWS EMR Cluster named "$SERVICE_NAME"" 116 | aws emr create-default-roles >> setup.log 117 | aws emr create-cluster --applications Name=Hadoop Name=Spark --release-label emr-6.2.0 --name $SERVICE_NAME --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --service-role EMR_DefaultRole --instance-groups '[ 118 | { 119 | "InstanceCount": 1, 120 | "EbsConfiguration": { 121 | "EbsBlockDeviceConfigs": [ 122 | { 123 | "VolumeSpecification": { 124 | "SizeInGB": 32, 125 | "VolumeType": "gp2" 126 | }, 127 | "VolumesPerInstance": 2 128 | } 129 | ] 130 | }, 131 | "InstanceGroupType": "MASTER", 132 | "InstanceType": "'$EMR_NODE_TYPE'", 133 | "Name": "Master - 1" 134 | }, 135 | { 136 | "InstanceCount": 2, 137 | "BidPrice": "OnDemandPrice", 138 | "EbsConfiguration": { 139 | "EbsBlockDeviceConfigs": [ 140 | { 141 | "VolumeSpecification": { 142 | "SizeInGB": 32, 143 | "VolumeType": "gp2" 144 | }, 145 | "VolumesPerInstance": 2 146 | } 147 | ] 148 | }, 149 | "InstanceGroupType": "CORE", 150 | "InstanceType": "'$EMR_NODE_TYPE'", 151 | "Name": "Core - 
2" 152 | } 153 | ]' >> setup.log 154 | 155 | echo '{ 156 | "Version": "2012-10-17", 157 | "Statement": [ 158 | { 159 | "Effect": "Allow", 160 | "Principal": { 161 | "Service": "redshift.amazonaws.com" 162 | }, 163 | "Action": "sts:AssumeRole" 164 | } 165 | ] 166 | }' > ./trust-policy.json 167 | 168 | 169 | echo "Creating AWS IAM role for redshift spectrum S3 access" 170 | aws iam create-role --role-name $IAM_ROLE_NAME --assume-role-policy-document file://trust-policy.json --description 'spectrum access for redshift' >> setup.log 171 | 172 | echo "Attaching AmazonS3ReadOnlyAccess Policy to our IAM role" 173 | aws iam attach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess --output text >> setup.log 174 | echo "Attaching AWSGlueConsoleFullAccess Policy to our IAM role" 175 | aws iam attach-role-policy --role-name $IAM_ROLE_NAME --policy-arn arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess --output text >> setup.log 176 | 177 | echo "Creating an AWS Redshift Cluster named "$SERVICE_NAME"" 178 | aws redshift create-cluster --cluster-identifier $SERVICE_NAME --node-type dc2.large --master-username $REDSHIFT_USER --master-user-password $REDSHIFT_PASSWORD --cluster-type single-node --publicly-accessible --iam-roles "arn:aws:iam::"$AWS_ID":role/"$IAM_ROLE_NAME"" >> setup.log 179 | 180 | while : 181 | do 182 | echo "Waiting for Redshift cluster "$SERVICE_NAME" to start, sleeping for 60s before next check" 183 | sleep 60 184 | REDSHIFT_CLUSTER_STATUS=$(aws redshift describe-clusters --cluster-identifier $SERVICE_NAME --query 'Clusters[0].ClusterStatus' --output text) 185 | if [[ "$REDSHIFT_CLUSTER_STATUS" == "available" ]] 186 | then 187 | break 188 | fi 189 | done 190 | 191 | REDSHIFT_HOST=$(aws redshift describe-clusters --cluster-identifier $SERVICE_NAME --query 'Clusters[0].Endpoint.Address' --output text) 192 | 193 | # TODO read the script from sql file 194 | echo "Running setup script on redshift" 195 | echo "CREATE EXTERNAL SCHEMA spectrum 196 | FROM DATA CATALOG DATABASE 'spectrumdb' iam_role 'arn:aws:iam::"$AWS_ID":role/"$IAM_ROLE_NAME"' CREATE EXTERNAL DATABASE IF NOT EXISTS; 197 | DROP TABLE IF EXISTS spectrum.user_purchase_staging; 198 | CREATE EXTERNAL TABLE spectrum.user_purchase_staging ( 199 | InvoiceNo VARCHAR(10), 200 | StockCode VARCHAR(20), 201 | detail VARCHAR(1000), 202 | Quantity INTEGER, 203 | InvoiceDate TIMESTAMP, 204 | UnitPrice DECIMAL(8, 3), 205 | customerid INTEGER, 206 | Country VARCHAR(20) 207 | ) PARTITIONED BY (insert_date DATE) 208 | ROW FORMAT DELIMITED 209 | FIELDS TERMINATED BY ',' 210 | STORED AS textfile 211 | LOCATION 's3://"$1"/stage/user_purchase/' 212 | TABLE PROPERTIES ('skip.header.line.count' = '1'); 213 | DROP TABLE IF EXISTS spectrum.classified_movie_review; 214 | CREATE EXTERNAL TABLE spectrum.classified_movie_review ( 215 | cid VARCHAR(100), 216 | positive_review boolean, 217 | insert_date VARCHAR(12) 218 | ) STORED AS PARQUET LOCATION 's3://"$1"/stage/movie_review/'; 219 | DROP TABLE IF EXISTS public.user_behavior_metric; 220 | CREATE TABLE public.user_behavior_metric ( 221 | customerid INTEGER, 222 | amount_spent DECIMAL(18, 5), 223 | review_score INTEGER, 224 | review_count INTEGER, 225 | insert_date DATE 226 | );" > ./redshift_setup.sql 227 | 228 | psql -f ./redshift_setup.sql postgres://$REDSHIFT_USER:$REDSHIFT_PASSWORD@$REDSHIFT_HOST:$REDSHIFT_PORT/dev 229 | rm ./redshift_setup.sql 230 | 231 | echo "Spinning up remote Airflow docker containers" 232 | sleep 60 233 | ssh -o "IdentitiesOnly yes" 
-i "sde-key.pem" ubuntu@$EC2_IPV4 'cd beginner_de_project && echo -e "AIRFLOW_UID=$(id -u)\nAIRFLOW_GID=0" > .env && docker compose up airflow-init && docker compose up --build -d' 234 | 235 | echo "Sleeping 5 Minutes to let Airflow containers reach a healthy state" 236 | sleep 300 237 | 238 | echo "adding redshift connections to Airflow connection param" 239 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow connections add 'redshift' --conn-type 'Postgres' --conn-login $REDSHIFT_USER --conn-password $REDSHIFT_PASSWORD --conn-host $REDSHIFT_HOST --conn-port $REDSHIFT_PORT --conn-schema 'dev'" 240 | 241 | echo "adding postgres connections to Airflow connection param" 242 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow connections add 'postgres_default' --conn-type 'Postgres' --conn-login 'airflow' --conn-password 'airflow' --conn-host 'localhost' --conn-port 5432 --conn-schema 'airflow'" 243 | 244 | echo "adding S3 bucket name to Airflow variables" 245 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow variables set BUCKET $1" 246 | 247 | echo "adding EMR ID to Airflow variables" 248 | EMR_CLUSTER_ID=$(aws emr list-clusters --active --query 'Clusters[?Name==`'$SERVICE_NAME'`].Id' --output text) 249 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow variables set EMR_ID $EMR_CLUSTER_ID" 250 | 251 | echo "set Airflow AWS region to "$AWS_REGION"" 252 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 "docker exec -d webserver airflow connections add 'aws_default' --conn-type 'aws' --conn-extra '{\"region_name\":\"'$AWS_REGION'\"}'" 253 | 254 | echo "Successfully setup local Airflow containers, S3 bucket "$1", EMR Cluster "$SERVICE_NAME", redshift cluster "$SERVICE_NAME", and added config to Airflow connections and variables" 255 | 256 | echo "Forwardin Metabase port to http://localhost:3000" 257 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 -N -f -L 3000:$EC2_IPV4:3000 258 | 259 | echo "Opening Airflow UI ..." 260 | sleep 60 261 | ssh -o "IdentitiesOnly yes" -i "sde-key.pem" ubuntu@$EC2_IPV4 -N -f -L 8080:$EC2_IPV4:8080 262 | open http://localhost:8080 263 | --------------------------------------------------------------------------------