├── .DS_Store ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Deriving insights from transactional data using Redshift ML ├── Datasets │ ├── inference_part1.csv │ ├── inference_part2.csv │ ├── inference_part3.csv │ ├── inference_part4.csv │ └── train_data.csv ├── Notebook │ └── Redshift_Notebook.ipynb └── Readme.md ├── LICENSE ├── README.md ├── auto-ml-data-engineering ├── .DS_Store ├── Auto-ML.ipynb ├── Images │ ├── .DS_Store │ ├── Analyze.png │ ├── Batch_Inference_results.png │ ├── Build_pre_preview.png │ ├── Create_A_Model.png │ ├── Create_Standard_Build.png │ ├── Data_Flow_Create_Model.png │ ├── Data_Flow_Creation.png │ ├── Data_Flow_Data_Tab.png │ ├── Data_Flow_File_Selection.png │ ├── Data_Visualizer.png │ ├── Export Canvas Dataset.png │ ├── Import_Test_No_Label.png │ ├── Model_Created.png │ ├── My_Models.png │ ├── Predict.png │ ├── Screenshot 2024-10-16 at 2.59.19 PM.png │ ├── Single_Prediction.png │ ├── Target_Column.png │ ├── name_dataset.png │ ├── preview_data_test.png │ ├── select_test_data.png │ └── view_created_dataset.png ├── README.md └── data │ └── data.zip ├── aws-glue-sm-studio-integration ├── README.md ├── code │ ├── 1.DataProcessingGlue.ipynb │ ├── 1.DataProcessingUsingGlue.ipynb │ ├── 2.ModelBuildingDeployment.ipynb │ └── img │ │ ├── img1.png │ │ ├── img2.png │ │ └── img3.png └── img │ ├── img1.png │ ├── img2.png │ ├── img3.png │ ├── img4.png │ ├── img5.png │ └── img6.png ├── build-a-managed-analytics-platform-for-ecommerce-business ├── .DS_Store ├── .gitignore ├── LICENSE ├── README.md ├── code │ ├── .DS_Store │ ├── ecomm-simulation-app │ │ └── stream-data-app-simulation.py │ ├── flink-app │ │ └── sql-flink-ecomm-notebook-1.zpln │ └── serverless-app │ │ └── lambda_function.py └── img │ ├── .DS_Store │ ├── img-flink.png │ ├── img1.png │ ├── img10-1.png │ ├── img10.png │ ├── img11.png │ ├── img12.png │ ├── img13.png │ ├── img14.png │ ├── img15-1.png │ ├── img15.png │ ├── img16.png │ ├── img17-1.png │ ├── img17.png │ ├── img18-1.png │ ├── img18.png │ ├── img19-1.png │ ├── img19.png │ ├── img2-1.png │ ├── img2.png │ ├── img20.png │ ├── img21-1.png │ ├── img21.png │ ├── img22-1.png │ ├── img22.png │ ├── img23.png │ ├── img24-1.png │ ├── img24.png │ ├── img25-1.png │ ├── img25.png │ ├── img26-1.png │ ├── img26.png │ ├── img27.png │ ├── img28-1.png │ ├── img28.png │ ├── img29.png │ ├── img3-1.png │ ├── img3.png │ ├── img30-1.png │ ├── img30.png │ ├── img4.png │ ├── img5-1.png │ ├── img6-1.png │ ├── img7-1.png │ ├── img8-1.png │ └── img9-1.png ├── create-an-etl-pipeline-apache-spark ├── .DS_Store ├── LICENSE ├── README.md ├── emr-etl-job.py ├── images │ ├── Architecture.png │ ├── athena_q1.png │ ├── athena_q2.png │ ├── emr_1-new.png │ ├── emr_1.png │ ├── emr_2-new.png │ ├── emr_2.png │ ├── emr_3-new.png │ ├── emr_3.png │ ├── emr_4-new.png │ ├── emr_4.png │ ├── emr_terminate.png │ ├── glue_crawler_1.png │ ├── glue_crawler_2.png │ ├── glue_crawler_3.png │ ├── glue_crawler_4.png │ ├── glue_crawler_5.png │ ├── glue_db_delete.png │ ├── glue_run.png │ ├── glue_run_complete.png │ ├── glue_ui.png │ ├── key_pair.png │ ├── key_pair_2.png │ ├── s3_1-new.png │ ├── s3_1.png │ ├── s3_2-new.png │ ├── s3_2.png │ ├── s3_3.png │ ├── s3_cleaned_data.png │ ├── upload_csv-new.png │ └── upload_csv.png └── sql_queries.sql ├── dataset └── SalesData.csv ├── ml-sagemaker-studio ├── 01_glue_data_prep.ipynb ├── 02_model_building.ipynb └── 03_scheduled_inference.ipynb └── sagemaker-studio-emr-spark ├── README.md ├── code ├── CFN-SagemakerEMRNoAuthProductWithStudio-v3.yaml └── demo-sm-emr.ipynb └── img ├── 
img1.png ├── img2.png ├── img3.png ├── img4.png ├── img5.png └── img6.png /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/.DS_Store -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. 
As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /Deriving insights from transactional data using Redshift ML/Notebook/Redshift_Notebook.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"title":"demo_notebook_provisioned_formatted","kernelspec":{"display_name":"Redshift","language":"postgresql","name":"Redshift"},"language_info":{"file_extension":".sql","name":"Redshift"},"version":1},"nbformat":4,"nbformat_minor":0,"cells":[{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Create a new database named copy_data.\r\n","create database copy_data"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Create a new table for copying training data from S3\r\n","CREATE TABLE bank_details_training(\r\n"," age numeric,\r\n"," jobtype char (25),\r\n"," marital char (25),\r\n"," education char (25),\r\n"," default_col char (25),\r\n"," housing char (25),\r\n"," loan char (25),\r\n"," contact char (25),\r\n"," month char (25),\r\n"," day_of_week char (25),\r\n"," duration numeric,\r\n"," campaign numeric,\r\n"," pdays numeric,\r\n"," previous numeric,\r\n"," poutcome char (25),\r\n"," emp_var_rate numeric,\r\n"," cons_price_idx numeric, \r\n"," cons_conf_idx numeric, \r\n"," euribor3m numeric,\r\n"," nr_employed numeric,\r\n"," y char(1) ) ;"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Create a new table for copying inference data from S3\r\n","CREATE TABLE bank_details_inference(\r\n"," age numeric,\r\n"," jobtype char (25),\r\n"," marital char (25),\r\n"," education char (25),\r\n"," default_col char (25),\r\n"," housing char (25),\r\n"," loan char (25),\r\n"," contact char (25),\r\n"," month char (25),\r\n"," day_of_week char (25),\r\n"," duration numeric,\r\n"," campaign numeric,\r\n"," pdays numeric,\r\n"," previous numeric,\r\n"," poutcome char (25),\r\n"," emp_var_rate numeric,\r\n"," cons_price_idx numeric, \r\n"," cons_conf_idx numeric, \r\n"," euribor3m numeric,\r\n"," nr_employed numeric,\r\n"," y char(1) ) ;"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Update the placeholder for S3 bucket to point to the snapshot export location. 
eg: s3://bucket_name/snapshot_export/db_name/training_table_name/\r\n","-- Update the placeholder for IAM role to use the role that has access to S3 bucket. Make sure to attach it to the Redshift cluster\r\n","-- Run a COPY command to copy training data from S3 to bank_details_training\r\n","copy bank_details_training\r\n","from ''\r\n","FORMAT AS PARQUET\r\n","iam_role 'arn:aws:iam::123456789012:role/iam_role'"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Update the placeholder for S3 bucket to point to the snapshot export location. eg: s3://bucket_name/snapshot_export/db_name/inference_table_name/\r\n","-- Update the placeholder for IAM role to use the role that has access to S3 bucket. Make sure to attach it to the Redshift cluster\r\n","-- Run a COPY command to copy training data from S3 to bank_details_inference\r\n","copy bank_details_inference\r\n","from ''\r\n","FORMAT AS PARQUET\r\n","iam_role 'arn:aws:iam::123456789012:role/iam_role'"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Validation of training table load\r\n","select * from bank_details_training"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Creating a new training table to add new column for row number. This will be used for including only a subset of records from the original training table for training the ML model\r\n","create table bank_details_training_rnum as (select row_number() over (partition by 1) as rnum, t.* from bank_details_training t)"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Validation of records in the new training table. It should give 4000\r\n","select count(*) from bank_details_training_rnum where rnum > 119 order by 1"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Create ML model. Provide only the bucket name (without any prefix such as s3://)\r\n","CREATE MODEL td_subscription_demo\r\n","FROM \r\n","(SELECT age\r\n",",jobtype\r\n",",marital\r\n",",education\r\n",",default_col\r\n",",housing\r\n",",loan\r\n",",contact\r\n",",month\r\n",",day_of_week\r\n",",duration\r\n",",campaign\r\n",",pdays\r\n",",previous\r\n",",poutcome\r\n",",emp_var_rate\r\n",",cons_price_idx\r\n",",cons_conf_idx\r\n",",euribor3m\r\n",",nr_employed\r\n",",y\r\n","FROM bank_details_training_rnum\r\n","where rnum > 119\r\n",")\r\n","TARGET y\r\n","FUNCTION predict_td_subscription_demo\r\n","iam_role 'arn:aws:iam::123456789012:role/iam_role'\r\n","SETTINGS (\r\n"," S3_BUCKET 'bucket_name'\r\n",");"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Check model status. Wait until the status changes to READY. It takes approx. 1 hour for this sample dataset.\r\n","show model td_subscription_demo"],"cell_type":"code","execution_count":0,"outputs":[]},{"metadata":{"displayMode":"maximized","width":12,"isLimitOn":true},"source":["-- Check the model accuracy by comparing the output of ML model's prediction and the actual data. 
\r\n","\r\n","WITH infer_data\r\n"," AS (\r\n"," SELECT y as actual, predict_td_subscription_demo(age,jobtype,marital,education,default_col,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed) AS predicted,\r\n"," CASE WHEN actual = predicted THEN 1::INT\r\n"," ELSE 0::INT END AS correct\r\n"," FROM bank_details_inference\r\n"," ),\r\n"," aggr_data AS (\r\n"," SELECT SUM(correct) as num_correct, COUNT(*) as total FROM infer_data\r\n"," )\r\n","SELECT cast(num_correct as decimal(10,2)) AS Correct_Entries, cast(total as decimal(10,2)) AS Total_Entries, (Correct_Entries/Total_Entries) AS Accuracy FROM aggr_data;\r\n","\r\n","--Predict how many will subscribe for term deposit vs not subscribe\r\n","\r\n","WITH term_data AS ( SELECT predict_td_subscription_demo( age,jobtype,marital,education,default_col,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed) AS predicted\r\n","FROM bank_details_inference )\r\n","SELECT\r\n","CASE WHEN predicted = 'y' THEN 'Yes-will-do-a-term-deposit'\r\n"," WHEN predicted = 'n' THEN 'No-term-deposit'\r\n"," ELSE 'Neither' END as deposit_prediction,\r\n","COUNT(1) AS count\r\n","from term_data GROUP BY 1;"],"cell_type":"code","execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /Deriving insights from transactional data using Redshift ML/Readme.md: -------------------------------------------------------------------------------- 1 | # **Overview** 2 | This solution talks about how we can derive insights from transactional data residing in RDS MySQL by exporting the data to S3 and building ML model using Redshift ML without building complex ETL pipelines and not disrupting the source RDS database. The use case considered for this solution is the Bank Marketing data from https://archive.ics.uci.edu/ml/datasets/bank+marketing. This is a classification problem, where the goal is to predict if the customer will subscribe to a term deposit or not. 3 | 4 | Video recording of the demo is available at the following location: https://dofe88k1s0p2q.cloudfront.net/Full%20video.mp4 5 | 6 | The solution leverages the training and inference datasets of the following Redshift Immersion Lab: https://catalog.workshops.aws/redshift-immersion/en-US/lab17a 7 | 8 | **Prerequisites** 9 | 1/ Create IAM role with read and write access to S3. Attach it to the Redshift cluster. More details at https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_ExportSnapshot.html#USER_ExportSnapshot.SetupIAMRole. 10 | 11 | 2/ Create a symmetric encryption AWS KMS key for the server-side encryption. The KMS key will be used by the snapshot export task to set up AWS KMS server-side encryption when writing the export data to S3. The KMS key policy must include both the kms:Encrypt and kms:Decrypt permissions. More details at https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_ExportSnapshot.html. 12 | 13 | 3/ Create two RDS MySQL tables, one each for training and inference. For detailed steps on how to create RDS MySQL instance and connect using SQL client, please refer https://aws.amazon.com/getting-started/hands-on/create-mysql-db/. 
14 | 15 | **Table definition of training data** 16 | 17 | CREATE TABLE .( 18 | age numeric, 19 | jobtype char (25), 20 | marital char (25), 21 | education char (25), 22 | default_col char (25), 23 | housing char (25), 24 | loan char (25), 25 | contact char (25), 26 | month char (25), 27 | day_of_week char (25), 28 | duration numeric, 29 | campaign numeric, 30 | pdays numeric, 31 | previous numeric, 32 | poutcome char (25), 33 | emp_var_rate numeric, 34 | cons_price_idx numeric, 35 | cons_conf_idx numeric, 36 | euribor3m numeric, 37 | nr_employed numeric, 38 | y char(1) ) ; 39 | 40 | **Table definition of inference data** 41 | 42 | CREATE TABLE .( 43 | age numeric, 44 | jobtype char (25), 45 | marital char (25), 46 | education char (25), 47 | default_col char (25), 48 | housing char (25), 49 | loan char (25), 50 | contact char (25), 51 | month char (25), 52 | day_of_week char (25), 53 | duration numeric, 54 | campaign numeric, 55 | pdays numeric, 56 | previous numeric, 57 | poutcome char (25), 58 | emp_var_rate numeric, 59 | cons_price_idx numeric, 60 | cons_conf_idx numeric, 61 | euribor3m numeric, 62 | nr_employed numeric, 63 | y char(1) ) ; 64 | 65 | 4/ Load sample data in RDS MySQL 66 | LOAD DATA LOCAL INFILE '' INTO TABLE steps.train FIELDS TERMINATED BY ',' ignore 1 lines; 67 | 68 | LOAD DATA LOCAL INFILE ' Take snapshot 82 | 83 | 2/ Once the snapshot is created, select the snapshot and export to S3 by selecting the snapshot choose Actions > Export to Amazon S3 84 | 2.a/ If only the training and inference tables to be exported to S3, please choose 'Partial' under 'Exported data' section. In the box below, enter the names of the tables in the format schema.table_name separated by space 85 | 2.b/ Choose the appropriate bucket or create new one. 86 | 2.c/ Choose the appropriate IAM role that has write access to the above S3 bucket. 87 | 2.d/ Choose the appropriate KMS key created in prerequisites. 88 | 89 | 3/ Navigate to Redshift console and choose the Redshift provisioned cluster/serverless namespace created in prerequisites. Click 'Query data' button and 'Query in query editor v2'. 90 | 91 | 4/ Create new notebook and import the sample notebook given. 92 | 93 | 5/ Run cell by cell and follow the instructions given in the comments of each cell. (For model training, it will take about an hour). 94 | 95 | 6/ Once the notebook is run, please delete the following resources to save costs: 1/ Redshift cluster/serverless endpoint; 2/ RDS DB instance; 3/ Snapshots; 4/ S3 data exported; 5/ EC2 instance (if used for RDS connect). 96 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering On AWS 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /auto-ml-data-engineering/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/.DS_Store -------------------------------------------------------------------------------- /auto-ml-data-engineering/Auto-ML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 17, 6 | "id": "96c5f350-f21c-4f43-a0d8-b3c46a9322f0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "trip_url = \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet\"\n", 11 | "zone_lookup_url = \"https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv\"\n", 12 | "# import pandas module to read the given URLs into dataframes\n", 13 | "import pandas as pd\n", 14 | "trip_df = pd.read_parquet(trip_url)\n", 15 | "zone_lookup_df = pd.read_csv(zone_lookup_url)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 18, 21 | "id": "ca20486d-7759-4037-b111-d7fa7878c05f", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Perform data cleaning on trip_df. Drop missing values in the dataframe. Remove invalid values in the tip_amount and fare_amount columns\n", 26 | "trip_df = trip_df.dropna()\n", 27 | "trip_df = trip_df[(trip_df.tip_amount > 0) & (trip_df.fare_amount > 0)]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 19, 33 | "id": "8574359e-9cea-4b50-90fd-cdab735165f4", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Calculate the tip percentage by dividing the tip_amount by total_amount\n", 38 | "trip_df['tip_percentage'] = (trip_df['tip_amount'] / trip_df['total_amount']) * 100" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 20, 44 | "id": "bd61cd1b-6ba4-478d-b7f4-f9087fc19d20", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Merge the two DataFrames on the columns PULocationID and LocationID. 
Name it as trip_df\n", 49 | "trip_df = trip_df.merge(zone_lookup_df, left_on='PULocationID', right_on='LocationID')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 21, 55 | "id": "4ae62441-c0b0-425c-bb25-00469e82e3e0", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Perform one-hot encoding on the following categorical columns: VendorID, Borough, store_and_fwd_flag, and payment_type\n", 60 | "trip_df = pd.get_dummies(trip_df, columns=['VendorID', 'Borough', 'store_and_fwd_flag', 'payment_type'])" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 22, 66 | "id": "ff2a1085-4654-475e-9fdd-c33240352ec4", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Drop the following columns from the preprocessed DataFrame: tpep_pickup_datetime, tpep_dropoff_datetime, Zone, PULocationID, DOLocationID\n", 71 | "trip_df = trip_df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'Zone', 'PULocationID', 'DOLocationID', 'tip_amount', 'total_amount'], axis=1)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "d6ddd125-905e-46c9-8097-6e624fea9902", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# Partition trip_df into training, validation, and test sets and save it to s3://q-workshop-/lab4/tlc/outputs as CSV files using AWS SDK for S3\n", 82 | "from sklearn.model_selection import train_test_split\n", 83 | "train_df, val_test_df = train_test_split(trip_df, test_size=0.2, random_state=42)\n", 84 | "val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)\n", 85 | "import boto3\n", 86 | "s3 = boto3.client('s3')\n", 87 | "s3.put_object(Body=train_df.to_csv(index=False), Bucket='q-workshop-', Key='lab4/tlc/output/train.csv')\n", 88 | "s3.put_object(Body=val_df.to_csv(index=False), Bucket='q-workshop-', Key='lab4/tlc/output/validation.csv')\n", 89 | "s3.put_object(Body=test_df.to_csv(index=False), Bucket='q-workshop-', Key='lab4/tlc/output/test.csv')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 24, 95 | "id": "771ac2cb-43c3-4f5d-a4c5-9117db5e77e1", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "test_2_df = test_df.drop(['tip_percentage'], axis=1)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "be8d313e-7c8f-48de-a974-681033531ef7", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "s3.put_object(Body=test_2_df.to_csv(index=False), Bucket='q-workshop-', Key='lab4/tlc/output/test_2.csv')" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3 (ipykernel)", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.11.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 5 134 | } 135 | -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/.DS_Store 
-------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Analyze.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Analyze.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Batch_Inference_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Batch_Inference_results.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Build_pre_preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Build_pre_preview.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Create_A_Model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Create_A_Model.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Create_Standard_Build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Create_Standard_Build.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Data_Flow_Create_Model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Data_Flow_Create_Model.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Data_Flow_Creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Data_Flow_Creation.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Data_Flow_Data_Tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Data_Flow_Data_Tab.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Data_Flow_File_Selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Data_Flow_File_Selection.png -------------------------------------------------------------------------------- 
/auto-ml-data-engineering/Images/Data_Visualizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Data_Visualizer.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Export Canvas Dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Export Canvas Dataset.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Import_Test_No_Label.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Import_Test_No_Label.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Model_Created.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Model_Created.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/My_Models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/My_Models.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Predict.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Predict.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Screenshot 2024-10-16 at 2.59.19 PM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Screenshot 2024-10-16 at 2.59.19 PM.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Single_Prediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Single_Prediction.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/Target_Column.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/Target_Column.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/name_dataset.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/name_dataset.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/preview_data_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/preview_data_test.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/select_test_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/select_test_data.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/Images/view_created_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/Images/view_created_dataset.png -------------------------------------------------------------------------------- /auto-ml-data-engineering/README.md: -------------------------------------------------------------------------------- 1 | # SageMaker Notebooks Data Engineering with SageMaker Canvas Auto ML 2 | 3 | ## Authors: 4 | [Tanner Jones](https://github.com/Tjones1701) 5 | 6 | This project will discuss a continuation on from Lab 4 of the [Accelerate data engineering on AWS with Amazon Q Developer](https://catalog.us-east-1.prod.workshops.aws/workshops/2c7b3627-a1f8-4310-8e5b-211b306d10d9/en-US/lab4/sagemaker) 7 | 8 | ## Introduction 9 | 10 | Welcome to this hands-on lab where we'll explore the power of Amazon SageMaker in solving real-world data engineering and machine learning challenges. In this session, we'll step into the role of a data engineer working for a ride-sharing company operating in New York City. Our mission is to analyze the factors influencing passenger tipping behavior and build a machine learning model to predict the likelihood of a passenger leaving a tip based on various trip features. 11 | 12 | Using the New York City Taxi and Limousine Commission (TLC) dataset, which includes detailed information about yellow taxi trips, we'll leverage Amazon SageMaker's comprehensive suite of tools to process, analyze, and model this data. We'll work with two key datasets: the Yellow Taxi Trip Records and the Taxi Zone Lookup Table, combining them to create a rich feature set for our predictive model. 13 | 14 | Throughout this lab, we'll utilize various components of Amazon SageMaker, including Notebooks, Studio, Canvas, and Auto ML. We'll take you through the entire machine learning lifecycle, from data preparation and feature engineering to model training and evaluation. By the end of this session, you'll have hands-on experience in building a sophisticated machine learning solution that can provide valuable insights for our ride-sharing company, potentially improving driver earnings and customer satisfaction. 
15 | 16 | Let's dive in and discover how Amazon SageMaker can accelerate our data engineering and machine learning workflows in this exciting real-world scenario. 17 | 18 | This repo contains the SageMaker Studio Jupyter Notebook from the lab with a slight edit to account for the continuation where we will save multiple datasets to be used with the Auto ML job in SageMaker Canvas. 19 | 20 | ## Step 1: Running Sagemaker Notebook using Jupyter Lab 21 | 22 | The instructions for Step 1 are outlined in Lab 4 of the workshop. Once you login to SageMaker Studio, click on Jupyter Lab (either creating a new one or using the existing one from the workshop) and upload the Auto-ML notebook. Run all the cells sequentially and confirm that you have the appropriate files stored in S3. 23 | 24 | Remember, if you are not using the workshop to configure your accounts then you will need to create the appropriate bucket and folders to properly store the files otherwise you will need to adjust the jupyter notebook to account for the custom S3 structure you provide. 25 | 26 | You may also need to configure the appropriate IAM read and write permissions for the notebook to access the S3 bucket if you are not using the workshop. 27 | 28 | (Note: Once data is stored in S3, you are ready to move on) 29 | 30 | (Note: The training, validation, and test_no_labels sets will also be provided in the data folder if you would rather go through the Auto ML steps without performing data engineering) 31 | 32 | ## Step 2: Create a SageMaker Auto ML model 33 | 34 | ### Part 1: Create a Data Flow using Data Wrangler 35 | Select Import and Prepare -> Tabular 36 | 37 | ![Data Flow](./images/Data_Flow_Creation.png) 38 | 39 | Select the appropriate train and validation sets from S3. This is what we will use to train our model! 40 | 41 | ![Data Selection](./Images/Data_Flow_File_Selection.png) 42 | 43 | Ensure previewing the data shows the correct information and then select import. 44 | 45 | Navigate to the Data Wrangler -> Data Flow tab and you should see your newly created Data Flow 46 | 47 | ### Part 2: Model Creation 48 | 49 | Now we will begin creating the model. Select the three dots next to Data Types and select Create Model 50 | 51 | ![Data Flow Page](./Images/Data_Flow_Create_Model.png) 52 | 53 | Select the Target Column as `tip_percentage` 54 | 55 | ![Tip Percentage](./Images/Target_Column.png) 56 | 57 | Provide a good name for the Dataset and for the Model and make sure the problem type is set to prediction 58 | 59 | ![Create A Model](./Images/Create_A_Model.png) 60 | 61 | Once the model is created, navigate back to our Data Flow to view the new pipeline 62 | 63 | ![Model Created](./Images/Model_Created.png) 64 | 65 | From there, you can navigate to My Models tab on left hand side to view the new model there as well. 66 | 67 | ![My Models](./Images/My_Models.png) 68 | 69 | Go ahead and select the new Model and we will begin to Analyze the data 70 | 71 | ### Part 3: Data Visualization and Model Building 72 | 73 | For part 3, we will begin viewing the relationships between our data columns and we will also train our Auto ML model for prediction using the Standard Build feature. 74 | 75 | Select the Data Visualizer Tab to pull up a visualization of the correlation between variables in our data 76 | 77 | ![Data Visualizer](./Images/Data_Visualizer.png) 78 | 79 | This allows us to see how variables relate to one another. 
If two variables have a high correlation value, such as fare amount and trip distance, it means they are strongly dependent on one another. This makes sense, as your fare generally depends on how far you traveled. 80 | 81 | Navigate back to the model page. 82 | 83 | Now you can either preview the model to get a general sense of what it will do and which columns are most impactful to the prediction, or start the Standard Build. The Standard Build takes longer than the preview or the Quick Build but should provide higher accuracy. 84 | 85 | Use the dropdown to configure the build type. 86 | 87 | ![Create Standard Build](./Images/Create_Standard_Build.png) 88 | 89 | This process should take around 2-4 hours if you chose to run a Standard Build. The greyed-out buttons mean the model is building. 90 | 91 | ### Part 4: Analyze 92 | 93 | Once the model has completed the build process, you should be able to navigate to the Analyze screen. 94 | 95 | ![Analyze](./Images/Analyze.png) 96 | 97 | This page shows us the RMSE (Root Mean Squared Error) for our model as well as the variables with the most impact on the prediction. These may differ when you build the model yourself, depending on what the model deems most impactful. 98 | 99 | ### Part 5: Predict 100 | 101 | Now that our model has been built, it's time to test it with some of our own inputs. 102 | 103 | Navigate to the Predict page. 104 | 105 | ![Predict](./Images/Predict.png) 106 | 107 | This is the page where we perform predictive analysis with our model to see how well it performs on our own test cases. 108 | 109 | We have two options: batch prediction or single prediction. Part 5.5 covers how to create a manual batch prediction job, but for now we will use the single prediction feature. 110 | 111 | Select single prediction. 112 | 113 | ![Single Prediction](./Images/Single_Prediction.png) 114 | 115 | Now you can update the values in the fields below with appropriate inputs and see how much of an impact each one has on the overall prediction for tip percentage. 116 | 117 | For example, I updated the `mta_tax` variable to a value of `0.9`, and it had a `62.4%` impact on predicting `tip_percentage`. 118 | 119 | You can continue to experiment with these values to evaluate the accuracy of your model and the effect each variable has on the prediction. 120 | 121 | ### Part 5.5: Batch Prediction (Optional) 122 | 123 | Navigate back to the Data Flow page and click on Import tabular data again. Click through the S3 folder structure and select the `test_no_labels.csv` file. 124 | 125 | ![Test No Labels](./Images/Import_Test_No_Label.png) 126 | 127 | You can preview the data to ensure there is no `tip_percentage` column, as this is the column the model will be predicting. 128 | 129 | ![Test_data](./Images/preview_data_test.png) 130 | 131 | Select Import to import the data. 132 | 133 | Navigate back to the Data Flow page, click the 3 dots on the newly created Data Types node of the data source, and select Export -> Export data to Canvas dataset. 134 | 135 | ![Export Canvas dataset](./Images/Export%20Canvas%20Dataset.png) 136 | 137 | Give the dataset an appropriate name and click Create.
138 | 139 | Navigate back to the Data Flow page to view the newly created dataset 140 | 141 | ![view dataset](./Images/view_created_dataset.png) 142 | 143 | Navigate to the predictions page by selecting My Models -> The model you created -> predictions tab -> batch predictions -> Manual -> and select the dataset you previously created with the test data. 144 | 145 | ![Select Test Data](./Images/select_test_data.png) 146 | 147 | Once the model finishes generating the predictions, you should be able to see the completed job in the All Jobs table. Here you can click on the job and view the inference results. 148 | 149 | ![Batch Inference Resuts](./Images/Batch_Inference_results.png) 150 | 151 | ## Conclusion 152 | 153 | In this lab, we successfully demonstrated the power and versatility of Amazon SageMaker's suite of tools in handling complex data processing and machine learning tasks. We began by integrating two different data formats - CSV and Parquet - showcasing SageMaker's ability to work with diverse data sources. Through feature engineering and enrichment, we enhanced the quality and relevance of our dataset, preparing it for machine learning applications. 154 | 155 | The use of various SageMaker components - Notebooks, Studio, Canvas, and Auto ML - highlighted the platform's comprehensive approach to the machine learning lifecycle. This integrated ecosystem allowed us to move from data preparation to model training and deployment. 156 | 157 | Our final model, trained to predict tip percentages for New York City taxi rides, achieved an RMSE of 4.28, indicating a reasonably good fit considering the sparsity of the data. The model utilized 24 features, with trip_distance being the most influential, accounting for 20.32% of the prediction importance, followed by fare amount at 16.66%. This insight provides valuable information about the key factors affecting tipping behavior. 158 | 159 | The scale of this project was significant, with the training set containing over 49 million examples. Despite this large dataset, the entire lab could be completed in about 2-3 hours, with the model training phase taking anywhere from a couple of minutes to over an hour. This demonstrates the efficiency and scalability of SageMaker's Auto ML capabilities in handling large datasets. 160 | 161 | This lab not only achieved its technical objectives but also demonstrated the accessibility and efficiency of SageMaker in handling end-to-end machine learning projects, from data processing to model deployment and inference generation. The ability to process such a large dataset and train a model with meaningful results in a relatively short time frame underscores the power of cloud-based machine learning tools like SageMaker. 162 | 163 | In conclusion, this project showcases the practical applications of machine learning in real-world scenarios, providing valuable insights into tipping behavior that could benefit both drivers and ride-sharing companies. It also showcases how Amazon SageMaker can be leveraged to quickly and effectively develop machine learning solutions for complex problems with large datasets. 
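If you would like to double-check the RMSE reported by Canvas outside of the UI, the sketch below compares the batch-prediction output against the held-out `test.csv` labels. It is a minimal example under a few assumptions: the prediction results have been downloaded locally, the rows are in the same order as the test set, and both files expose a `tip_percentage` column (the exact file and column names produced by Canvas may differ, so treat them as placeholders).

```python
import numpy as np
import pandas as pd

# Held-out test set with ground-truth labels (created earlier in the notebook)
actuals = pd.read_csv("test.csv")

# Batch prediction results downloaded from SageMaker Canvas (hypothetical file name)
preds = pd.read_csv("canvas_batch_predictions.csv")

# Root Mean Squared Error between actual and predicted tip percentages
errors = actuals["tip_percentage"].to_numpy() - preds["tip_percentage"].to_numpy()
rmse = float(np.sqrt(np.mean(errors**2)))
print(f"RMSE: {rmse:.2f}")
```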
164 | -------------------------------------------------------------------------------- /auto-ml-data-engineering/data/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/auto-ml-data-engineering/data/data.zip -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/README.md: -------------------------------------------------------------------------------- 1 | # Serverless Data Prep with Glue Interactive Sessions (from `SageMaker Studio`) 2 | 3 | This demo shows how to use Apache Spark on AWS Glue to prepare two different datasets and then build an urban air quality predictor with Amazon SageMaker. 4 | 5 | ![Intro](img/img1.png) 6 | 7 | # Introduction 8 | 9 | While some organizations see data science, data engineering, and data analytics as separate, siloed functions, we increasingly see with many of our customers that data prep and analytics are foundational components of ML workflows. 10 | 11 | For example, although organizations have data engineering teams to clean and prepare data for analytics and ML, the specific data that a data scientist needs to train a particular model may not be available in the repository of data that the data engineering team has prepared. 12 | 13 | ![Intro](img/img2.png) 14 | 15 | # Problem Statement 16 | 17 | Let's take a problem and try to solve it. Air pollution in cities can be an acute problem, with damaging effects on people, animals, plants, and property. 18 | 19 | We need to build a machine learning model that can predict the amount of NO2 in an area based on weather conditions. 20 | 21 | Ultimately, we would like an ML model into which we feed the weather details of a particular city at a given time: mean temperature, maximum temperature, minimum temperature, and so on. 22 | 23 | The model should then predict the NO2 (nitrogen dioxide) concentration level at that time. 24 | 25 | ![Intro](img/img3.png) 26 | 27 | # Dataset 28 | 29 | For this demo we use the following datasets: 30 | 31 | - [OpenAQ physical air quality data](https://registry.opendata.aws/openaq/) : Global, aggregated physical air quality data from public data sources provided by government, research-grade and other sources. 32 | 42 GB of data. 33 | 34 | - [NOAA Global Surface Summary of Day](https://registry.opendata.aws/noaa-gsod/) : Global summary-of-day data for 18 surface meteorological elements, derived from the synoptic/hourly observations contained in USAF DATSAV3 Surface data and Federal Climate Complex Integrated Surface Hourly (ISH). 35 | 36 | # Code 37 | 38 | Follow the notebooks in sequence: 39 | 1. [Data preparation using AWS Glue with Spark](code/1.DataProcessingGlue.ipynb) 2.
[ML Model training and deployment using Amazon SageMaker](code/2.ModelBuildingDeployment.ipynb) 41 | -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/code/img/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/code/img/img1.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/code/img/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/code/img/img2.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/code/img/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/code/img/img3.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/img/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/img/img1.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/img/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/img/img2.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/img/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/img/img3.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/img/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/img/img4.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/img/img5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/img/img5.png -------------------------------------------------------------------------------- /aws-glue-sm-studio-integration/img/img6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/aws-glue-sm-studio-integration/img/img6.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/.DS_Store -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/.gitignore: -------------------------------------------------------------------------------- 1 | dataset 2 | dataset/* 3 | 4 | -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Suman Debnath 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/README.md: -------------------------------------------------------------------------------- 1 | # Build a managed analytics platform for an e-commerce business on AWS 2 | 3 | With the increase in popularity of online shopping, building an analytics platform for e-commerce is important for any organization, as it provides insights into the business, trends, and customer behavior. But more importantly, it can uncover hidden insights that can trigger revenue-generating business decisions and actions. In this blog, we will learn how to build a complete analytics platform in batch and real-time mode. The real-time analytics pipeline also shows how to detect distributed denial of service (DDoS) and bot attacks, which is a common requirement for such use cases. 4 | 5 | ## Introduction 6 | 7 | E-commerce analytics is the process of collecting data from all of the sources that affect a certain online business. Data Analysts or Business Analysts can then utilize this information to deduce changes in customer behavior and online shopping patterns. E-commerce analytics spans the whole customer journey, starting from discovery through acquisition, conversion, and eventually retention and support. 8 | 9 | In this two part blog series, we will build an e-commerce analytical platform which can help to analyze the data in real time as well as in batch. 
We will use an eCommerce dataset from [Kaggle]((https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store) 10 | ) to simulate the logs of user purchases, product views, cart history, and the user’s journey on the online platform to create two analytical pipelines: 11 | 12 | - Batch Processing 13 | - Online/Real-time Processing 14 | 15 | You may like to refer to [this session](https://www.youtube.com/watch?v=DvCjIVcs9KA&t=16s&ab_channel=AWSEvents) for a video walk-through. 16 | 17 | ![Img1](img/img1.png) 18 | 19 | **Batch Processing** 20 | 21 | The `Batch processing` will involve data ingestion, Lake House architecture, processing, visualization using Amazon Kinesis, AWS Glue, Amazon S3, and Amazon QuickSight to draw insights regarding the following: 22 | 23 | - Unique visitors per day 24 | 25 | - During a certain time, the users add products to their carts but don’t buy them 26 | 27 | - Top categories per hour or weekday (i.e. to promote discounts based on trends) 28 | 29 | - To know which brands need more marketing 30 | 31 | **Online/Real-time Processing** 32 | 33 | The `Real-time processing` would involve detecting DDoS and bot attacks using AWS Lambda, Amazon DynamoDB, Amazon CloudWatch, and AWS SNS. 34 | 35 | This is the first part of the blog series, where we will focus only on the **Online/Real-time processing** data pipeline. In the second part of the blog series, we will dive into the **Batch Processing** 36 | 37 | ## Dataset 38 | 39 | For this blog, we are going to use the [eCommerce behavior data from multi category store](https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store) 40 | 41 | This file contains the behavior data for 7 months (from October 2019 to April 2020) from a large multi-category online store, where each row in the file represents an event. All events are related to products and users. Each event is like many-to-many relation between products and users. 42 | 43 | ## Architecture 44 | 45 | **Real-time Processing** 46 | 47 | We are going to build an end to end data engineering pipeline where we will start with this [eCommerce behavior data from multi category store](https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store) dataset as an input, which we will use to simulate real time e-commerce workload. 48 | 49 | This input `raw` stream of data will go into an **Amazon Kinesis Data Stream** (`stream1`), which will stream the data to **Amazon Kinesis Data Analytics** for analysis, where we will use an [Apache Flink application](https://docs.aws.amazon.com/kinesisanalytics/latest/java/what-is.html) to detect any DDoS attack, and the `filtered` data will be sent to another Amazon Kinesis Data Stream (`stream2`). 50 | 51 | We are going to use **SQL** to build the `Apache Flink` application using **Amazon Kinesis Data Analytics** and hence, we would need a metadata store, for which we are going to use **AWS Glue** Data Catalog. 52 | 53 | And then this `stream2` will trigger an **AWS Lambda** function which will send an **Amazon SNS** notification to the stakeholders and shall store the fraudulent transaction details in a **DynamoDB** table. The architecture would look like this: 54 | 55 | ![Img1](img/img2-1.png) 56 | 57 | **Batch Processing** 58 | 59 | If we look into the architecture diagram above, we will see that we are not storing the `raw` incoming data anywhere. 
As the data enters through **Kinesis Data Stream** (`stream1`) we are passing it to **Kinesis Data Analytics** to analyze. And it might happen that later on we discover some bug in our `Apache Flink` application, and at that point, we will fix the bug and resume processing the data, but we cannot process the old data (which was processed by our buggy `Apache Flink` application. And this is because we have not stored the `raw` data anywhere which can allow us to re-process later. 60 | 61 | And that's why it's recommended to always have a copy of the `raw` data stored in some storage (e.g. on Amazon S3) so that we can revisit the data if needed for reprocessing and/or for batch processing. 62 | 63 | And this is exactly what we are going to do. We will use the same incoming data stream from Amazon Kinesis Data Stream (`stream1`) and pass it on to **Kinesis Firehose** which can write the data on **S3**. Then we will use **Glue** to catalog that data and perform an ETL job using Glue ETL to process/clean that data so that we can further use the data for running some analytical queries using **Athena**. 64 | 65 | At last, we would leverage **QuickSight** to build a dashboard for visualization. 66 | 67 | ![Img1](img/img3-1.png) 68 | 69 | ## Step by step walk through 70 | 71 | Let's build this application step by step. I'm going to use an [AWS Cloud9 instance](https://aws.amazon.com/cloud9/) for this project, but it is not mandatory. But if you wish to spin up an AWS Cloud9 instance, you may like to follow steps mentions [here](https://docs.aws.amazon.com/cloud9/latest/user-guide/create-environment-main.html) and proceed further. 72 | 73 | 74 | ### Download the dataset and clone the GirHub Repo 75 | 76 | Clone the project and change it to the right directory: 77 | 78 | ```bash 79 | 80 | # CLone the project repository 81 | git clone https://github.com/debnsuma/build-a-managed-analytics-platform-for-ecommerce-business.git 82 | 83 | cd build-a-managed-analytics-platform-for-ecommerce-business/ 84 | 85 | # Create a folder to store the dataset 86 | mkdir dataset 87 | 88 | ``` 89 | 90 | Download the dataset from [here](https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store) and move the downloaded file (`2019-Nov.csv.zip`) under the `dataset` folder 91 | 92 | ![img](img/img4.png) 93 | 94 | Now, let's unzip the file and create a sample version of the dataset by just taking the first `1000` records from the file. 95 | 96 | ```bash 97 | 98 | cd dataset 99 | 100 | unzip 2019-Nov.csv.zip 101 | 102 | cat 2019-Nov.csv | head -n 1000 > 202019-Nov-sample.csv 103 | 104 | ``` 105 | 106 | ### Create an Amazon S3 bucket 107 | 108 | Now we can create a S3 bucket and upload this dataset 109 | 110 | - Name of the Bucket : `ecommerce-raw-us-east-1-dev` (replace this with your own ``) 111 | 112 | ```bash 113 | 114 | # Copy all the files in the S3 bucket 115 | aws s3 cp 2019-Nov.csv.zip s3:///ecomm_user_activity/p_year=2019/p_month=11/ 116 | aws s3 cp 202019-Nov-sample.csv s3:///ecomm_user_activity_sample/202019-Nov-sample.csv 117 | aws s3 cp 2019-Nov.csv s3:///ecomm_user_activity_unconcompressed/p_year=2019/p_month=11/ 118 | 119 | ``` 120 | 121 | ### Create the Kinesis Data Stream 122 | 123 | Now, let's create the first Kinesis data stream (`stream1` in our architecture diagram) which we will be using as the incoming stream. 
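Before walking through the console, note that the same two data streams used in this post can also be created from the AWS CLI. This is only a sketch; the shard count and region are assumptions for this demo and not part of the original walkthrough.

```bash
# Create the incoming (raw) stream and the filtered (outgoing) stream
# NOTE: shard count and region are assumptions for this demo
aws kinesis create-stream \
    --stream-name ecommerce-raw-user-activity-stream-1 \
    --shard-count 1 \
    --region us-east-1

aws kinesis create-stream \
    --stream-name ecommerce-raw-user-activity-stream-2 \
    --shard-count 1 \
    --region us-east-1
```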
Open the **AWS Console** and then: 124 | 125 | - Go to **Amazon Kinesis** 126 | - Click on **Create data stream** 127 | 128 | ![](img/img5-1.png) 129 | 130 | - Put `ecommerce-raw-user-activity-stream-1` as the Data stream name 131 | - Click on **Create data stream** 132 | 133 | ![](img/img6-1.png) 134 | 135 | 136 | Let's create another Kinesis data stream which we are going to use later on (`stream2` in the architecture diagram). This time use the Data stream name as `ecommerce-raw-user-activity-stream-2` 137 | 138 | ![](img/img7-1.png) 139 | 140 | ### Start the e-commerce traffic 141 | 142 | We can now start the e-commerce traffic, as our Kinesis data stream is ready. This simulator which we are going to use is a simple `python script` which will read the data from a CSV file (`202019-Nov-sample.csv`, the dataset which we downloaded earlier) line by line and send it to the Kinesis data stream (`stream1`). 143 | 144 | But before you run the simulator, just edit the `stream-data-app-simulation.py` script with your ** where we have our dataset. 145 | 146 | 147 | ```python 148 | 149 | # S3 buckect details (UPDATE THIS) 150 | BUCKET_NAME = "ecommerce-raw-us-east-1-dev" 151 | ``` 152 | 153 | Once it's updated, we can run the simulator. 154 | 155 | ```bash 156 | # Go back to the project root directory 157 | cd .. 158 | 159 | # Run simulator 160 | pip install boto3 161 | python code/ecomm-simulation-app/stream-data-app-simulation.py 162 | 163 | HttpStatusCode: 200 , electronics.smartphone 164 | HttpStatusCode: 200 , appliances.sewing_machine 165 | HttpStatusCode: 200 , 166 | HttpStatusCode: 200 , appliances.kitchen.washer 167 | HttpStatusCode: 200 , electronics.smartphone 168 | HttpStatusCode: 200 , computers.notebook 169 | HttpStatusCode: 200 , computers.notebook 170 | HttpStatusCode: 200 , 171 | HttpStatusCode: 200 , 172 | HttpStatusCode: 200 , electronics.smartphone 173 | 174 | ``` 175 | 176 | ### Integration with Kinesis Data Analytics and Apache Flink 177 | 178 | Now, we will create an **Amazon Kinesis Data Analytics** Streaming Application which will analyze this incoming stream for any DDoS or bot attack. Open the AWS Console and then: 179 | 180 | - Go to **Amazon Kinesis** 181 | - Select **Analytics applications** 182 | - Click on **Studio notebooks** 183 | - Click on **Create Studio notebook** 184 | 185 | ![](img/img8-1.png) 186 | 187 | - Use `ecomm-streaming-app-v1` as the **Studio notebook name** 188 | - Under the **Permissions** section, click on `Create` to create an AWS Glue database, name the database as `my-db-ecomm` 189 | - Use the same database, `my-db-ecomm` from the dropdown 190 | - Click on **Create Studio notebook** 191 | 192 | ![](img/img9-1.png) 193 | 194 | Now, select the `ecomm-streaming-app-v1` Studio notebook and click on **Open in Apache Zeppelin** 195 | 196 | ![](img/img10-1.png) 197 | 198 | Once the **Zeppelin Dashboard** comes up, click on `Import note` and import this [notebook](/code/flink-app/sql-flink-ecomm-notebook-1.zpln) 199 | 200 | ![](img/img11.png) 201 | 202 | Open the `sql-flink-ecomm-notebook-1` notebook. Flink interpreter supported by Apache Zeppelin notebook are Python, IPython, stream SQL, or batch SQL, and we are going to use `SQL` to write our code. There are many different ways to create a **Flink Application** but one of the easiest way is to use Zeppelin notebook. 
Let's look at this notebook and briefly discuss what are we doing here: 203 | 204 | 205 | - First we are creating a `table` for the incoming source of data (which is the `ecommerce-raw-user-activity-stream-1` incoming stream) 206 | - Next we are creating another `table` for the filtered data (which is for the `ecommerce-raw-user-activity-stream-2` outgoing stream) 207 | - And finally we are putting the logic to simulate the **DDoSS** attack. We are essentially looking into the last 10 seconds of the data and grouping them by `user_id`. And if we notice more than 5 records within that 10 seconds, Flink will take that `user_id` and the no. of records within those 10 seconds and will push that data to the `ecommerce-raw-user-activity-stream-2` outgoing stream. 208 | 209 | ![](/img/img-flink.png) 210 | 211 | Since we are working within a dummy environment, we can set the threshold record to any other number (not just 5). The idea is to simulate DDoS attack, and if we see the same user is adding/viewing/placing orders (lets say, `5 products in last 10 seconds, by user_id 1`), we can assume its a DDoS/BOT attack, as it's naturally not feasible. We are hardcoding it just for this demo purpose, but in real world this might be coming dynamically from a configuration file. 212 | 213 | ```sql 214 | 215 | %flink.ssql 216 | 217 | /*Option 'IF NOT EXISTS' can be used, to protect the existing Schema */ 218 | DROP TABLE IF EXISTS ecomm_user_activity_stream_1; 219 | 220 | CREATE TABLE ecomm_user_activity_stream_1 ( 221 | `event_time` VARCHAR(30), 222 | `event_type` VARCHAR(30), 223 | `product_id` BIGINT, 224 | `category_id` BIGINT, 225 | `category_code` VARCHAR(30), 226 | `brand` VARCHAR(30), 227 | `price` DOUBLE, 228 | `user_id` BIGINT, 229 | `user_session` VARCHAR(30), 230 | `txn_timestamp` TIMESTAMP(3), 231 | WATERMARK FOR txn_timestamp as txn_timestamp - INTERVAL '10' SECOND 232 | ) 233 | PARTITIONED BY (category_id) 234 | WITH ( 235 | 'connector' = 'kinesis', 236 | 'stream' = 'ecommerce-raw-user-activity-stream-1', 237 | 'aws.region' = 'us-east-1', 238 | 'scan.stream.initpos' = 'LATEST', 239 | 'format' = 'json', 240 | 'json.timestamp-format.standard' = 'ISO-8601' 241 | ); 242 | 243 | /*Option 'IF NOT EXISTS' can be used, to protect the existing Schema */ 244 | DROP TABLE IF EXISTS ecomm_user_activity_stream_2; 245 | 246 | CREATE TABLE ecomm_user_activity_stream_2 ( 247 | `user_id` BIGINT, 248 | `num_actions_per_watermark` BIGINT 249 | ) 250 | WITH ( 251 | 'connector' = 'kinesis', 252 | 'stream' = 'ecommerce-raw-user-activity-stream-2', 253 | 'aws.region' = 'us-east-1', 254 | 'format' = 'json', 255 | 'json.timestamp-format.standard' = 'ISO-8601' 256 | ); 257 | 258 | /* Inserting aggregation into Stream 2*/ 259 | insert into ecomm_user_activity_stream_2 260 | select user_id, count(1) as num_actions_per_watermark 261 | from ecomm_user_activity_stream_1 262 | group by tumble(txn_timestamp, INTERVAL '10' SECOND), user_id 263 | having count(1) > 5; 264 | 265 | ``` 266 | 267 | ### Create the Apache Flink Application 268 | 269 | Now that we have our notebook imported, we can create the **Flink Application** from the notebook directly. And to do that: 270 | 271 | - Click on `Actions for ecomm-streaming-app-v1` on the top right corner 272 | 273 | ![](img/img12.png) 274 | 275 | - Click on `Build sql-flink-ecomm-notebook-1` > `Build and export`. 
It will compile all the codes, will create a ZIP file and would store the file on S3 276 | 277 | ![](img/img13.png) 278 | 279 | - And now we can deploy that application by simply clicking on `Actions for ecomm-streaming-app-v1` on the top right corner 280 | 281 | - Click on `Deploy sql-flink-ecomm-notebook-1 as Kinesis Analytics application` > `Deploy using AWS Console` 282 | 283 | - Scroll down and click on `Save changes` 284 | 285 | ![](img/img14.png) 286 | 287 | This is the power of **Kinesis Data Analytics**, just from a simple Zeppelin Notebook we can create a real world application without any hindrance. 288 | 289 | - Finally we can start the application by clicking on **Run**. It might take couple of minutes to start the application so lets wait till we see **Status** as `Running` 290 | 291 | ![](img/img15-1.png) 292 | 293 | ### Alarming DDoS Attack 294 | 295 | If we revisit our architecture, we will see that we are almost done with the **real-time/online processing**, the only thing which is pending is to create a Lambda function which will be triggered whenever there is a entry of a record inside the `ecommerce-raw-user-activity-stream-2` stream. And the Lambda function would perform the following: 296 | - Write that record into a **DynamoDB** table 297 | - Send a **SNS** notification 298 | - Update the **CloudWatch** metrics 299 | 300 | ![](img/img16.png) 301 | 302 | Let's first build the code for the Lambda function, the code is available under [`code/serverless-app`](code/serverless-app/lambda_function.py) folder 303 | 304 | ```bash 305 | # Install the aws_kinesis_agg package 306 | cd code/serverless-app/ 307 | pip install aws_kinesis_agg -t . 308 | 309 | # Build the lambda package and download the zip file. 310 | zip -r ../lambda-package.zip . 311 | 312 | # Upload the zip to S3 313 | cd .. 314 | aws s3 cp lambda-package.zip s3://ecommerce-raw-us-east-1-dev/src/lambda/ 315 | 316 | ``` 317 | 318 | Now, let's create the Lambda function 319 | 320 | - Open the **AWS Lambda** console 321 | - Click on **Create function** button 322 | 323 | ![](img/img17-1.png) 324 | 325 | - Enter the Function name as `ecomm-detect-high-event-volume` 326 | - Enter the Runtime as `Python 3.7` 327 | - Click on **Create function** 328 | 329 | ![](img/img18-1.png) 330 | 331 | Once the Lambda function is created we need to upload the code which we stored in S3. 332 | 333 | ![](img/img19-1.png) 334 | 335 | Provide the location of the Lambda code and click on **Save** 336 | 337 | ![](img/img20.png) 338 | 339 | We need to provide adequate privileges to our Lambda function so that it can talk to Kinesis Data Streams, DynamoDB, CloudWatch and SNS. To modify the IAM Role: 340 | 341 | - Go to **Configuration** tab > **Permission** tab on the left 342 | - Click on the **Role Name** 343 | 344 | ![](img/img21-1.png) 345 | 346 | Since this is just for the demo, we are adding `Full Access`, but its **NOT** recommended for production environment. We should always follow the *least privilege* principle to grant access to any user/resource. 
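If you want something tighter than `Full Access`, one option is to attach an inline policy to the Lambda execution role that only covers the resources this function actually touches (the outgoing Kinesis stream, the DynamoDB table, the SNS topic, and CloudWatch metrics). The sketch below shows one way to do that from the CLI; the role name, account ID, and region are placeholders/assumptions, not values from this walkthrough.

```bash
# Attach a scoped inline policy to the Lambda execution role
# NOTE: role name, account ID, and region are placeholders; the Lambda also needs
# basic CloudWatch Logs permissions (e.g. the AWSLambdaBasicExecutionRole managed policy)
cat > lambda-least-privilege.json <<'EOF'
{
  "Version": "2012-10-17",
  "Statement": [
    { "Effect": "Allow",
      "Action": ["kinesis:GetRecords", "kinesis:GetShardIterator", "kinesis:DescribeStream", "kinesis:ListStreams"],
      "Resource": "arn:aws:kinesis:us-east-1:<ACCOUNT_ID>:stream/ecommerce-raw-user-activity-stream-2" },
    { "Effect": "Allow",
      "Action": ["dynamodb:PutItem", "dynamodb:BatchWriteItem"],
      "Resource": "arn:aws:dynamodb:us-east-1:<ACCOUNT_ID>:table/ddb-ecommerce-tab-1" },
    { "Effect": "Allow",
      "Action": ["sns:Publish"],
      "Resource": "arn:aws:sns:us-east-1:<ACCOUNT_ID>:ecomm-user-high-severity-incidents" },
    { "Effect": "Allow",
      "Action": ["cloudwatch:PutMetricData"],
      "Resource": "*" }
  ]
}
EOF

aws iam put-role-policy \
    --role-name <LAMBDA_EXECUTION_ROLE_NAME> \
    --policy-name ecomm-lambda-least-privilege \
    --policy-document file://lambda-least-privilege.json
```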
347 | 348 | ![](img/img22-1.png) 349 | 350 | Let's create the a SNS Topic: 351 | 352 | - Open the **Amazon SNS** console 353 | - Click on **Create Topic** 354 | - Select the Type as `Standard` 355 | - Provide the Name as `ecomm-user-high-severity-incidents` 356 | - Click on **Create Topic** 357 | 358 | ![](img/img24-1.png) 359 | 360 | Let's create a DynamoDB table: 361 | 362 | - Open the **Amazon DynamoDB** console 363 | - Click on **Create table** 364 | - Create the table with the following details 365 | 366 | | Field | Value | 367 | | :-------- | :------- | 368 | | `Name` | `ddb-ecommerce-tab-1` | 369 | | `Partition Key` | `ddb_partition_key` | 370 | | `Secondary Key` | `ddb_sort_key` | 371 | 372 | ![](img/img25-1.png) 373 | 374 | Now, we can add the environment variables which are needed for the Lambda Function, these environment variables are used in the [`lambda function code`](code/serverless-app/lambda_function.py) 375 | 376 | ![](img/img23.png) 377 | 378 | Following are the environment variables: 379 | 380 | | Key | Value | 381 | | :-------- | :------- | 382 | | `cloudwatch_metric` | `ecomm-user-high-volume-events` | 383 | | `cloudwatch_namespace` | `ecommerce-namespace-1` | 384 | | `dynamodb_control_table` | `ddb-ecommerce-tab-1` | 385 | | `topic_arn` | `` | 386 | 387 | 388 | ![](img/img26-1.png) 389 | 390 | ## Show time 391 | 392 | ![](img/img27.png) 393 | 394 | So, now we are all done with the implementation and it's time to start generating the traffic using the `python script` which we created earlier, and see everything in action!! 395 | 396 | ```bash 397 | cd build-a-managed-analytics-platform-for-ecommerce-business 398 | 399 | python code/ecomm-simulation-app/stream-data-app-simulation.py 400 | HttpStatusCode: 200 , electronics.smartphone 401 | HttpStatusCode: 200 , appliances.sewing_machine 402 | HttpStatusCode: 200 , 403 | HttpStatusCode: 200 , appliances.kitchen.washer 404 | HttpStatusCode: 200 , electronics.smartphone 405 | HttpStatusCode: 200 , computers.notebook 406 | HttpStatusCode: 200 , computers.notebook 407 | HttpStatusCode: 200 , 408 | HttpStatusCode: 200 , 409 | HttpStatusCode: 200 , electronics.smartphone 410 | HttpStatusCode: 200 , furniture.living_room.sofa 411 | 412 | ``` 413 | 414 | We can also monitor this traffic using the **Apache Flink Dashboard** 415 | 416 | - Open the **Amazon Kinesis Application** dashboard 417 | - Select the Application, `ecomm-streaming-app-v1-sql-flink-ecomm-notebook-1-2HFDAA9HY` 418 | - Click on `Open Apache Flink dashboard` 419 | 420 | ![](img/img28-1.png) 421 | 422 | Once you are on the `Open Apache Flink dashboard` 423 | 424 | - Click on `Running Jobs` > `Job Name` which is running 425 | 426 | ![](img/img29.png) 427 | 428 | And finally, we can also see all the details of the users which are classified as a DDoS attack by the Flink Application in the `DynamoDB` table. 429 | 430 | ![](img/img30-1.png) 431 | 432 | You can let the simulator run for next 5-10 mins and can explore and monitor all the components we have built in this whole data pipeline. 433 | 434 | ## Summary 435 | 436 | In this blog post, we built an e-commerce analytical platform which can help analyze the data in real time. 437 | 438 | We used a `python` script to simulate the real traffic using the dataset, used Amazon Kinesis as the incoming stream of data. 
And that data is being analyzed by Amazon Kinesis Data Analytics using Apache Flink using SQL, which involves detecting Distributed denial of service (DDoS) and bot attacks using AWS Lambda, DynamoDB, CloudWatch, and AWS SNS. 439 | 440 | In the second part of this blog series, we will dive deep and build the batch processing pipeline and build a dashboard using Amazon QuickSight, which will help us to get more insights about users. It will help us to know details like, who visits the ecommerce website more frequently, which are the top and bottom selling products, which are the top brands, and so on. 441 | -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/code/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/code/.DS_Store -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/code/ecomm-simulation-app/stream-data-app-simulation.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import csv 3 | import json 4 | from time import sleep 5 | from datetime import datetime 6 | 7 | # S3 buckect details (UPDATE THIS ) 8 | BUCKET_NAME = "ecommerce-raw-us-east-1-dev" 9 | KEY = "ecomm_user_activity_sample/202019-Nov-sample.csv" 10 | 11 | # AWS Settings 12 | s3 = boto3.client('s3', region_name='us-east-1') 13 | s3_resource = boto3.resource('s3', region_name='us-east-1') 14 | kinesis_client = boto3.client('kinesis', region_name='us-east-1') 15 | 16 | # Kinesis Details 17 | kinesis_stream_name = 'ecommerce-raw-user-activity-stream-1' 18 | streaming_partition_key = 'category_id' 19 | 20 | # Function can be converted to Lambda; 21 | # i.e. by iterating the S3-put events records; e.g. 
record['s3']['bucket']['name'] 22 | def stream_data_simulator(input_s3_bucket, input_s3_key): 23 | s3_bucket = input_s3_bucket 24 | s3_key = input_s3_key 25 | 26 | # Read CSV Lines and split the file into lines 27 | csv_file = s3_resource.Object(s3_bucket, s3_key) 28 | s3_response = csv_file.get() 29 | lines = s3_response['Body'].read().decode('utf-8').split('\n') 30 | 31 | for row in csv.DictReader(lines): 32 | try: 33 | # Convert to JSON, to make it easier to work in Kinesis Analytics 34 | line_json = json.dumps(row) 35 | json_load = json.loads(line_json) 36 | 37 | # Adding fake txn ts: 38 | json_load['txn_timestamp'] = datetime.now().isoformat() 39 | # print(json_load) 40 | 41 | # Write to Kinesis Streams: 42 | response = kinesis_client.put_record(StreamName=kinesis_stream_name, Data=json.dumps(json_load, indent=4), 43 | PartitionKey=str(json_load[streaming_partition_key])) 44 | # response['category_code'] = json_load['category_code'] 45 | print('HttpStatusCode:', response['ResponseMetadata']['HTTPStatusCode'], ', ', json_load['category_code']) 46 | # print(response) 47 | 48 | # Adding a temporary pause, for demo-purposes: 49 | sleep(0.5) 50 | 51 | except Exception as e: 52 | print('Error: {}'.format(e)) 53 | 54 | 55 | # Run stream: 56 | for i in range(0, 5): 57 | stream_data_simulator(input_s3_bucket=BUCKET_NAME, 58 | input_s3_key=KEY) 59 | -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/code/flink-app/sql-flink-ecomm-notebook-1.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n## Streaming Demo: Apache Flink - Using SQL", 5 | "user": "anonymous", 6 | "dateUpdated": "2022-10-12T19:28:25+0000", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "
\nStreaming Demo: Apache Flink - Using SQL\n\n
" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1665602789959_1871698761", 40 | "id": "paragraph_1644079724213_1071705746", 41 | "dateCreated": "2022-10-12T19:26:29+0000", 42 | "status": "FINISHED", 43 | "focus": true, 44 | "$$hashKey": "object:169", 45 | "dateFinished": "2022-10-12T19:28:27+0000", 46 | "dateStarted": "2022-10-12T19:28:25+0000" 47 | }, 48 | { 49 | "text": "%md\n#### Data pipeline: Kinesis Stream --> KDA and Apache Flink --> Kinesis Stream", 50 | "user": "anonymous", 51 | "dateUpdated": "2022-10-12T19:26:29+0000", 52 | "progress": 0, 53 | "config": { 54 | "tableHide": false, 55 | "editorSetting": { 56 | "language": "markdown", 57 | "editOnDblClick": true, 58 | "completionKey": "TAB", 59 | "completionSupport": false 60 | }, 61 | "colWidth": 12, 62 | "editorMode": "ace/mode/markdown", 63 | "fontSize": 9, 64 | "editorHide": true, 65 | "results": {}, 66 | "enabled": true 67 | }, 68 | "settings": { 69 | "params": {}, 70 | "forms": {} 71 | }, 72 | "results": { 73 | "code": "SUCCESS", 74 | "msg": [ 75 | { 76 | "type": "HTML", 77 | "data": "
\nData pipeline: Kinesis Stream –> KDA and Apache Flink –> Kinesis Stream\n\n
" 78 | } 79 | ] 80 | }, 81 | "apps": [], 82 | "runtimeInfos": {}, 83 | "progressUpdateIntervalMs": 500, 84 | "jobName": "paragraph_1665602789960_1859361410", 85 | "id": "paragraph_1644419083868_1242358902", 86 | "dateCreated": "2022-10-12T19:26:29+0000", 87 | "status": "READY", 88 | "$$hashKey": "object:170" 89 | }, 90 | { 91 | "text": "%flink.ssql\n\n/*Option 'IF NOT EXISTS' can be used, to protect the existing Schema */\nDROP TABLE IF EXISTS ecomm_user_activity_stream_1;\n\nCREATE TABLE ecomm_user_activity_stream_1 (\n `event_time` VARCHAR(30), \n `event_type` VARCHAR(30), \n `product_id` BIGINT, \n `category_id` BIGINT, \n `category_code` VARCHAR(30), \n `brand` VARCHAR(30), \n `price` DOUBLE, \n `user_id` BIGINT, \n `user_session` VARCHAR(30),\n `txn_timestamp` TIMESTAMP(3),\n WATERMARK FOR txn_timestamp as txn_timestamp - INTERVAL '10' SECOND \n)\nPARTITIONED BY (category_id)\nWITH (\n 'connector' = 'kinesis',\n 'stream' = 'ecommerce-raw-user-activity-stream-1',\n 'aws.region' = 'us-east-1',\n 'scan.stream.initpos' = 'LATEST',\n 'format' = 'json',\n 'json.timestamp-format.standard' = 'ISO-8601'\n);\n\n\n/*Option 'IF NOT EXISTS' can be used, to protect the existing Schema */\nDROP TABLE IF EXISTS ecomm_user_activity_stream_2;\n\nCREATE TABLE ecomm_user_activity_stream_2 (\n `user_id` BIGINT, \n `num_actions_per_watermark` BIGINT\n)\nWITH (\n 'connector' = 'kinesis',\n 'stream' = 'ecommerce-raw-user-activity-stream-2',\n 'aws.region' = 'eu-west-1',\n 'format' = 'json',\n 'json.timestamp-format.standard' = 'ISO-8601'\n);\n\n/* Inserting aggregation into Stream 2*/\ninsert into ecomm_user_activity_stream_2\nselect user_id, count(1) as num_actions_per_watermark\nfrom ecomm_user_activity_stream_1\ngroup by tumble(txn_timestamp, INTERVAL '10' SECOND), user_id\nhaving count(1) > 1;\n", 92 | "user": "anonymous", 93 | "dateUpdated": "2022-10-12T19:51:46+0000", 94 | "progress": 0, 95 | "config": { 96 | "editorSetting": { 97 | "language": "sql", 98 | "editOnDblClick": false, 99 | "completionKey": "TAB", 100 | "completionSupport": true 101 | }, 102 | "colWidth": 12, 103 | "editorMode": "ace/mode/sql", 104 | "fontSize": 9, 105 | "editorHide": false, 106 | "results": {}, 107 | "enabled": true 108 | }, 109 | "settings": { 110 | "params": {}, 111 | "forms": {} 112 | }, 113 | "results": { 114 | "code": "SUCCESS", 115 | "msg": [ 116 | { 117 | "type": "TEXT", 118 | "data": "Table has been dropped.\nTable has been created.\nTable has been dropped.\nTable has been created.\n" 119 | } 120 | ] 121 | }, 122 | "apps": [], 123 | "runtimeInfos": {}, 124 | "progressUpdateIntervalMs": 500, 125 | "jobName": "paragraph_1665602789960_2041678241", 126 | "id": "paragraph_1644099104676_1577171503", 127 | "dateCreated": "2022-10-12T19:26:29+0000", 128 | "status": "FINISHED", 129 | "$$hashKey": "object:171", 130 | "dateFinished": "2022-10-12T19:39:04+0000", 131 | "dateStarted": "2022-10-12T19:38:20+0000" 132 | } 133 | ], 134 | "name": "sql-flink-ecomm-notebook-1", 135 | "id": "2HFDAA9HY", 136 | "defaultInterpreterGroup": "flink", 137 | "version": "0.9.0", 138 | "noteParams": { 139 | "kda.deploy-as-application.app-name": "ecomm-streaming-app-v1-sql-flink-ecomm-notebook-1-2HFDAA9HY" 140 | }, 141 | "noteForms": {}, 142 | "angularObjects": {}, 143 | "config": { 144 | "isZeppelinNotebookCronEnable": false, 145 | "looknfeel": "default", 146 | "personalizedMode": "false" 147 | }, 148 | "info": {}, 149 | "path": "/sql-flink-ecomm-notebook-1" 150 | } -------------------------------------------------------------------------------- 
/build-a-managed-analytics-platform-for-ecommerce-business/code/serverless-app/lambda_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from aws_kinesis_agg.deaggregator import iter_deaggregate_records 3 | from datetime import datetime 4 | import base64 5 | import json 6 | import boto3 7 | import os 8 | 9 | # OS input variables: 10 | cloudwatch_namespace = os.environ['cloudwatch_namespace'] 11 | cloudwatch_metric = os.environ['cloudwatch_metric'] 12 | dynamodb_control_table = os.environ['dynamodb_control_table'] 13 | topic_arn = os.environ['topic_arn'] 14 | 15 | # AWS Services 16 | cloudwatch = boto3.client('cloudwatch', region_name='us-east-1') 17 | sns = boto3.client('sns', region_name='us-east-1') 18 | dynamodb = boto3.resource('dynamodb', region_name='us-east-1') 19 | db_table = dynamodb.Table(dynamodb_control_table) 20 | 21 | 22 | def lambda_handler(event, context): 23 | raw_kinesis_records = event['Records'] 24 | record_count = 0 25 | 26 | # Using DynamoDB Batch Writer. Source: http://bit.ly/2ZSSdIz 27 | with db_table.batch_writer() as batch_writer: 28 | 29 | # Deaggregate all records using a generator function 30 | for record in iter_deaggregate_records(raw_kinesis_records): 31 | 32 | try: 33 | # Kinesis data in Python Lambdas is base64 encoded 34 | payload = base64.b64decode(record['kinesis']['data']) 35 | json_document = json.loads(payload.decode('utf-8')) 36 | 37 | # Input Data extraction 38 | input_user_id = str(json_document['user_id']) 39 | input_num_actions_per_watermark = str(json_document['num_actions_per_watermark']) 40 | 41 | # DYNAMODB LAYER: 42 | # - Add time as Monitor Control and Write micro-batch to DynamoDB: 43 | json_document['ddb_partition_key'] = 'userid#{}#appserver#{}'.format(input_user_id, 'app-server-tomcat-123') 44 | json_document['ddb_sort_key'] = int(datetime.utcnow().timestamp()) 45 | ddb_response = batch_writer.put_item(Item=json_document) 46 | print('DynamoDB API Response: {}'.format(ddb_response)) 47 | 48 | # CLOUDWATCH LAYER: 49 | # - Note: this can be dynamically built or fetched from properties file, 50 | # without hard-coding KEY-VALUE pairs. 51 | dimension_name_1 = 'user_id' 52 | dimension_name_2 = 'num_actions_per_watermark' 53 | cloudwatch_response = cloudwatch.put_metric_data( 54 | MetricData=[ 55 | { 56 | 'MetricName': cloudwatch_metric, 57 | 'Dimensions': [ 58 | { 59 | 'Name': dimension_name_1, 60 | 'Value': input_user_id 61 | }, 62 | { 63 | 'Name': dimension_name_2, 64 | 'Value': input_num_actions_per_watermark 65 | }, 66 | ], 67 | 'Unit': 'Count', 68 | 'Value': 1, 69 | 'StorageResolution': 1 70 | }, 71 | ], 72 | Namespace=cloudwatch_namespace 73 | ) 74 | 75 | # Print Cloudwatch response: 76 | # - Implement real Logging for Production; e.g. logging.getLogger().setLevel(logging.INFO) 77 | print('CloudWatch API Response: {}'.format(cloudwatch_response)) 78 | 79 | # DDoS NOTIFICATIONS LAYER: Look for possible BOTs or attacks in stream: 80 | if int(input_num_actions_per_watermark) > 10: 81 | sns_response = sns.publish(TopicArn=topic_arn, Message=str(json_document), 82 | Subject='Possible DDoS detected, by user_id {} with a number of attempts of : {}/window'.format(input_user_id, input_num_actions_per_watermark)) 83 | print('Email notification sent, due high severity incident. API Response: {}'.format(sns_response)) 84 | 85 | except Exception as e: 86 | # - Implement real Logging for Production; e.g. 
logging.getLogger().setLevel(logging.INFO) 87 | print('Error when processing stream:') 88 | print(e) 89 | 90 | # Print response and increment counter 91 | record_count += 1 92 | 93 | return 'Successfully processed {} records.'.format(record_count) 94 | -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/.DS_Store -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img-flink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img-flink.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img10-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img10.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img11.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img12.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img13.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img13.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img14.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img15-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img15.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img16.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img17-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img17.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img18-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img18.png 
-------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img19-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img19.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img2-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img2.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img20.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img21-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img21.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img22-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img22-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img22.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img22.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img23.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img24-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img24-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img24.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img25-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img25.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img26-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img26-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img26.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img27.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img27.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img28-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img28-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img28.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img29.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img3-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img3.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img30-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img30-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img30.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img4.png 
-------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img5-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img6-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img7-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img8-1.png -------------------------------------------------------------------------------- /build-a-managed-analytics-platform-for-ecommerce-business/img/img9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/build-a-managed-analytics-platform-for-ecommerce-business/img/img9-1.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/.DS_Store -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Suman Debnath 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Creating an ETL Pipeline with Amazon EMR and Apache Spark 3 | 4 | Extract, Transform, and Load (or ETL) - sometimes called Ingest, Transform, and Export - is vital for building a robust data engineering pipeline for any organization. Essentially, if the ETL pipeline is designed and built using the right tools and services, it brings high value to any organization for both for batch and real-time processing. But designing and building such a pipeline is a time consuming task, and it requires different skillsets considering the number of tools and frameworks in this big data space. Luckily, it's pretty easy if you're using EMR and Spark. 5 | 6 | Batch ETL is a common use case across many organizations. And this tutorial will provide you with a starting point, which can help you to build more complex data pipelines in AWS using [Amazon EMR (Amazon Elastic MapReduce)](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-what-is-emr.html) and [Apache Spark](https://spark.apache.org/). Here's how to do it. 7 | 8 | We are going to use [PySpark](https://spark.apache.org/docs/latest/api/python/) to interact with the Spark cluster. PySpark allows you to write Spark applications using Python APIs. 9 | 10 | ## What you will learn 11 | 12 | In this guide, you will: 13 | - Create and set up an Amazon EMR cluster 14 | - Submit a PySpark job on EMR 15 | - Integrate Amazon EMR with Amazon S3 16 | 17 | Let's get started! 18 | 19 | ## Requirements 20 | 21 | Before Starting this guide, you will need: 22 | 23 | - An AWS account (if you don't yet have one, please create one and [set up your environment](https://aws.amazon.com/getting-started/guides/setup-environment/)) 24 | - An IAM user that has the access and create AWS resources. 25 | - Basic understanding of Python 26 | 27 | 28 | ## Use case and problem statement 29 | 30 | For this tutorial, let's assume you have a vendor who provides incremental sales data at the end of every month. The file arrives in S3 as a `CSV` file and it needs to be processed and made available to your data analysts for querying and analysis. 31 | 32 | ## Architecture 33 | 34 | To implement this data pipeline, we will use an EMR cluster with Spark as the distributed processing engine. We'll use S3 for storing the: 35 | 36 | - `RAW` data (which is the input and unprocessed data) 37 | - `CLEANSED` data (which is output and processed data) 38 | 39 | We need to build a data pipeline such that it will take this new sales file from the S3 bucket, process it with required transformations using Amazon EMR, and save the cleaned and transformed data into the target S3 bucket, which will be used later on for querying using Amazon Athena. 
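The architecture diagram below shows these components. If you would rather script the setup than click through the console steps in the implementation section, the core resources can be provisioned roughly as follows. This is a sketch only: the EMR release label and instance size are assumptions, and the bucket and key-pair names are the ones used later in this guide.

```bash
# Create the S3 bucket that will hold the RAW and CLEANSED data
# (bucket names must be globally unique; replace with your own)
aws s3 mb s3://etl-batch-emr-demo

# Launch a small EMR cluster with Spark
# NOTE: release label and instance type are assumptions; the key pair 'mykey-emr'
# must already exist (created in Step 1 below)
aws emr create-cluster \
    --name MyDemoEMRCluster \
    --release-label emr-6.15.0 \
    --applications Name=Spark \
    --instance-type m5.xlarge \
    --instance-count 3 \
    --ec2-attributes KeyName=mykey-emr \
    --use-default-roles
```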
40 | 41 | ![Img Architecture](images/Architecture.png) 42 | 43 | ## Implementation 44 | 45 | To implement our data processing pipeline, we first need to create an EMR cluster that will run our ETL jobs, an SSH key pair for connecting to the cluster's primary node, and an S3 bucket to store the raw and processed data; finally, we will submit our job to the cluster. 46 | 47 | ### Step 1: Create an EMR Cluster 48 | 49 | Before we create an EMR cluster, we need to create a `Key Pair`, which we will need later to access the EMR cluster's primary node. So let's do it. 50 | 51 | 1. Log in to your AWS account, navigate to the EC2 console, and click on the [**Key Pairs**](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/create-key-pairs.html) option in the left menu bar. Then click `Create Key Pair`. 52 | 53 | ![Key Pair 1](images/key_pair.png) 54 | 55 | 2. Provide a name (`mykey-emr`) for your key pair and click `Create Key Pair`. 56 | 57 | ![Key Pair 2](images/key_pair_2.png) 58 | 59 | 3. Now we can go ahead and create an `Amazon EMR cluster`. For that, navigate to Amazon EMR in the console and click **Create Cluster** to create an EMR cluster. 60 | 61 | ![emr cluster 1](images/emr_1-new.png) 62 | 63 | 4. Provide the `Cluster name` `MyDemoEMRCluster` for your EMR cluster, and select the following: 64 | - Select the **latest release** of EMR under the **Software configuration** section 65 | - Select **Spark** under the **Application bundle** section 66 | - Select the right **EC2 key pair** (which you created in the previous step) under the **Security and access** section 67 | 68 | Keep everything else as the default and click on **Create cluster**. This will create a cluster with three instances. 69 | 70 | ![emr cluster 2](images/emr_2-new.png) 71 | 72 | 5. Cluster creation will take some time, and after a couple of minutes, you will see that the cluster is **up and running** with the state `Waiting` (which means the cluster is now ready and waiting to execute any ETL job). 73 | 74 | ![emr cluster 3](images/emr_3-new.png) 75 | 76 | ### Step 2: Create an Amazon S3 bucket 77 | 78 | Now we will create an Amazon S3 bucket and create two sub-folders within it, which will be used to store the `RAW` and `CLEANSED` data. 79 | 80 | 1. Navigate to the Amazon S3 console and click on **Create Bucket**. 81 | 82 | ![S3_1](images/s3_1-new.png) 83 | 84 | 2. Create a **bucket** (e.g. `etl-batch-emr-demo`). 85 | 86 | ![S3_2](images/s3_2-new.png) 87 | 88 | 3. Once the bucket is created, create two sub-folders named: 89 | - `cleaned_data` 90 | - `raw_data` 91 | 92 | ![S3_3](images/s3_3.png) 93 | 94 | 4. Upload the [sales dataset CSV file](https://github.com/aws-samples/data-engineering-on-aws/blob/main/dataset/SalesData.csv) to the bucket under the folder `raw_data`. 95 | 96 | ![Upload raw data](images/upload_csv-new.png) 97 | 98 | ### Step 3: Submit the PySpark job 99 | 100 | Now that we have the dataset uploaded to S3, it's time to submit the PySpark job from our EMR cluster. 101 | 102 | 1. Sign in to the AWS Management Console, and open the [Amazon EMR console](https://console.aws.amazon.com/emr/). 103 | 104 | 2. Under **EMR on EC2** in the left navigation pane, choose **Clusters**, and then select the `MyDemoEMRCluster` cluster where you want to retrieve the public DNS name. 105 | 106 | 3. Note the **Primary node public DNS** value in the Summary section of the cluster details page. 107 | 108 | ![emr_4](images/emr_4-new.png) 109 | 110 | 4. 
SSH to the EMR cluster's primary node from your terminal: 111 | 112 | ```bash 113 | ssh -i "mykey-emr.pem" hadoop@ec2-18-219-203-79.us-east-2.compute.amazonaws.com 114 | ``` 115 | 116 | 5. Copy the PySpark code [`etl-job.py`](https://github.com/aws-samples/data-engineering-on-aws/blob/main/create-an-etl-pipeline-apache-spark/emr-etl-job.py) to the primary node's home directory, make the following changes, and save the file: 117 | 118 | - `S3_INPUT_DATA` = 's3://' 119 | - `S3_OUTPUT_DATA` = 's3://' 120 | 121 | 6. Submit the `PySpark job` and wait for the job to complete before proceeding. 122 | 123 | ```bash 124 | sudo spark-submit etl-job.py 125 | ``` 126 | 127 | 7. Once the job completes, check the S3 bucket under the folder `cleaned_data`; you will see the newly transformed and processed data in Parquet format. 128 | 129 | ![s3 cleaned data](images/s3_cleaned_data.png) 130 | 131 | ### Step 4: Validating the output using Amazon Athena 132 | 133 | Now the `cleansed` data is available in Amazon S3 in Parquet format, but to make it more consumable for data analysts or data scientists, it would be great if we could enable querying the data through SQL by making it available as a database table. 134 | 135 | To make that integration, we can follow a two-step approach: 136 | 1. We need to run a Glue crawler to create an AWS Glue Data Catalog table on top of the S3 data. 137 | 2. Once that is done, we can run a query in Amazon Athena to validate the output. 138 | 139 | ### Step 5: Creating an AWS Glue Data Catalog 140 | 141 | 1. Navigate to the AWS Glue crawler console and click on **Create Crawler**. 142 | 143 | ![glue crawler](images/glue_ui.png) 144 | 145 | 2. Give a **name** to the Glue crawler (`my-crawler-1`). 146 | 147 | ![glue crawler](images/glue_crawler_1.png) 148 | 149 | 3. Add the **data source** as the S3 bucket where you have your cleansed and processed data (`s3://etl-batch-emr-demo/cleaned_data`). 150 | 151 | ![glue crawler](images/glue_crawler_2.png) 152 | 153 | 4. Create an **IAM role** (`AWSGlueServiceRole-default`) and attach it to the crawler. Create the role and attach the following policies (for more details, you can refer to [this guide](https://docs.aws.amazon.com/glue/latest/dg/crawler-prereqs.html)): 154 | 155 | - The AWSGlueServiceRole AWS managed policy, which grants the required permissions on the Data Catalog 156 | 157 | - An inline policy that grants permissions on the crawler's data source (the `cleaned_data` S3 location added in the previous step) 158 | 159 | ![glue crawler](images/glue_crawler_3.png) 160 | 161 | 5. Create a **database** by clicking on **Add database**, then select it (`my_demo_db`) from the dropdown menu. 162 | 163 | ![glue crawler](images/glue_crawler_4.png) 164 | 165 | 6. Review and verify all the details and click on **Create crawler**. 166 | 167 | ![glue crawler](images/glue_crawler_5.png) 168 | 169 | 7. Once the crawler is created, select the crawler and click on **Run**. 170 | 171 | ![glue crawler](images/glue_run.png) 172 | 173 | 8. Once the crawler finishes its run, you will see the detected tables. 174 | 175 | ![glue crawler](images/glue_run_complete.png) 176 | 177 | Now that we have the Glue Data Catalog table created, we can navigate to Amazon Athena to query the data using SQL. 178 | 179 | Until now, we have extracted the data from Amazon S3, and then transformed it into Parquet format 180 | using a PySpark job on Amazon EMR. Finally, we will use that cleaned data for analysis using Amazon Athena.
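If you would rather automate Step 5 instead of clicking through the console, the same database and crawler can be created and started with `boto3`. This is only a sketch: the names below (`my_demo_db`, `my-crawler-1`, the IAM role, and the S3 path) mirror the walkthrough above and should be adjusted to your own setup, and it assumes the IAM role and the `cleaned_data` output already exist:

```python
import boto3

glue = boto3.client("glue", region_name="us-east-2")  # example region

# Create the Data Catalog database used by Athena
glue.create_database(DatabaseInput={"Name": "my_demo_db"})

# Create a crawler pointing at the cleansed Parquet data
glue.create_crawler(
    Name="my-crawler-1",
    Role="AWSGlueServiceRole-default",  # role with Glue + S3 read permissions (Step 5, item 4)
    DatabaseName="my_demo_db",
    Targets={"S3Targets": [{"Path": "s3://etl-batch-emr-demo/cleaned_data/"}]},
)

# Run the crawler; once it finishes, the detected table is queryable from Athena
glue.start_crawler(Name="my-crawler-1")
```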
181 | 182 | ### Step 6: Querying output data using Amazon Athena standard SQL 183 | 184 | 1. Open the Athena query editor. You can keep the Data Source as the default `AwsDataCatalog`, select `my_demo_db` for the Database (as shown in the screenshot), and run the following query. 185 | 186 | ```sql 187 | SELECT * FROM "my_demo_db"."cleaned_data" limit 10; 188 | ``` 189 | 190 | ![athena](images/athena_q1.png) 191 | 192 | 2. Now you can perform other SQL queries to analyze the data. For example, if you would like to know the `forcast_monthly_revenue` for each region and segment, you can run this: 193 | 194 | ```sql 195 | SELECT 196 | region, 197 | segment, 198 | SUM(forecasted_monthly_revenue) as forcast_monthly_revenue 199 | FROM "my_demo_db"."cleaned_data" 200 | GROUP BY segment, region; 201 | ``` 202 | ![athena](images/athena_q2.png) 203 | 204 | 205 | ## Clean up resources 206 | 207 | Now that you’ve finished this walk-through, you can delete all the following resources to avoid incurring unexpected costs: 208 | 209 | - Delete the **EMR Cluster** 210 | 211 | ![emr terminate](images/emr_terminate.png) 212 | 213 | - Delete the **Amazon S3 bucket** 214 | 215 | ```bash 216 | aws s3 rb s3:// --force 217 | ``` 218 | 219 | - Delete the **Glue Database** 220 | 221 | ![glue db delete](images/glue_db_delete.png) 222 | 223 | ## Conclusion 224 | 225 | Congratulations! You have finished the tutorial on creating an ETL pipeline with Amazon EMR and Apache Spark. 226 | 227 | In this tutorial, we learned how to build an ETL pipeline, which can be applied in different batch processing use cases, like e-commerce sales data analysis. We learned how to extract the data from S3 and then transform it based on our requirements by using a simple PySpark job on Amazon EMR. And then finally, we analyzed the data using SQL via Amazon Athena. If you're interested in learning more about EMR and Spark-based ETL, you may like to check out this [workshop](https://catalog.us-east-1.prod.workshops.aws/workshops/c86bd131-f6bf-4e8f-b798-58fd450d3c44/en-US/spark-etl). -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/emr-etl-job.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import functions as F 3 | 4 | S3_INPUT_DATA = '' 5 | S3_OUTPUT_DATA = '' 6 | 7 | 8 | def main(): 9 | 10 | spark = SparkSession.builder.appName("My Demo ETL App").getOrCreate() 11 | spark.sparkContext.setLogLevel('ERROR') 12 | 13 | # Read the raw CSV from S3 and replace spaces in column names with underscores 14 | df = spark.read.option("Header", True).option("InferSchema", True).csv(S3_INPUT_DATA) 15 | 16 | replacements = {c:c.replace(' ','_') for c in df.columns if ' ' in c} 17 | final_df = df.select([F.col(c).alias(replacements.get(c, c)) for c in df.columns]) 18 | 19 | print(f"Total no. 
of records in the source data set is : {final_df.count()}") 20 | 21 | try: 22 | final_df.write.mode('overwrite').parquet(S3_OUTPUT_DATA) 23 | print('The cleaned data is uploaded') 24 | except: 25 | print('Something went wrong, please check the logs :P') 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/Architecture.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/athena_q1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/athena_q1.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/athena_q2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/athena_q2.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_1-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_1-new.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_1.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_2-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_2-new.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_2.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_3-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_3-new.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_3.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_4-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_4-new.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_4.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/emr_terminate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/emr_terminate.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_crawler_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_crawler_1.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_crawler_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_crawler_2.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_crawler_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_crawler_3.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_crawler_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_crawler_4.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_crawler_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_crawler_5.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_db_delete.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_db_delete.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_run.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_run_complete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_run_complete.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/glue_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/glue_ui.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/key_pair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/key_pair.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/key_pair_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/key_pair_2.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/s3_1-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/s3_1-new.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/s3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/s3_1.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/s3_2-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/s3_2-new.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/s3_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/s3_2.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/s3_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/s3_3.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/s3_cleaned_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/s3_cleaned_data.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/upload_csv-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/upload_csv-new.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/images/upload_csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/create-an-etl-pipeline-apache-spark/images/upload_csv.png -------------------------------------------------------------------------------- /create-an-etl-pipeline-apache-spark/sql_queries.sql: -------------------------------------------------------------------------------- 1 | 2 | --- Query 1 3 | SELECT * FROM "my_demo_db"."cleaned_data" limit 10; 4 | 5 | --- Query 2 6 | SELECT 7 | region, 8 | segment, 9 | SUM(forecasted_monthly_revenue) as forcast_monthly_revenue 10 | FROM "my_demo_db"."cleaned_data" 11 | GROUP BY segment, region; 12 | 13 | -------------------------------------------------------------------------------- /ml-sagemaker-studio/01_glue_data_prep.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "18df7b03-92b3-4da7-a807-34b9941e120e", 6 | "metadata": {}, 7 | "source": [ 8 | "# Serverless Data Prep with Glue Interactive Sessions\n", 9 | "We can scale our data preparation using serverless Spark or Ray with native integration with AWS Glue Interactive Sessions" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "beeac588-afea-4e9f-970f-14194a1432bd", 15 | "metadata": {}, 16 | "source": [ 17 | "### What is AWS Glue\n", 18 | "\n", 19 | "AWS Glue is a serverless data integration service that makes it easier to discover, prepare, move, and integrate data from multiple sources for analytics, machine learning (ML), and application development.\n", 20 | "\n", 21 | "![img](https://d1.awsstatic.com/reInvent/reinvent-2022/glue/Product-Page-Diagram_AWS-Glue_for-Ray%402x.f34b47cf0280c7d843ea457b704ea512bebd91d5.png)\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "4182e44c-be7d-414b-beb8-1c101a3950ef", 27 | "metadata": { 28 | "tags": [] 
29 | }, 30 | "source": [ 31 | "## Objective \n", 32 | "\n", 33 | "Want to predict the amount of NO2 in the area based on weather conditions\n", 34 | "\n", 35 | "![img](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b1/Origins_of_acid_rain.svg/1280px-Origins_of_acid_rain.svg.png)\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "9abbd41d-d861-4cd1-9261-4f5c62ec2e39", 41 | "metadata": {}, 42 | "source": [ 43 | "### Datasets in our Example\n", 44 | "\n", 45 | "[OpenAQ Physical Air Quality Data](https://registry.opendata.aws/openaq/):\n", 46 | "* Global, aggregated physical air quality data from public data sources provided by government, research-grade and other sources.\n", 47 | "* 42GB of Data\n", 48 | "\n", 49 | "\n", 50 | "[NOAA Global Surface Summary of Day](https://registry.opendata.aws/noaa-gsod/):\n", 51 | "* Global summary of day data for 18 surface meteorological elements are derived from the synoptic/hourly observations contained in USAF DATSAV3 Surface data and Federal Climate Complex Integrated Surface Hourly (ISH).\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "id": "d6e4cfd1-7fa2-4adf-886c-a723535d2bd4", 57 | "metadata": {}, 58 | "source": [ 59 | "### Set Configurations" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "89f95a45-57af-47d0-a599-076d9a5ba060", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "%help" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "d6df5863-fe16-4ad5-9afb-546ab55a0003", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "%session_id_prefix air-analysis\n", 80 | "%glue_version 3.0\n", 81 | "%number_of_workers 10\n", 82 | "%idle_timeout 180" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "fd4d2e55-39d2-4e88-9fd7-f114786de7c0", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "print(spark.version)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "4226c44e-4223-4eed-92e1-c320adbd3279", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "bucket = <\"YOUR_S3_BUCKET\">" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "218e10f0-82f0-464a-b15a-40431ae5f074", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "schema_df = spark.read.json(\"s3://openaq-fetches/realtime-gzipped/2022-01-05/1641340870.ndjson.gz\")\n", 113 | "df = spark.read.schema(schema_df.schema).json(\"s3://openaq-fetches/realtime-gzipped/20*\")\n", 114 | "df.show()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "7f38c59c-e708-445b-96c3-12ac13f6a62f", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "from pyspark.sql.functions import split, lower, to_date\n", 125 | "\n", 126 | "yr_split_args = (df.date.utc, \"-\", 0)\n", 127 | "dfSea = df.filter(lower((df.city)).contains('seattle')).filter(df.parameter == \"no2\").withColumn(\"year\", split(*yr_split_args)[0]).cache()\n", 128 | "dfSea.show(truncate=False)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "fa5805f8-a718-457a-92e2-faf79aea90c5", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "dfNoAvg = dfSea.withColumn(\"ymd\", to_date(dfSea.date.utc)).groupBy(\"ymd\").avg(\"value\").withColumnRenamed(\"avg(value)\", \"no2_avg\")\n", 139 | "dfNoAvg.show()" 140 | ] 141 | }, 142 | { 143 | 
"cell_type": "code", 144 | "execution_count": null, 145 | "id": "322044bb-6228-428c-a088-b694be8d95e0", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# Write to S3\n", 150 | "dfNoAvg.coalesce(1).write.parquet(f\"s3://{bucket}/subset-aggregate-no2.parquet\")" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "f6a5f462-59f4-4883-a03e-ec996ccef4b5", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from pyspark.sql.functions import min, max, year\n", 161 | "year_min, year_max = dfNoAvg.select(year(min(\"ymd\")), year(max(\"ymd\"))).first()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "c2510a7d-1398-4aea-b66c-85d183064e5b", 167 | "metadata": { 168 | "tags": [] 169 | }, 170 | "source": [ 171 | "## Weather" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "aa4a7efd-097a-4481-a2e3-85bec5e82def", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from pyspark.sql.types import DoubleType\n", 182 | "from pyspark.sql import functions as F\n", 183 | "\n", 184 | "# Scope to Seattle, WA, USA\n", 185 | "longLeft, latBottom, longRight, latTop = [-122.459696,47.481002,-122.224433,47.734136]\n", 186 | "\n", 187 | "dfSchema = spark.read.csv(\"s3://noaa-gsod-pds/2022/32509099999.csv\", header=True, inferSchema=True)\n", 188 | "\n", 189 | "# We read our first year, then union the rest of the years :)\n", 190 | "def read_year(year):\n", 191 | " return spark.read.csv(f\"s3://noaa-gsod-pds/{year}/\", header=True, schema=dfSchema.schema)\n", 192 | "\n", 193 | "year_range = range(year_min, year_max+1)\n", 194 | "df = read_year(year_range[0])\n", 195 | "for year in year_range[1:]:\n", 196 | " df = df.union(read_year(year))\n", 197 | "\n", 198 | "df = df \\\n", 199 | " .withColumn('LATITUDE', df.LATITUDE.cast(DoubleType())) \\\n", 200 | " .withColumn('LONGITUDE', df.LONGITUDE.cast(DoubleType()))\n", 201 | "\n", 202 | "seadf = df \\\n", 203 | " .filter(df.LATITUDE >= latBottom) \\\n", 204 | " .filter(df.LATITUDE <= latTop) \\\n", 205 | " .filter(df.LONGITUDE >= longLeft) \\\n", 206 | " .filter(df.LONGITUDE <= longRight)\n", 207 | "\n", 208 | "# Rename columns so they're easier to read\n", 209 | "seafeatures = seadf.selectExpr(\"Date as date\", \"MAX as temp_max\", \"MIN as temp_min\", \"WDSP as wind_avg\", \"SLP as pressure_sea_level\", \"STP as pressure_station\", \"VISIB as visibility\")\n", 210 | "\n", 211 | "# Remove invalid readings\n", 212 | "no_data_mappings = [\n", 213 | " [\"temp_max\", 9999.9],\n", 214 | " [\"temp_min\", 9999.9],\n", 215 | " [\"wind_avg\", 999.9],\n", 216 | " [\"pressure_sea_level\", 9999.9],\n", 217 | " [\"pressure_station\", 9999.9],\n", 218 | " [\"visibility\", 999.9],\n", 219 | "]\n", 220 | "for [name, val] in no_data_mappings:\n", 221 | " seafeatures = seafeatures.withColumn(name, F.when(F.col(name)==val, None).otherwise(F.col(name)))\n", 222 | " \n", 223 | "# Now average each reading per day\n", 224 | "seafeatures = seafeatures.groupBy(\"date\").agg(*[F.mean(c).alias(c) for c in seafeatures.columns[1:]])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "c4bda4fe-bf6a-42ae-bd0d-278f9b9e669a", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "seafeatures.coalesce(1).write.parquet(f\"s3://{bucket}/subset-seattle-weather.parquet\")" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "id": "45f21d5f-ae5f-47d9-b1ad-5c0099774f07", 240 | "metadata": 
{}, 241 | "source": [ 242 | "# End the Session" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "e9bb0985-1454-4412-a543-7449c765f842", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | } 253 | ], 254 | "metadata": { 255 | "instance_type": "ml.t3.medium", 256 | "kernelspec": { 257 | "display_name": "Glue PySpark (SparkAnalytics 2.0)", 258 | "language": "python", 259 | "name": "conda-env-sm_glue_is-glue_pyspark__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/sagemaker-sparkanalytics-310-v1" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "python", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "Python_Glue_Session", 269 | "pygments_lexer": "python3" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 5 274 | } 275 | -------------------------------------------------------------------------------- /ml-sagemaker-studio/02_model_building.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e749dcc7-88cd-49f3-89ce-c94e8c7bbc21", 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "source": [ 10 | "# Collaborative Model Building" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 14, 16 | "id": "1837007e-0773-43c6-bdd1-b768addd31b4", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import sagemaker_datawrangler\n", 23 | "import matplotlib.pyplot as plt" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "bb7f6369-dfd3-4158-8f9f-b0f5ea929a99", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Download Parquet from S3 locally, or use awswrangler to read pandas df directly from S3\n", 34 | "no2_file = \"YOUR-FILE-LOCATION\"\n", 35 | "weather_file = \"YOUR-FILE-LOCATION\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 15, 41 | "id": "f8f0dec9-35ed-44f3-a9ac-c4c37b80eb17", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "df_no2 = pd.read_parquet(no2_file)\n", 46 | "df_weather = pd.read_parquet(weather_file)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 16, 52 | "id": "abbf8dd9-93a9-4ddb-b21c-f8a404b3792c", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "df_weather" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 7, 62 | "id": "6bf939f7-fb1e-4129-8ad4-0c9103ef41b9", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Pandas code generated by sagemaker_datawrangler\n", 67 | "output_df = df_weather.copy(deep=True)\n", 68 | "\n", 69 | "\n", 70 | "# Code to Replace with median for column: visibility to resolve warning: Missing values \n", 71 | "output_df['visibility']=output_df['visibility'].fillna(output_df['visibility'].median(skipna=True))\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 8, 77 | "id": "3de8be06-03c1-418c-9991-14b01b7501bd", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "df_no2['ymd'] = pd.to_datetime(df_no2['ymd'], infer_datetime_format=True)\n", 82 | "df_no2 = df_no2.set_index('ymd')\n", 83 | "\n", 84 | "idx = pd.date_range(df_no2.index.min(), df_no2.index.max())\n", 85 | "df_no2 = df_no2.reindex(idx, fill_value=None)\n", 86 | "df_no2 = df_no2.interpolate(method='time')" 87 | ] 88 | }, 89 | { 90 | 
"cell_type": "code", 91 | "execution_count": 9, 92 | "id": "b4a08b72-d20f-41f5-91bd-f9d5ec279c7b", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "output_df['date'] = pd.to_datetime(output_df['date'], infer_datetime_format=True)\n", 97 | "output_df = output_df.set_index('date').sort_index()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 10, 103 | "id": "52f3113d-ce9c-45b1-af33-0db9c2f40964", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "min_viable_date = max(df_no2.index.min(), output_df.index.min())\n", 108 | "max_viable_date = min(df_no2.index.max(), output_df.index.max())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 11, 114 | "id": "03cf96ed-3f7e-4466-97d7-aa1aea153a85", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "Merging dataframes between 2016-03-06 00:00:00 and 2022-11-14 00:00:00\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "print(f\"Merging dataframes between {min_viable_date} and {max_viable_date}\")\n", 127 | "\n", 128 | "comp_df = pd.merge(\n", 129 | " output_df[min_viable_date:max_viable_date],\n", 130 | " df_no2[min_viable_date:max_viable_date][['no2_avg']],\n", 131 | " left_index=True, right_index=True\n", 132 | ")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 12, 138 | "id": "746e2db9-d62e-4e6a-b5d6-79f0d2e85e6f", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "mydata = comp_df[['wind_avg','no2_avg']]\n", 143 | "\n", 144 | "x = mydata['wind_avg']\n", 145 | "y = mydata['no2_avg']\n", 146 | "plt.scatter(x, y)\n", 147 | "\n", 148 | "z = np.polyfit(x, y, 1)\n", 149 | "p = np.poly1d(z)\n", 150 | "plt.plot(x,p(x),\"r--\")\n", 151 | "\n", 152 | "plt.ylabel('NO2 Conc. 
ppm')\n", 153 | "plt.xlabel('Wind Speed (mph)')\n", 154 | "plt.show()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 13, 160 | "id": "e3beb531-1728-445f-a682-05b936f08b68", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# Drop the 1st row as NaN\n", 165 | "aq_df = comp_df.iloc[1:].copy()\n", 166 | "\n", 167 | "# Drop visibility as it didn't seem correlate much and has NaNs that break the training\n", 168 | "aq_df = aq_df.drop('visibility', 1)\n", 169 | "\n", 170 | "# Use the data from years 2016 up to 2020 as training, and the year 2021 as our candidate year for testing and validating our model.\n", 171 | "aq_train_df = aq_df[aq_df.index.year < 2021]\n", 172 | "aq_test_df = aq_df[aq_df.index.year == 2021]\n", 173 | "\n", 174 | "x_train_df = aq_train_df.drop('no2_avg',1)\n", 175 | "x_train = x_train_df.values.astype('float32')\n", 176 | "\n", 177 | "\n", 178 | "x_test_df = aq_test_df.drop('no2_avg',1)\n", 179 | "x_test = x_test_df.values.astype('float32')\n", 180 | "\n", 181 | "y_train_df = aq_train_df[[\"no2_avg\"]]\n", 182 | "y_train = y_train_df.values[:, 0].astype('float32')\n", 183 | "\n", 184 | "y_test_df = aq_test_df[[\"no2_avg\"]]\n", 185 | "y_test = y_test_df.values[:, 0].astype('float32')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 14, 191 | "id": "fbc3accb-ff40-4ed8-9f98-aa369cb8c820", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "from math import sqrt\n", 196 | "from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score\n", 197 | "\n", 198 | "def smape(actual, predicted):\n", 199 | " dividend= np.abs(np.array(actual) - np.array(predicted))\n", 200 | " denominator = np.array(actual) + np.array(predicted)\n", 201 | " \n", 202 | " return 2 * np.mean(np.divide(dividend, denominator, out=np.zeros_like(dividend), where=denominator!=0, casting='unsafe'))\n", 203 | "\n", 204 | "def print_metrics(y_test, y_pred):\n", 205 | " print(\"RMSE: %.4f\" % sqrt(mean_squared_error(y_test, y_pred)))\n", 206 | " print('Variance score: %.4f' % r2_score(y_test, y_pred))\n", 207 | " print('Explained variance score: %.4f' % explained_variance_score(y_test, y_pred))\n", 208 | " forecast_err = np.array(y_test) - np.array(y_pred)\n", 209 | " print('Forecast bias: %.4f' % (np.sum(forecast_err) * 1.0/len(y_pred) ))\n", 210 | " print('sMAPE: %.4f' % smape(y_test, y_pred))" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 15, 216 | "id": "b3f9a0f2-5425-4322-85f6-6cbff7a1d436", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "from sklearn.linear_model import LinearRegression" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 17, 226 | "id": "b61dc1a6-1227-4fa9-aa9c-b12aa64d48d4", 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "from sklearn.ensemble import RandomForestRegressor\n", 231 | "reg = RandomForestRegressor(max_depth=4)\n", 232 | "reg.fit(x_train, y_train)\n", 233 | "reg.score(x_test, y_test)\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "b3f6fc3e-9d21-4fe1-a502-0e748737c87f", 239 | "metadata": { 240 | "tags": [] 241 | }, 242 | "source": [ 243 | "# Real Time Collaboration - Model Improvement" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 4, 249 | "id": "066f22bd-153f-485e-890e-c5f14d376ed7", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "# from sklearn.ensemble import RandomForestRegressor\n", 254 | "# reg 
= RandomForestRegressor(max_depth=4)\n", 255 | "# reg.fit(x_train, y_train)\n", 256 | "# reg.score(x_test, y_test)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "id": "ed6a1f85-3dd8-4fd3-bc09-9058b8c0b9e7", 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "result = reg.predict(x_test)\n", 267 | "print_metrics(y_test, result)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "7bd38233-9e94-4176-922d-ec38d5fa7861", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "y_pred_df = pd.DataFrame(result, columns=y_test_df.columns).set_index(y_test_df.index).sort_index()\n", 278 | "\n", 279 | "plt.plot(y_test_df, label='actual')\n", 280 | "plt.plot(y_pred_df, label='forecast')\n", 281 | "plt.legend()\n", 282 | "plt.show()\n" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "304d2f2c-0ae1-4b83-8e8c-831918079f41", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "import joblib\n", 293 | "\n", 294 | "# Save model\n", 295 | "joblib.dump(reg, \"data/model.pkl\") " 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 62, 301 | "id": "b3fb06bf-1c6a-4414-968a-7f8f4068ff78", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# Upload to S3\n", 306 | "!aws s3 cp data/model.pkl s3://YOUR-S3-BUCKET/airquality-experiment/" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "id": "f9c9c838-85d9-45ef-b38e-6d5d10e70aee", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [] 316 | } 317 | ], 318 | "metadata": { 319 | "instance_type": "ml.t3.medium", 320 | "kernelspec": { 321 | "display_name": "Python 3 (Data Science 3.0)", 322 | "language": "python", 323 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/sagemaker-data-science-310-v1" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.10.6" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 5 340 | } 341 | -------------------------------------------------------------------------------- /sagemaker-studio-emr-spark/README.md: -------------------------------------------------------------------------------- 1 | # Scalable data preparation & ML using Apache Spark on AWS 2 | 3 | Analyzing, transforming and preparing large amounts of data is a foundational step of any data science and ML workflow. This session shows how to build end-to-end data preparation and machine learning (ML) workflows. We explain how to connect Apache Spark, for fast data preparation in your data processing environments on Amazon EMR and AWS Glue interactive sessions from Amazon SageMaker Studio. Uncover how to access data governed by AWS Lake Formation to interactively query, explore, visualize data, run and debug Spark jobs as you prepare large-scale data for use in ML. 4 | 5 | You may like to refer to session **Scalable data preparation & ML using Apache Spark on AWS** at [AWS Innovate 2023 (Data and Machine Learning)](https://aws.amazon.com/events/aws-innovate/apj/aiml-data/) for the code walk-through. 
6 | 7 | ## Introduction 8 | 9 | While some organizations see data science, data engineering, and data analytics as separate siloed functions, we're increasingly seeing with many of our customers that data prep and analytics are foundational components of ML workflows. 10 | 11 | For example, although organizations have data engineering teams to clean and prepare data for analytics and ML, the specific data that a data scientist may need for training a specific model may not be available in the repository of data that a data engineering team may have prepared. 12 | 13 | ![Intro](img/img1.png) 14 | 15 | ## Problem statement 16 | 17 | Let's take a problem and try to solve it. As we all know, air pollution in cities can be an acute problem, leading to damaging effects on people, animals, plants, and property. 18 | 19 | ![Intro](img/img2.png) 20 | 21 | We need to build a machine learning model that can help predict the amount of NO2 in an area based on weather conditions. 22 | 23 | So, ultimately we would like to have an ML model into which we feed the weather details of a particular city at a given time. 24 | These details would be the mean temperature, maximum temperature, minimum temperature, and so on. 25 | 26 | And the model should predict the NO2 (nitrogen dioxide) concentration levels at that time. 27 | 28 | ![Intro](img/img3.png) 29 | 30 | ## Solution 31 | 32 | So, what are the tasks we have at hand? 33 | 34 | - We first need to clean and prepare the data for ML training, and we are going to do that using `Apache Spark` 35 | 36 | - Then we need to train and finally deploy the model using `Amazon SageMaker`. 37 | 38 | ![Intro](img/img4.png) 39 | 40 | So, we are going to use Amazon SageMaker for ML training and model hosting, `Pandas` for data analysis, and `Amazon EMR` for data processing. Our training dataset will be stored in an `Amazon S3` bucket. 41 | 42 | But we don’t have to worry about stitching these multiple services and tools together ourselves. 43 | 44 | ## Amazon SageMaker Studio 45 | 46 | We are going to use Amazon SageMaker Studio, which is the first fully integrated development environment, or IDE, for machine learning. SageMaker Studio provides users the ability to visually browse and connect to Amazon EMR clusters right from the Studio notebook. Additionally, you can now provision and terminate EMR clusters directly from Studio. 47 | 48 | 49 | ![Intro](img/img5.png) 50 | 51 | ## Steps to follow 52 | 53 | In this section, we will walk you through how to create the environment, launch a Studio notebook, and perform the data processing, model training, and deployment using an Amazon SageMaker Studio notebook and Amazon EMR. 54 | 55 | ### Create the environment 56 | 57 | In this section, we supply an AWS CloudFormation template and example notebooks to get started in a demonstration SageMaker domain. 58 | 59 | The following stack provides an end-to-end CloudFormation template that stands up a private VPC, a SageMaker domain attached to that VPC, and a SageMaker user with visibility to the pre-created AWS Service Catalog product. 60 | 61 | Please use this [CloudFormation template](/code/CFN-SagemakerEMRNoAuthProductWithStudio-v3.yaml) to deploy the environment. 62 | 63 | ### Launch the SageMaker Studio Notebook 64 | 65 | Once the stack is deployed, perform the following to launch a SageMaker Studio notebook: 66 | 67 | 1. Open the **Amazon SageMaker** console. 68 | 2. Click on `Domains` in the left menu. 69 | 3. Click on `StudioDomain`. 70 | 4. 
Click on `Launch` button for the `studio-user` 71 | 72 | ![Intro](img/img6.png) 73 | 74 | ### Air Quality Predictions with Amazon SageMaker and Amazon EMR 75 | 76 | Download this [Jupyter Notebook](/code/demo-sm-emr.ipynb) and import it inside the Studio Notebook and follow the instructions in the notebook. 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /sagemaker-studio-emr-spark/code/CFN-SagemakerEMRNoAuthProductWithStudio-v3.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | AWSTemplateFormatVersion: '2010-09-09' 3 | Description: > 4 | This cloudformation template enables SageMaker Studio to launch and connect to EMR clusters. 5 | The EMR cluster is launched via Service Catalog. 6 | This template creates a demonstration SageMaker Studio Domain & SageMaker User Profile. It ppopulates Service Catalog 7 | with a Product that consists of another cloudformation template for launching EMR. 8 | It creates the Studio Domain in a private VPC and establishes connectivity with EMR via No-Auth as described in 9 | "https://aws.amazon.com/blogs/machine-learning/part-1-create-and-manage-amazon-emr-clusters-from-sagemaker-studio-to-run-interactive-spark-and-ml-workloads/" 10 | 11 | Mappings: 12 | VpcConfigurations: 13 | cidr: 14 | Vpc: 10.0.0.0/16 15 | PublicSubnet1: 10.0.10.0/24 16 | PrivateSubnet1: 10.0.20.0/24 17 | ClusterConfigurations: 18 | emr: 19 | BootStrapScriptFile: installpylibs-v2.sh 20 | StepScriptFile: configurekdc.sh 21 | s3params: 22 | BlogS3Bucket: aws-ml-blog 23 | S3Key: artifacts/sma-milestone1/ 24 | 25 | Parameters: 26 | SageMakerDomainName: 27 | Type: String 28 | Description: Name of the Studio Domain to Create 29 | Default: SageMakerEMRDomain 30 | 31 | Resources: 32 | S3Bucket: 33 | Type: AWS::S3::Bucket 34 | Properties: 35 | BucketName: !Join [ "-", [ "sagemaker-emr-template-cfn", !Select [ 2, !Split [ "/", !Ref AWS::StackId ] ] ] ] 36 | 37 | VPC: 38 | Type: 'AWS::EC2::VPC' 39 | Properties: 40 | CidrBlock: !FindInMap 41 | - VpcConfigurations 42 | - cidr 43 | - Vpc 44 | EnableDnsSupport: true 45 | EnableDnsHostnames: true 46 | Tags: 47 | - Key: "for-use-with-amazon-emr-managed-policies" 48 | Value: "true" 49 | - Key: Name 50 | Value: !Sub '${AWS::StackName}-VPC' 51 | 52 | InternetGateway: 53 | Type: 'AWS::EC2::InternetGateway' 54 | Properties: 55 | Tags: 56 | - Key: Name 57 | Value: !Sub '${AWS::StackName}-IGW' 58 | 59 | InternetGatewayAttachment: 60 | Type: 'AWS::EC2::VPCGatewayAttachment' 61 | Properties: 62 | InternetGatewayId: !Ref InternetGateway 63 | VpcId: !Ref VPC 64 | 65 | PublicSubnet1: 66 | Type: 'AWS::EC2::Subnet' 67 | Properties: 68 | VpcId: !Ref VPC 69 | AvailabilityZone: !Select 70 | - 0 71 | - !GetAZs '' 72 | CidrBlock: !FindInMap 73 | - VpcConfigurations 74 | - cidr 75 | - PublicSubnet1 76 | MapPublicIpOnLaunch: true 77 | Tags: 78 | - Key: Name 79 | Value: !Sub '${AWS::StackName} Public Subnet (AZ1)' 80 | 81 | PrivateSubnet1: 82 | Type: 'AWS::EC2::Subnet' 83 | Properties: 84 | VpcId: !Ref VPC 85 | AvailabilityZone: !Select 86 | - 0 87 | - !GetAZs '' 88 | CidrBlock: !FindInMap 89 | - VpcConfigurations 90 | - cidr 91 | - PrivateSubnet1 92 | MapPublicIpOnLaunch: false 93 | Tags: 94 | - Key: "for-use-with-amazon-emr-managed-policies" 95 | Value: "true" 96 | - Key: Name 97 | Value: !Sub '${AWS::StackName} Private Subnet (AZ1)' 98 | 99 | NatGateway1EIP: 100 | Type: 'AWS::EC2::EIP' 101 | DependsOn: InternetGatewayAttachment 102 | Properties: 103 | 
Domain: vpc 104 | 105 | NatGateway1: 106 | Type: 'AWS::EC2::NatGateway' 107 | Properties: 108 | AllocationId: !GetAtt 109 | - NatGateway1EIP 110 | - AllocationId 111 | SubnetId: !Ref PublicSubnet1 112 | 113 | PublicRouteTable: 114 | Type: 'AWS::EC2::RouteTable' 115 | Properties: 116 | VpcId: !Ref VPC 117 | Tags: 118 | - Key: Name 119 | Value: !Sub '${AWS::StackName} Public Routes' 120 | 121 | DefaultPublicRoute: 122 | Type: 'AWS::EC2::Route' 123 | DependsOn: InternetGatewayAttachment 124 | Properties: 125 | RouteTableId: !Ref PublicRouteTable 126 | DestinationCidrBlock: 0.0.0.0/0 127 | GatewayId: !Ref InternetGateway 128 | 129 | PublicSubnet1RouteTableAssociation: 130 | Type: 'AWS::EC2::SubnetRouteTableAssociation' 131 | Properties: 132 | RouteTableId: !Ref PublicRouteTable 133 | SubnetId: !Ref PublicSubnet1 134 | 135 | PrivateRouteTable1: 136 | Type: 'AWS::EC2::RouteTable' 137 | Properties: 138 | VpcId: !Ref VPC 139 | Tags: 140 | - Key: Name 141 | Value: !Sub '${AWS::StackName} Private Routes (AZ1)' 142 | 143 | 144 | PrivateSubnet1RouteTableAssociation: 145 | Type: 'AWS::EC2::SubnetRouteTableAssociation' 146 | Properties: 147 | RouteTableId: !Ref PrivateRouteTable1 148 | SubnetId: !Ref PrivateSubnet1 149 | 150 | PrivateSubnet1InternetRoute: 151 | Type: 'AWS::EC2::Route' 152 | Properties: 153 | RouteTableId: !Ref PrivateRouteTable1 154 | DestinationCidrBlock: 0.0.0.0/0 155 | NatGatewayId: !Ref NatGateway1 156 | 157 | S3Endpoint: 158 | Type: 'AWS::EC2::VPCEndpoint' 159 | Properties: 160 | ServiceName: !Sub 'com.amazonaws.${AWS::Region}.s3' 161 | VpcEndpointType: Gateway 162 | PolicyDocument: 163 | Version: 2012-10-17 164 | Statement: 165 | - Effect: Allow 166 | Principal: '*' 167 | Action: 168 | - '*' 169 | Resource: 170 | - '*' 171 | VpcId: !Ref VPC 172 | RouteTableIds: 173 | - !Ref PrivateRouteTable1 174 | 175 | SageMakerInstanceSecurityGroup: 176 | Type: 'AWS::EC2::SecurityGroup' 177 | Properties: 178 | Tags: 179 | - Key: "for-use-with-amazon-emr-managed-policies" 180 | Value: "true" 181 | GroupName: SMSG 182 | GroupDescription: Security group with no ingress rule 183 | SecurityGroupEgress: 184 | - IpProtocol: -1 185 | FromPort: -1 186 | ToPort: -1 187 | CidrIp: 0.0.0.0/0 188 | VpcId: !Ref VPC 189 | SageMakerInstanceSecurityGroupIngress: 190 | Type: AWS::EC2::SecurityGroupIngress 191 | Properties: 192 | IpProtocol: '-1' 193 | GroupId: !Ref SageMakerInstanceSecurityGroup 194 | SourceSecurityGroupId: !Ref SageMakerInstanceSecurityGroup 195 | VPCEndpointSecurityGroup: 196 | Type: AWS::EC2::SecurityGroup 197 | Properties: 198 | GroupDescription: Allow TLS for VPC Endpoint 199 | SecurityGroupEgress: 200 | - IpProtocol: -1 201 | FromPort: -1 202 | ToPort: -1 203 | CidrIp: 0.0.0.0/0 204 | VpcId: !Ref VPC 205 | Tags: 206 | - Key: Name 207 | Value: !Sub ${AWS::StackName}-endpoint-security-group 208 | EndpointSecurityGroupIngress: 209 | Type: AWS::EC2::SecurityGroupIngress 210 | Properties: 211 | IpProtocol: '-1' 212 | GroupId: !Ref VPCEndpointSecurityGroup 213 | SourceSecurityGroupId: !Ref SageMakerInstanceSecurityGroup 214 | SageMakerExecutionRole: 215 | Type: 'AWS::IAM::Role' 216 | Properties: 217 | RoleName: !Sub "${AWS::StackName}-EMR-SageMakerExecutionRole" 218 | AssumeRolePolicyDocument: 219 | Version: 2012-10-17 220 | Statement: 221 | - Effect: Allow 222 | Principal: 223 | Service: 224 | - sagemaker.amazonaws.com 225 | Action: 226 | - 'sts:AssumeRole' 227 | Path: / 228 | Policies: 229 | - PolicyName: !Sub '${AWS::StackName}-sageemr' 230 | PolicyDocument: 231 | Version: 2012-10-17 232 | 
Statement: 233 | - Effect: Allow 234 | Action: 235 | - elasticmapreduce:ListInstances 236 | - elasticmapreduce:DescribeCluster 237 | - elasticmapreduce:DescribeSecurityConfiguration 238 | - elasticmapreduce:CreatePersistentAppUI 239 | - elasticmapreduce:DescribePersistentAppUI 240 | - elasticmapreduce:GetPersistentAppUIPresignedURL 241 | - elasticmapreduce:GetOnClusterAppUIPresignedURL 242 | - elasticmapreduce:ListClusters 243 | - iam:GetRole 244 | Resource: '*' 245 | - Effect: Allow 246 | Action: 247 | - elasticmapreduce:DescribeCluster 248 | - elasticmapreduce:ListInstanceGroups 249 | Resource: !Sub "arn:${AWS::Partition}:elasticmapreduce:*:*:cluster/*" 250 | - Effect: Allow 251 | Action: 252 | - elasticmapreduce:ListClusters 253 | Resource: '*' 254 | ManagedPolicyArns: 255 | - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonSageMakerFullAccess" 256 | - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonS3ReadOnlyAccess" 257 | 258 | VPCEndpointSagemakerAPI: 259 | Type: AWS::EC2::VPCEndpoint 260 | Properties: 261 | PolicyDocument: 262 | Version: 2012-10-17 263 | Statement: 264 | - Effect: Allow 265 | Principal: '*' 266 | Action: '*' 267 | Resource: '*' 268 | VpcEndpointType: Interface 269 | PrivateDnsEnabled: true 270 | SubnetIds: 271 | - !Ref PrivateSubnet1 272 | SecurityGroupIds: 273 | - !Ref VPCEndpointSecurityGroup 274 | ServiceName: !Sub 'com.amazonaws.${AWS::Region}.sagemaker.api' 275 | VpcId: !Ref VPC 276 | VPCEndpointSageMakerRuntime: 277 | Type: AWS::EC2::VPCEndpoint 278 | Properties: 279 | PolicyDocument: 280 | Version: 2012-10-17 281 | Statement: 282 | - Effect: Allow 283 | Principal: '*' 284 | Action: '*' 285 | Resource: '*' 286 | VpcEndpointType: Interface 287 | PrivateDnsEnabled: true 288 | SubnetIds: 289 | - !Ref PrivateSubnet1 290 | SecurityGroupIds: 291 | - !Ref VPCEndpointSecurityGroup 292 | ServiceName: !Sub 'com.amazonaws.${AWS::Region}.sagemaker.runtime' 293 | VpcId: !Ref VPC 294 | VPCEndpointSTS: 295 | Type: 'AWS::EC2::VPCEndpoint' 296 | Properties: 297 | PolicyDocument: 298 | Version: 2012-10-17 299 | Statement: 300 | - Effect: Allow 301 | Principal: '*' 302 | Action: '*' 303 | Resource: '*' 304 | VpcEndpointType: Interface 305 | PrivateDnsEnabled: true 306 | SubnetIds: 307 | - !Ref PrivateSubnet1 308 | SecurityGroupIds: 309 | - !Ref VPCEndpointSecurityGroup 310 | ServiceName: !Sub 'com.amazonaws.${AWS::Region}.sts' 311 | VpcId: !Ref VPC 312 | VPCEndpointCW: 313 | Type: 'AWS::EC2::VPCEndpoint' 314 | Properties: 315 | PolicyDocument: 316 | Version: 2012-10-17 317 | Statement: 318 | - Effect: Allow 319 | Principal: '*' 320 | Action: '*' 321 | Resource: '*' 322 | VpcEndpointType: Interface 323 | PrivateDnsEnabled: true 324 | SubnetIds: 325 | - !Ref PrivateSubnet1 326 | SecurityGroupIds: 327 | - !Ref VPCEndpointSecurityGroup 328 | ServiceName: !Sub 'com.amazonaws.${AWS::Region}.monitoring' 329 | VpcId: !Ref VPC 330 | VPCEndpointCWL: 331 | Type: 'AWS::EC2::VPCEndpoint' 332 | Properties: 333 | PolicyDocument: 334 | Version: 2012-10-17 335 | Statement: 336 | - Effect: Allow 337 | Principal: '*' 338 | Action: '*' 339 | Resource: '*' 340 | VpcEndpointType: Interface 341 | PrivateDnsEnabled: true 342 | SubnetIds: 343 | - !Ref PrivateSubnet1 344 | SecurityGroupIds: 345 | - !Ref VPCEndpointSecurityGroup 346 | ServiceName: !Sub 'com.amazonaws.${AWS::Region}.logs' 347 | VpcId: !Ref VPC 348 | VPCEndpointECR: 349 | Type: 'AWS::EC2::VPCEndpoint' 350 | Properties: 351 | PolicyDocument: 352 | Version: 2012-10-17 353 | Statement: 354 | - Effect: Allow 355 | Principal: '*' 356 | 
            Action: '*'
            Resource: '*'
      VpcEndpointType: Interface
      PrivateDnsEnabled: true
      SubnetIds:
        - !Ref PrivateSubnet1
      SecurityGroupIds:
        - !Ref VPCEndpointSecurityGroup
      ServiceName: !Sub 'com.amazonaws.${AWS::Region}.ecr.dkr'
      VpcId: !Ref VPC
  VPCEndpointECRAPI:
    Type: 'AWS::EC2::VPCEndpoint'
    Properties:
      PolicyDocument:
        Version: 2012-10-17
        Statement:
          - Effect: Allow
            Principal: '*'
            Action: '*'
            Resource: '*'
      VpcEndpointType: Interface
      PrivateDnsEnabled: true
      SubnetIds:
        - !Ref PrivateSubnet1
      SecurityGroupIds:
        - !Ref VPCEndpointSecurityGroup
      ServiceName: !Sub 'com.amazonaws.${AWS::Region}.ecr.api'
      VpcId: !Ref VPC

  StudioDomain:
    Type: AWS::SageMaker::Domain
    Properties:
      DomainName: !Ref SageMakerDomainName
      AppNetworkAccessType: VpcOnly
      AuthMode: IAM
      VpcId: !Ref VPC
      SubnetIds:
        - !Ref PrivateSubnet1
      DefaultUserSettings:
        ExecutionRole: !GetAtt SageMakerExecutionRole.Arn
        SecurityGroups:
          - !Ref SageMakerInstanceSecurityGroup

  StudioUserProfile:
    Type: AWS::SageMaker::UserProfile
    Properties:
      DomainId: !Ref StudioDomain
      UserProfileName: studio-user
      UserSettings:
        ExecutionRole: !GetAtt SageMakerExecutionRole.Arn

  # Products populated to Service Catalog
  ###################################################

  SageMakerStudioEMRNoAuthProduct:
    Type: AWS::ServiceCatalog::CloudFormationProduct
    Properties:
      Owner: AWS
      Name: SageMaker Studio Domain No Auth EMR
      ProvisioningArtifactParameters:
        - Name: SageMaker Studio Domain No Auth EMR
          Description: Provisions a SageMaker domain and No Auth EMR Cluster
          Info:
            LoadTemplateFromURL: https://aws-ml-blog.s3.amazonaws.com/artifacts/astra-m4-sagemaker/end-to-end/CFN-EMR-NoStudioNoAuthTemplate-v3.yaml
      Tags:
        - Key: "sagemaker:studio-visibility:emr"
          Value: "true"

  SageMakerStudioEMRNoAuthProductPortfolio:
    Type: AWS::ServiceCatalog::Portfolio
    Properties:
      ProviderName: AWS
      DisplayName: SageMaker Product Portfolio

  SageMakerStudioEMRNoAuthProductPortfolioAssociation:
    Type: AWS::ServiceCatalog::PortfolioProductAssociation
    Properties:
      PortfolioId: !Ref SageMakerStudioEMRNoAuthProductPortfolio
      ProductId: !Ref SageMakerStudioEMRNoAuthProduct

  EMRNoAuthLaunchConstraint:
    Type: 'AWS::IAM::Role'
    Properties:
      Policies:
        - PolicyDocument:
            Statement:
              - Action:
                  - s3:*
                Effect: Allow
                Resource:
                  - !Sub "arn:${AWS::Partition}:s3:::sagemaker-emr-template-cfn-*/*"
                  - !Sub "arn:${AWS::Partition}:s3:::sagemaker-emr-template-cfn-*"
              - Action:
                  - s3:GetObject
                Effect: Allow
                Resource: "*"
                Condition:
                  StringEquals:
                    s3:ExistingObjectTag/servicecatalog:provisioning: 'true'
          PolicyName: !Sub ${AWS::StackName}-${AWS::Region}-S3-Policy
        - PolicyDocument:
            Statement:
              - Action:
                  - "sns:Publish"
                Effect: Allow
                Resource: !Sub "arn:${AWS::Partition}:sns:${AWS::Region}:${AWS::AccountId}:*"
            Version: "2012-10-17"
          PolicyName: SNSPublishPermissions
        - PolicyDocument:
            Statement:
              - Action:
                  - "ec2:CreateSecurityGroup"
                  - "ec2:RevokeSecurityGroupEgress"
                  - "ec2:DeleteSecurityGroup"
- "ec2:createTags" 472 | - "iam:TagRole" 473 | - "ec2:AuthorizeSecurityGroupEgress" 474 | - "ec2:AuthorizeSecurityGroupIngress" 475 | - "ec2:RevokeSecurityGroupIngress" 476 | Effect: Allow 477 | Resource: "*" 478 | Version: "2012-10-17" 479 | PolicyName: EC2Permissions 480 | - PolicyDocument: 481 | Statement: 482 | - Action: 483 | - "elasticmapreduce:RunJobFlow" 484 | Effect: Allow 485 | Resource: !Sub "arn:${AWS::Partition}:elasticmapreduce:${AWS::Region}:${AWS::AccountId}:cluster/*" 486 | Version: "2012-10-17" 487 | PolicyName: EMRRunJobFlowPermissions 488 | - PolicyDocument: 489 | Statement: 490 | - Action: 491 | - "iam:PassRole" 492 | Effect: Allow 493 | Resource: 494 | - !GetAtt EMRClusterinstanceProfileRole.Arn 495 | - !GetAtt EMRClusterServiceRole.Arn 496 | - Action: 497 | - "iam:CreateInstanceProfile" 498 | - "iam:RemoveRoleFromInstanceProfile" 499 | - "iam:DeleteInstanceProfile" 500 | - "iam:AddRoleToInstanceProfile" 501 | Effect: Allow 502 | Resource: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:instance-profile/SC-*" 503 | Version: "2012-10-17" 504 | PolicyName: IAMPermissions 505 | AssumeRolePolicyDocument: 506 | Version: "2012-10-17" 507 | Statement: 508 | - 509 | Effect: "Allow" 510 | Principal: 511 | Service: 512 | - "servicecatalog.amazonaws.com" 513 | Action: 514 | - "sts:AssumeRole" 515 | ManagedPolicyArns: 516 | - "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AWSServiceCatalogAdminFullAccess" 517 | - "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEMRFullAccessPolicy_v2" 518 | 519 | # Sets the principal who can initate provisioning from Service Studio 520 | ####################################################################### 521 | 522 | SageMakerStudioEMRNoAuthProductPortfolioPrincipalAssociation: 523 | Type: AWS::ServiceCatalog::PortfolioPrincipalAssociation 524 | Properties: 525 | PrincipalARN: !GetAtt SageMakerExecutionRole.Arn 526 | PortfolioId: !Ref SageMakerStudioEMRNoAuthProductPortfolio 527 | PrincipalType: IAM 528 | 529 | SageMakerStudioPortfolioLaunchRoleConstraint: 530 | Type: AWS::ServiceCatalog::LaunchRoleConstraint 531 | Properties: 532 | PortfolioId: !Ref SageMakerStudioEMRNoAuthProductPortfolio 533 | ProductId: !Ref SageMakerStudioEMRNoAuthProduct 534 | RoleArn: !GetAtt EMRNoAuthLaunchConstraint.Arn 535 | Description: Role used for provisioning 536 | 537 | # EMR IAM Roles 538 | ######################################################################## 539 | EMRClusterServiceRole: 540 | Type: AWS::IAM::Role 541 | Properties: 542 | AssumeRolePolicyDocument: 543 | Statement: 544 | - Action: 545 | - sts:AssumeRole 546 | Effect: Allow 547 | Principal: 548 | Service: 549 | - elasticmapreduce.amazonaws.com 550 | Version: '2012-10-17' 551 | ManagedPolicyArns: 552 | - arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2 553 | Path: "/" 554 | Policies: 555 | - PolicyName: 556 | Fn::Sub: AllowEMRInstnaceProfilePolicy-${AWS::StackName} 557 | PolicyDocument: 558 | Version: '2012-10-17' 559 | Statement: 560 | - Effect: Allow 561 | Action: "iam:PassRole" 562 | Resource: !GetAtt EMRClusterinstanceProfileRole.Arn 563 | 564 | # User's Should Consider using RoleBasedAccess Control now that it is available to pass your SageMaker execution role 565 | # to the cluster instead. 
  EMRClusterinstanceProfileRole:
    Properties:
      RoleName:
        Fn::Sub: "${AWS::StackName}-EMRClusterinstanceProfileRole"
      AssumeRolePolicyDocument:
        Statement:
          - Action:
              - sts:AssumeRole
            Effect: Allow
            Principal:
              Service:
                - ec2.amazonaws.com
        Version: '2012-10-17'
      ManagedPolicyArns:
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonSageMakerFullAccess"
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonS3ReadOnlyAccess"
      Path: "/"
    Type: AWS::IAM::Role

  # Manage EMR Log and Artifacts S3 Bucket
  ########################################################################
  CopyZips:
    Type: Custom::CopyZips
    DependsOn: CleanUpBucketonDelete
    Properties:
      ServiceToken:
        Fn::GetAtt: CopyZipsFunction.Arn
      DestBucket:
        Ref: S3Bucket
      SourceBucket:
        Fn::FindInMap:
          - ClusterConfigurations
          - s3params
          - BlogS3Bucket
      Prefix:
        Fn::FindInMap:
          - ClusterConfigurations
          - s3params
          - S3Key
      Objects:
        - Fn::FindInMap:
            - ClusterConfigurations
            - emr
            - BootStrapScriptFile
        - Fn::FindInMap:
            - ClusterConfigurations
            - emr
            - StepScriptFile
  BucketManagementRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: lambda.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
      Path: "/"
      Policies:
        - PolicyName:
            Fn::Sub: BucketManagementLambdaPolicy-${AWS::StackName}
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - s3:GetObject
                Resource: "*"
              - Effect: Allow
                Action:
                  - s3:PutObject
                  - s3:DeleteObject
                Resource:
                  - Fn::Sub: arn:aws:s3:::${S3Bucket}/*
  CopyZipsFunction:
    Type: AWS::Lambda::Function
    Properties:
      Description: Copies objects from a source S3 bucket to a destination
      Handler: index.handler
      Runtime: python3.8
      Role:
        Fn::GetAtt: BucketManagementRole.Arn
      Timeout: 900
      Code:
        ZipFile: |
          import json
          import logging
          import threading
          import boto3
          import cfnresponse
          def copy_objects(source_bucket, dest_bucket, prefix, objects):
              s3 = boto3.client('s3')
              for o in objects:
                  key = prefix + o
                  copy_source = {
                      'Bucket': source_bucket,
                      'Key': key
                  }
                  print('copy_source: %s' % copy_source)
                  print('dest_bucket = %s' % dest_bucket)
                  print('key = %s' % key)
                  s3.copy_object(CopySource=copy_source, Bucket=dest_bucket,
                                 Key=key)
          def delete_objects(bucket, prefix, objects):
              s3 = boto3.client('s3')
              objects = {'Objects': [{'Key': prefix + o} for o in objects]}
              s3.delete_objects(Bucket=bucket, Delete=objects)
          def timeout(event, context):
              logging.error('Execution is about to time out, sending failure response to CloudFormation')
              cfnresponse.send(event, context, cfnresponse.FAILED, {}, None)
          def handler(event, context):
              # make sure we send a failure to CloudFormation if the function
              # is going to timeout
              timer = threading.Timer((context.get_remaining_time_in_millis()
                                       / 1000.00) - 0.5, timeout, args=[event, context])
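              # start the watchdog; it reports FAILED to CloudFormation just before the Lambda times out so the stack does not hang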
              timer.start()
              print('Received event: %s' % json.dumps(event))
              status = cfnresponse.SUCCESS
              try:
                  source_bucket = event['ResourceProperties']['SourceBucket']
                  dest_bucket = event['ResourceProperties']['DestBucket']
                  prefix = event['ResourceProperties']['Prefix']
                  objects = event['ResourceProperties']['Objects']
                  if event['RequestType'] == 'Delete':
                      delete_objects(dest_bucket, prefix, objects)
                  else:
                      copy_objects(source_bucket, dest_bucket, prefix, objects)
              except Exception as e:
                  logging.error('Exception: %s' % e, exc_info=True)
                  status = cfnresponse.FAILED
              finally:
                  timer.cancel()
                  cfnresponse.send(event, context, status, {}, None)
  CleanUpBucketonDelete:
    Type: Custom::emptybucket
    Properties:
      ServiceToken:
        Fn::GetAtt:
          - CleanUpBucketonDeleteLambda
          - Arn
      BucketName:
        Ref: S3Bucket
  CleanUpBucketonDeleteLambda:
    Type: AWS::Lambda::Function
    Properties:
      Code:
        ZipFile:
          !Sub |
            import json, boto3, logging
            import cfnresponse
            logger = logging.getLogger()
            logger.setLevel(logging.INFO)

            def lambda_handler(event, context):
                logger.info("event: {}".format(event))
                try:
                    bucket = event['ResourceProperties']['BucketName']
                    logger.info("bucket: {}, event['RequestType']: {}".format(bucket, event['RequestType']))
                    if event['RequestType'] == 'Delete':
                        s3 = boto3.resource('s3')
                        bucket = s3.Bucket(bucket)
                        for obj in bucket.objects.filter():
                            logger.info("delete obj: {}".format(obj))
                            s3.Object(bucket.name, obj.key).delete()

                    sendResponseCfn(event, context, cfnresponse.SUCCESS)
                except Exception as e:
                    logger.info("Exception: {}".format(e))
                    sendResponseCfn(event, context, cfnresponse.FAILED)

            def sendResponseCfn(event, context, responseStatus):
                responseData = {}
                responseData['Data'] = {}
                cfnresponse.send(event, context, responseStatus, responseData, "CustomResourcePhysicalID")
      Handler: "index.lambda_handler"
      Runtime: python3.7
      MemorySize: 128
      Timeout: 60
      Role: !GetAtt BucketManagementRole.Arn

# Stack Outputs
###########################################################################
Outputs:
  SageMakerEMRDemoCloudformationVPCId:
    Description: The ID of the SageMaker Studio VPC
    Value: !Ref VPC
    Export:
      Name: "SageMakerEMRDemoCloudformationVPCId"

  SageMakerEMRDemoCloudformationSubnetId:
    Description: The subnet ID used by SageMaker Studio
    Value: !Ref PrivateSubnet1
    Export:
      Name: "SageMakerEMRDemoCloudformationSubnetId"

  SageMakerEMRDemoCloudformationSecurityGroup:
    Description: The security group of the SageMaker Studio instance
    Value: !Ref SageMakerInstanceSecurityGroup
    Export:
      Name: "SageMakerEMRDemoCloudformationSecurityGroup"

  SageMakerEMRDemoCloudformationEMRClusterinstanceProfileRole:
    Description: Role for the EMR cluster's instance profile
    Value: !Ref EMRClusterinstanceProfileRole
    Export:
      Name: "SageMakerEMRDemoCloudformationEMRClusterinstanceProfileRole"

  SageMakerEMRDemoCloudformationEMRClusterServiceRole:
    Description: Service role for the EMR cluster
    Value: !Ref EMRClusterServiceRole
    Export:
      Name: "SageMakerEMRDemoCloudformationEMRClusterServiceRole"

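  # Bucket used for EMR logs and artifacts (receives the copied bootstrap and step scripts)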
  SageMakerEMRDemoCloudformationS3BucketName:
    Description: Name of the Amazon S3 bucket
    Value:
      Ref: S3Bucket
    Export:
      Name: "SageMakerEMRDemoCloudformationS3BucketName"
--------------------------------------------------------------------------------
/sagemaker-studio-emr-spark/img/img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/sagemaker-studio-emr-spark/img/img1.png
--------------------------------------------------------------------------------
/sagemaker-studio-emr-spark/img/img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/sagemaker-studio-emr-spark/img/img2.png
--------------------------------------------------------------------------------
/sagemaker-studio-emr-spark/img/img3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/sagemaker-studio-emr-spark/img/img3.png
--------------------------------------------------------------------------------
/sagemaker-studio-emr-spark/img/img4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/sagemaker-studio-emr-spark/img/img4.png
--------------------------------------------------------------------------------
/sagemaker-studio-emr-spark/img/img5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/sagemaker-studio-emr-spark/img/img5.png
--------------------------------------------------------------------------------
/sagemaker-studio-emr-spark/img/img6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/data-engineering-on-aws/b0a8b97c09f43c63b927f32e5002a301d702c555/sagemaker-studio-emr-spark/img/img6.png
--------------------------------------------------------------------------------