├── README.md
└── glue_pyspark_job.py

/README.md:
--------------------------------------------------------------------------------
# AWS-Glue-Pyspark-ETL-Job

This module performs statistical analysis on the novel coronavirus dataset. The implementation is specifically
designed for the AWS Glue environment and can be used as a Glue PySpark job.
The dataset being used was last updated on May 02, 2020.
The module performs the following functions:
* Reads data from CSV files stored on AWS S3
* Performs Extract, Transform, Load (ETL) operations
* Lists max cases for each country/region and province/state
* Lists max deaths for each country/region and province/state
* Lists max recoveries for each country/region and province/state
* Stores the aggregated output in Parquet format

--------------------------------------------------------------------------------
/glue_pyspark_job.py:
--------------------------------------------------------------------------------
"""
This module performs statistical analysis on the novel coronavirus dataset. The implementation is specifically
designed for the AWS Glue environment and can be used as a Glue PySpark job.
The dataset being used was last updated on May 02, 2020.
The module performs the following functions:
* Reads data from CSV files stored on AWS S3
* Performs Extract, Transform, Load (ETL) operations
* Lists max cases for each country/region and province/state
* Lists max deaths for each country/region and province/state
* Lists max recoveries for each country/region and province/state
* Stores the aggregated output in Parquet format
"""

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col
from awsglue.job import Job

# getting the job name from the job arguments
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# creating the Glue and Spark contexts and initialising the job
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# catalog: database and table names, S3 output bucket
db_name = "glue-catalog-dbname"
tbl_name = "glue-catalog-table"
s3_write_bucket = "s3 bucket path"

############################
#         EXTRACT          #
############################

# creating a datasource using the catalog table
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=db_name, table_name=tbl_name)

# converting from Glue DynamicFrame to Spark DataFrame
dataframe = datasource0.toDF()

############################
#        TRANSFORM         #
############################

# dropping the 'last update' column
datasource_df = dataframe.drop('last update')

# keeping only rows with at least 4 non-null values (thresh=4),
# i.e. dropping rows that are missing too many fields
corona_df = datasource_df.dropna(thresh=4)

# replacing missing values in the Province/State column with a default placeholder
cleansed_data_df = corona_df.fillna(
    value='na_province_state', subset='province/state')

# Grouping the records by the Province/State and Country/Region columns, aggregating with max(confirmed)
# and sorting in descending order of the maximum confirmed cases.
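# Note: .max('confirmed') produces a result column literally named 'max(confirmed)',
# which is why the select() below aliases it to a friendlier name before sorting.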
most_cases_province_state_df = cleansed_data_df.groupBy('province/state', 'country/region').max('confirmed')\
    .select('province/state', 'country/region', col("max(confirmed)").alias("Most_Cases"))\
    .orderBy('Most_Cases', ascending=False)

# Grouping the records by the Province/State and Country/Region columns, aggregating with max(deaths)
# and sorting in descending order of the maximum deaths.
most_deaths_province_state_df = cleansed_data_df.groupBy('province/state', 'country/region').max('deaths')\
    .select('province/state', 'country/region', col("max(deaths)").alias("Most_Deaths"))\
    .orderBy('Most_Deaths', ascending=False)

# Grouping the records by the Province/State and Country/Region columns, aggregating with max(recovered)
# and sorting in descending order of the maximum recoveries.
most_recoveries_province_state_df = cleansed_data_df.groupBy('province/state', 'country/region').max('recovered')\
    .select('province/state', 'country/region', col("max(recovered)").alias("Most_Recovered"))\
    .orderBy('Most_Recovered', ascending=False)

# converting the Spark DataFrames back to Glue DynamicFrames
transform1 = DynamicFrame.fromDF(
    most_cases_province_state_df, glueContext, 'transform1')
transform2 = DynamicFrame.fromDF(
    most_deaths_province_state_df, glueContext, 'transform2')
transform3 = DynamicFrame.fromDF(
    most_recoveries_province_state_df, glueContext, 'transform3')

############################
#           LOAD           #
############################

# storing the aggregated data in Parquet format under the specified S3 path
datasink1 = glueContext.write_dynamic_frame.from_options(frame=transform1, connection_type="s3", connection_options={
    "path": s3_write_bucket + '/most-cases'}, format="parquet", transformation_ctx="datasink1")
datasink2 = glueContext.write_dynamic_frame.from_options(frame=transform2, connection_type="s3", connection_options={
    "path": s3_write_bucket + '/most-deaths'}, format="parquet", transformation_ctx="datasink2")
datasink3 = glueContext.write_dynamic_frame.from_options(frame=transform3, connection_type="s3", connection_options={
    "path": s3_write_bucket + '/most-recoveries'}, format="parquet", transformation_ctx="datasink3")

job.commit()
--------------------------------------------------------------------------------
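The group-by/max aggregation used by the job can be sanity-checked outside of Glue. The sketch below is a minimal, hypothetical local test using plain PySpark only: the sample rows are invented for illustration, and the column names simply mirror the ones the job expects in the catalog table.

```python
# local_agg_check.py - hypothetical local sanity check, not part of the Glue job
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("corona-agg-check").getOrCreate()

# invented sample rows using the column names the Glue job assumes
sample = spark.createDataFrame(
    [
        ("Hubei", "China", 200, 20, 80),
        ("Hubei", "China", 100, 10, 50),
        ("na_province_state", "Italy", 150, 15, 60),
    ],
    ["province/state", "country/region", "confirmed", "deaths", "recovered"],
)

# same pattern as the job: group, take the max, alias the result, sort descending
most_cases = (
    sample.groupBy("province/state", "country/region")
    .max("confirmed")
    .select("province/state", "country/region",
            col("max(confirmed)").alias("Most_Cases"))
    .orderBy("Most_Cases", ascending=False)
)

most_cases.show()
spark.stop()
```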