├── README.md
└── glue_pyspark_job.py

/README.md:
--------------------------------------------------------------------------------
# AWS-Glue-Pyspark-ETL-Job

This module performs statistical analysis on the novel coronavirus dataset. The implementation is specifically
designed for the AWS Glue environment and can be used as a Glue PySpark job.
The dataset being used was last updated on May 02, 2020.
The module performs the following functions:
* Reads data from CSV files stored on AWS S3
* Performs Extract, Transform, Load (ETL) operations
* Lists max cases for each country/region and province/state
* Lists max deaths for each country/region and province/state
* Lists max recoveries for each country/region and province/state
* Stores the aggregated output in Parquet format

--------------------------------------------------------------------------------
/glue_pyspark_job.py:
--------------------------------------------------------------------------------
"""
This module performs statistical analysis on the novel coronavirus dataset. The implementation is specifically
designed for the AWS Glue environment and can be used as a Glue PySpark job.
The dataset being used was last updated on May 02, 2020.
The module performs the following functions:
* Reads data from CSV files stored on AWS S3
* Performs Extract, Transform, Load (ETL) operations
* Lists max cases for each country/region and province/state
* Lists max deaths for each country/region and province/state
* Lists max recoveries for each country/region and province/state
* Stores the aggregated output in Parquet format
"""

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col
from awsglue.job import Job

# getting the job name from the job arguments
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# creating the Glue and Spark contexts and initialising the job
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# catalog: database and table names, S3 output bucket
db_name = "glue-catalog-dbname"
tbl_name = "glue-catalog-table"
s3_write_bucket = "s3 bucket path"

############################
#         EXTRACT          #
############################

# creating a datasource using the catalog table
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=db_name, table_name=tbl_name)

# converting from Glue DynamicFrame to Spark DataFrame
dataframe = datasource0.toDF()

############################
#        TRANSFORM         #
############################

# dropping the 'last update' column
datasource_df = dataframe.drop('last update')

# keeping only rows with at least 4 non-null values (thresh=4),
# i.e. dropping rows that are missing too many fields
corona_df = datasource_df.dropna(thresh=4)

# replacing missing values in the Province/State column with a default placeholder
cleansed_data_df = corona_df.fillna(
    value='na_province_state', subset='province/state')

# Grouping the records by the Province/State and Country/Region columns, aggregating with max(confirmed)
# and sorting in descending order of the maximum confirmed cases.
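# Note: .max('confirmed') produces a result column literally named 'max(confirmed)',
# which is why the select() below aliases it to a friendlier name before sorting.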
most_cases_province_state_df = cleansed_data_df.groupBy('province/state', 'country/region').max('confirmed')\
    .select('province/state', 'country/region', col("max(confirmed)").alias("Most_Cases"))\
    .orderBy('Most_Cases', ascending=False)

# Grouping the records by the Province/State and Country/Region columns, aggregating with max(deaths)
# and sorting in descending order of the maximum deaths.
most_deaths_province_state_df = cleansed_data_df.groupBy('province/state', 'country/region').max('deaths')\
    .select('province/state', 'country/region', col("max(deaths)").alias("Most_Deaths"))\
    .orderBy('Most_Deaths', ascending=False)

# Grouping the records by the Province/State and Country/Region columns, aggregating with max(recovered)
# and sorting in descending order of the maximum recoveries.
most_recoveries_province_state_df = cleansed_data_df.groupBy('province/state', 'country/region').max('recovered')\
    .select('province/state', 'country/region', col("max(recovered)").alias("Most_Recovered"))\
    .orderBy('Most_Recovered', ascending=False)

# converting the Spark DataFrames back to Glue DynamicFrames
transform1 = DynamicFrame.fromDF(
    most_cases_province_state_df, glueContext, 'transform1')
transform2 = DynamicFrame.fromDF(
    most_deaths_province_state_df, glueContext, 'transform2')
transform3 = DynamicFrame.fromDF(
    most_recoveries_province_state_df, glueContext, 'transform3')

############################
#           LOAD           #
############################

# storing the aggregated data in Parquet format under the specified S3 path
datasink1 = glueContext.write_dynamic_frame.from_options(frame=transform1, connection_type="s3", connection_options={
    "path": s3_write_bucket + '/most-cases'}, format="parquet", transformation_ctx="datasink1")
datasink2 = glueContext.write_dynamic_frame.from_options(frame=transform2, connection_type="s3", connection_options={
    "path": s3_write_bucket + '/most-deaths'}, format="parquet", transformation_ctx="datasink2")
datasink3 = glueContext.write_dynamic_frame.from_options(frame=transform3, connection_type="s3", connection_options={
    "path": s3_write_bucket + '/most-recoveries'}, format="parquet", transformation_ctx="datasink3")

job.commit()
--------------------------------------------------------------------------------
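The group-by/max aggregation used by the job can be sanity-checked outside of Glue. The sketch below is a minimal, hypothetical local test using plain PySpark only: the sample rows are invented for illustration, and the column names simply mirror the ones the job expects in the catalog table.

```python
# local_agg_check.py - hypothetical local sanity check, not part of the Glue job
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("corona-agg-check").getOrCreate()

# invented sample rows using the column names the Glue job assumes
sample = spark.createDataFrame(
    [
        ("Hubei", "China", 200, 20, 80),
        ("Hubei", "China", 100, 10, 50),
        ("na_province_state", "Italy", 150, 15, 60),
    ],
    ["province/state", "country/region", "confirmed", "deaths", "recovered"],
)

# same pattern as the job: group, take the max, alias the result, sort descending
most_cases = (
    sample.groupBy("province/state", "country/region")
    .max("confirmed")
    .select("province/state", "country/region",
            col("max(confirmed)").alias("Most_Cases"))
    .orderBy("Most_Cases", ascending=False)
)

most_cases.show()
spark.stop()
```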