├── source ├── clicklogger │ ├── src │ │ ├── main │ │ │ ├── resources │ │ │ │ └── application.properties │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── clicklogs │ │ │ │ ├── model │ │ │ │ ├── ClickLoggerException.java │ │ │ │ ├── JobStatusRequest.java │ │ │ │ ├── ClickLogResponse.java │ │ │ │ ├── ResponseBuilder.java │ │ │ │ └── ClickLogRequest.java │ │ │ │ └── Handlers │ │ │ │ ├── ClickLoggerEMRJobHandler.java │ │ │ │ └── ClickLoggerHandler.java │ │ └── test │ │ │ └── java │ │ │ └── com │ │ │ └── clicklogs │ │ │ └── ClickLoggerHandlerTest.java │ ├── .gitignore │ ├── pom.xml │ ├── mvnw.cmd │ └── mvnw └── loggregator │ ├── project │ ├── build.properties │ └── plugins.sbt │ ├── src │ └── main │ │ ├── resources │ │ └── application.properties │ │ └── scala │ │ └── com │ │ └── examples │ │ └── clicklogger │ │ └── Loggregator.scala │ ├── README.md │ └── build.sbt ├── assets ├── EMRStudioApplications.png ├── step_function_success.png ├── EMRServerlessApplication.png ├── s3_output_response_file.png ├── s3_source_parquet_files.png ├── EMRServerlessApplication.png.license ├── EMRStudioApplications.png.license ├── s3_output_response_file.png.license ├── s3_source_parquet_files.png.license ├── step_function_success.png.license ├── emr-serverless-click-logs-from-web-application.drawio.license ├── emr-serverless-click-logs-from-web-application.drawio.png.license └── emr-serverless-click-logs-from-web-application.drawio.png ├── terraform ├── templates │ ├── providers.tf │ ├── environments.tf │ ├── configs.tf │ ├── policies.tf │ ├── cloudwatch.tf │ ├── emr.tf │ ├── firehose.tf │ ├── s3.tf │ ├── glue.tf │ ├── lambda.tf │ ├── stepfunction.tf │ ├── vpc.tf │ └── roles.tf └── workspaces │ └── us-east-1 │ ├── providers.tf │ ├── main.tf │ ├── output.tf │ └── variables.tf ├── CODE_OF_CONDUCT.md ├── .gitignore ├── cleanup.sh ├── HELP.md ├── LICENSE ├── LICENSES └── MIT-0.txt ├── exec.sh ├── CONTRIBUTING.md └── README.md /source/clicklogger/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/loggregator/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.8.2 -------------------------------------------------------------------------------- /source/loggregator/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/EMRStudioApplications.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-emr-serverless-using-terraform/HEAD/assets/EMRStudioApplications.png -------------------------------------------------------------------------------- /assets/step_function_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-emr-serverless-using-terraform/HEAD/assets/step_function_success.png -------------------------------------------------------------------------------- /assets/EMRServerlessApplication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-emr-serverless-using-terraform/HEAD/assets/EMRServerlessApplication.png 
-------------------------------------------------------------------------------- /assets/s3_output_response_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-emr-serverless-using-terraform/HEAD/assets/s3_output_response_file.png -------------------------------------------------------------------------------- /assets/s3_source_parquet_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-emr-serverless-using-terraform/HEAD/assets/s3_source_parquet_files.png -------------------------------------------------------------------------------- /assets/EMRServerlessApplication.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /assets/EMRStudioApplications.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /assets/s3_output_response_file.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /assets/s3_source_parquet_files.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /assets/step_function_success.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /assets/emr-serverless-click-logs-from-web-application.drawio.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /assets/emr-serverless-click-logs-from-web-application.drawio.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /assets/emr-serverless-click-logs-from-web-application.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-emr-serverless-using-terraform/HEAD/assets/emr-serverless-click-logs-from-web-application.drawio.png -------------------------------------------------------------------------------- /terraform/templates/providers.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | #provider "aws" { 6 | # region = "us-east-1" 7 | #} -------------------------------------------------------------------------------- /terraform/workspaces/us-east-1/providers.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | provider "aws" { 6 | region = "us-east-1" 7 | } -------------------------------------------------------------------------------- /terraform/templates/environments.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | data "aws_caller_identity" "current" { } 6 | data "aws_region" "current" {} -------------------------------------------------------------------------------- /source/loggregator/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("sbt-plugin-releases-scala-sbt", url("https://repo.scala-sbt.org/scalasbt/sbt-plugin-releases/")) 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") 4 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.3") -------------------------------------------------------------------------------- /source/clicklogger/src/main/java/com/clicklogs/model/ClickLoggerException.java: -------------------------------------------------------------------------------- 1 | package com.clicklogs.model; 2 | 3 | import java.lang.RuntimeException; 4 | 5 | public class ClickLoggerException extends RuntimeException { 6 | public ClickLoggerException(String errorMessage) { 7 | super(errorMessage); 8 | } 9 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /terraform/workspaces/us-east-1/main.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | module "clicklogger" { 6 | source = "../../templates" 7 | app_prefix = var.app_prefix 8 | stage_name = var.stage_name 9 | lambda_source_zip_path = var.lambda_source_zip_path 10 | emr_source_zip_path = var.emr_source_zip_path 11 | loggregator_jar = var.loggregator_jar 12 | } 13 | -------------------------------------------------------------------------------- /source/clicklogger/.gitignore: -------------------------------------------------------------------------------- 1 | HELP.md 2 | target/ 3 | !.mvn/wrapper/maven-wrapper.jar 4 | !**/src/main/** 5 | !**/src/test/** 6 | 7 | ### STS ### 8 | .apt_generated 9 | .classpath 10 | .factorypath 11 | .project 12 | .settings 13 | .springBeans 14 | .sts4-cache 15 | 16 | ### IntelliJ IDEA ### 17 | .idea 18 | *.iws 19 | *.iml 20 | *.ipr 21 | 22 | ### NetBeans ### 23 | /nbproject/private/ 24 | /nbbuild/ 25 | /dist/ 26 | /nbdist/ 27 | /.nb-gradle/ 28 | build/ 29 | 30 | ### VS Code ### 31 | .vscode/ 32 | -------------------------------------------------------------------------------- /source/clicklogger/src/main/java/com/clicklogs/model/JobStatusRequest.java: -------------------------------------------------------------------------------- 1 | package com.clicklogs.model; 2 | 3 | /* 4 | * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 5 | * 6 | * SPDX-License-Identifier: MIT-0 7 | */ 8 | 9 | public class JobStatusRequest { 10 | 11 | public String jobRunId; 12 | 13 | public String getJobRunId() { 14 | return jobRunId; 15 | } 16 | 17 | public void setJobRunId(String jobRunId) { 18 | this.jobRunId = jobRunId; 19 | } 20 | } -------------------------------------------------------------------------------- /terraform/workspaces/us-east-1/output.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | output "S3FirehoseDeliveryBucket" { 6 | value = module.clicklogger.S3FirehoseDeliveryBucket 7 | } 8 | 9 | output "S3EMRSourceBucket" { 10 | value = module.clicklogger.S3EMRSourceBucket 11 | } 12 | 13 | output "lambda-clicklogger-ingest" { 14 | value = module.clicklogger.lambda-clicklogger-ingest 15 | } 16 | 17 | output "lambda-clicklogger-emr-job" { 18 | value = module.clicklogger.lambda-clicklogger-emr-job 19 | } -------------------------------------------------------------------------------- /terraform/templates/configs.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | variable "app_prefix" { 6 | description = "Application prefix for the AWS services that are built" 7 | default = "clicklogger" 8 | } 9 | 10 | variable "stage_name" { 11 | default = "dev" 12 | } 13 | 14 | variable "lambda_source_zip_path" { 15 | description = "Java lambda zip" 16 | } 17 | 18 | variable "emr_source_zip_path" { 19 | description = "EMR lambda zip" 20 | } 21 | 22 | variable "loggregator_jar" { 23 | default = "loggregator-0-0.1.jar" 24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /source/loggregator/README.md: -------------------------------------------------------------------------------- 1 | $ sbt reload 2 | 3 | $ sbt compile 4 | 5 | $ sbt package 6 | 7 | $ java -jar target/scala-2.13/loggregator-assembly-0.1.jar com.examples.clicklogger "2020-06-15" "clicklogger-dev-firehose-delivery-bucket-" "clicklogger-dev-loggregator-output-bucket-" 8 | 9 | $ emr console 10 | 11 | - command-runner.jar 12 | - spark-submit --deploy-mode client --class com.examples.clicklogger.Loggregator s3://clicklogger-emr-source/loggregator-assembly-0.1.jar 2022-07-18 clicklogger-dev-firehose-delivery-bucket- clicklogger-dev-loggregator-output-bucket- 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | .mvn 4 | .terraform 5 | .vscode 6 | terraform.tfstate.backup 7 | .terraform.lock.hcl 8 | target 9 | out 10 | source/clicklogger/src/main/.DS_Store 11 | source/clicklogger/src/main/java/.DS_Store 12 | source/clicklogger/src/main/java/com/.DS_Store 13 | source/clicklogger/src/main/java/com/clicklogs/.DS_Store 14 | terraform/workspaces/us-east-1/terraform.tfstate 15 | terraform/workspaces/us-east-1/.terraform/* 16 | terraform/workspaces/us-east-1/.terraform* 17 | source/clicklogger/src/test/.DS_Store 18 | source/clicklogger/src/test/java/.DS_Store 19 | source/clicklogger/src/test/java/com/.DS_Store 20 | assets/.$emr-serverless-click-logs-from-web-application.drawio.bkp 21 | assets/.$emr-serverless-click-logs-from-web-application.drawio.dtmp 22 | -------------------------------------------------------------------------------- /terraform/workspaces/us-east-1/variables.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | variable "app_prefix" { 6 | description = "Application prefix for the AWS services that are built" 7 | default = "clicklogger" 8 | } 9 | 10 | variable "stage_name" { 11 | default = "dev" 12 | } 13 | 14 | variable "lambda_source_zip_path" { 15 | description = "Java lambda zip" 16 | default = "..//..//..//source//clicklogger//target//clicklogger-1.0-SNAPSHOT.jar" 17 | } 18 | 19 | variable "emr_source_zip_path" { 20 | description = "EMR lambda zip" 21 | default = "..//..//..//source//loggregator//target//scala-2.12//loggregator_2.12-0.1.jar" 22 | } 23 | 24 | variable "loggregator_jar" { 25 | default = "loggregator-0-0.1.jar" 26 | } 27 | 28 | 29 | -------------------------------------------------------------------------------- /cleanup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | echo 'Cleaning up Deployed Infrastructure..' 
4 | echo $PWD 5 | APP_DIR=$PWD 6 | APP_PREFIX=clicklogger 7 | STAGE_NAME=dev 8 | REGION=us-east-1 9 | 10 | ACCOUNT_ID=$(aws sts get-caller-identity | jq -r '.Account') 11 | echo $ACCOUNT_ID 12 | 13 | aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-emr-logs-$ACCOUNT_ID --force 14 | aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-firehose-delivery-$ACCOUNT_ID --force 15 | aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-loggregator-output-$ACCOUNT_ID --force 16 | aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-loggregator-source-$ACCOUNT_ID --force 17 | aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-emr-studio-$ACCOUNT_ID --force 18 | echo 'Deleted S3 contents' 19 | 20 | echo 'Terraform Destroy Resources' 21 | cd $APP_DIR/terraform/workspaces/$REGION 22 | terraform destroy --auto-approve 23 | 24 | cd $APP_DIR 25 | 26 | echo 'Completed Successfully!' 27 | -------------------------------------------------------------------------------- /HELP.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | ### Reference Documentation 4 | For further reference, please consider the following sections: 5 | 6 | * [Official Apache Maven documentation](https://maven.apache.org/guides/index.html) 7 | * [Spring Boot Maven Plugin Reference Guide](https://docs.spring.io/spring-boot/docs/2.4.2/maven-plugin/reference/html/) 8 | * [Create an OCI image](https://docs.spring.io/spring-boot/docs/2.4.2/maven-plugin/reference/html/#build-image) 9 | * [Spring Web](https://docs.spring.io/spring-boot/docs/2.4.2/reference/htmlsingle/#boot-features-developing-web-applications) 10 | 11 | ### Guides 12 | The following guides illustrate how to use some features concretely: 13 | 14 | * [Building a RESTful Web Service](https://spring.io/guides/gs/rest-service/) 15 | * [Serving Web Content with Spring MVC](https://spring.io/guides/gs/serving-web-content/) 16 | * [Building REST services with Spring](https://spring.io/guides/tutorials/bookmarks/) 17 | 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /LICENSES/MIT-0.txt: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /terraform/templates/policies.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | resource "aws_iam_policy" "click_loggerlambda_logging_policy" { 6 | name = "${var.app_prefix}-${var.stage_name}-lambda-logging-policy" 7 | path = "/" 8 | description = "IAM policy for logging from a lambda" 9 | 10 | policy = < headers; 15 | 16 | private String body; 17 | 18 | public ClickLogResponse(int statusCode, Map headers, String body) { 19 | this.statusCode = statusCode; 20 | this.headers = headers; 21 | this.body = body; 22 | } 23 | 24 | public int getStatusCode() { 25 | return statusCode; 26 | } 27 | 28 | public Map getHeaders() { 29 | return headers; 30 | } 31 | 32 | public String getBody() { 33 | return body; 34 | } 35 | 36 | public void setStatusCode(int statusCode) { 37 | this.statusCode = statusCode; 38 | } 39 | 40 | public void setHeaders(Map headers) { 41 | this.headers = headers; 42 | } 43 | 44 | public void setBody(String body) { 45 | this.body = body; 46 | } 47 | } -------------------------------------------------------------------------------- /source/loggregator/build.sbt: -------------------------------------------------------------------------------- 1 | name := "loggregator" 2 | 3 | version := "0.1" 4 | 5 | 6 | scalaVersion := "2.12.17" 7 | 8 | lazy val root = (project in file(".")). 
9 | settings( 10 | name := "loggregator", 11 | version := "0.1", 12 | maintainer := "shiva.ramani@live.com", 13 | mainClass in Compile := Some("com.examples.clicklogger.Loggregator") 14 | ) 15 | 16 | val sparkVersion = "3.2.0" 17 | val hadoopVersion = "3.2.0" 18 | 19 | libraryDependencies ++= Seq( 20 | "org.apache.spark" %% "spark-core" % sparkVersion, 21 | "org.apache.spark" %% "spark-sql" % sparkVersion, 22 | "org.apache.hadoop" % "hadoop-client" % hadoopVersion, 23 | "org.apache.hadoop" % "hadoop-aws" % hadoopVersion, 24 | "org.apache.commons" % "commons-lang3" % "3.10", 25 | "com.amazonaws" % "aws-java-sdk-s3" % "1.12.262", 26 | "io.netty" % "netty-buffer" % "4.1.17.Final" 27 | 28 | ) 29 | 30 | val meta = """META.INF(.)*""".r 31 | assemblyMergeStrategy in assembly := { 32 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first 33 | case PathList(ps @ _*) if ps.last endsWith ".html" => MergeStrategy.first 34 | case n if n.contains("services") => MergeStrategy.concat 35 | case n if n.startsWith("reference.conf") => MergeStrategy.concat 36 | case n if n.endsWith(".conf") => MergeStrategy.concat 37 | case meta(_) => MergeStrategy.discard 38 | case x => MergeStrategy.first 39 | /*case PathList("META-INF", xs @ _*) => MergeStrategy.discard 40 | case x => MergeStrategy.first*/ 41 | } 42 | 43 | enablePlugins(JavaAppPackaging) -------------------------------------------------------------------------------- /terraform/templates/emr.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | resource "aws_emr_studio" "clicklog_dev_studio" { 6 | auth_mode = "IAM" 7 | default_s3_location = "s3://${aws_s3_bucket.click_logger_emr_studio_bucket.bucket}/clicklogger" 8 | engine_security_group_id = aws_security_group.click_logger_emr_security_group.id 9 | name = "${var.app_prefix}-${var.stage_name}-studio" 10 | service_role = aws_iam_role.emr_studio_role.arn 11 | subnet_ids = [aws_subnet.click_logger_emr_public_subnet1.id] 12 | vpc_id = aws_vpc.click_logger_emr_vpc.id 13 | workspace_security_group_id = aws_security_group.click_logger_emr_security_group.id 14 | } 15 | 16 | 17 | resource "aws_emrserverless_application" "click_log_loggregator_emr_serverless" { 18 | name = "${var.app_prefix}-${var.stage_name}-loggregrator-emr-${data.aws_caller_identity.current.account_id}" 19 | release_label = "emr-6.6.0" 20 | type = "spark" 21 | 22 | initial_capacity { 23 | initial_capacity_type = "Driver" 24 | 25 | initial_capacity_config { 26 | worker_count = 5 27 | worker_configuration { 28 | cpu = "4 vCPU" 29 | memory = "20 GB" 30 | } 31 | } 32 | } 33 | 34 | initial_capacity { 35 | initial_capacity_type = "Executor" 36 | 37 | initial_capacity_config { 38 | worker_count = 5 39 | worker_configuration { 40 | cpu = "4 vCPU" 41 | memory = "20 GB" 42 | } 43 | } 44 | } 45 | 46 | maximum_capacity { 47 | cpu = "150 vCPU" 48 | memory = "1000 GB" 49 | } 50 | 51 | tags = { 52 | Name = "EMR Serverless for ClickLogs Aggregation" 53 | Environment = var.stage_name 54 | } 55 | } -------------------------------------------------------------------------------- /source/clicklogger/src/main/java/com/clicklogs/model/ResponseBuilder.java: -------------------------------------------------------------------------------- 1 | package com.clicklogs.model; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | /* 7 | * Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 8 | * 9 | * SPDX-License-Identifier: MIT-0 10 | */ 11 | 12 | public class ResponseBuilder { 13 | private static final String ACCESS_CONTROL_ALLOW_HEADERS = "Access-Control-Allow-Headers"; 14 | 15 | private static final String ACCESS_CONTROL_ALLOW_ORIGIN = "Access-Control-Allow-Origin"; 16 | 17 | private int statusCode; 18 | 19 | private Map headers = new HashMap<>(); 20 | 21 | private String body; 22 | 23 | public ResponseBuilder headers(Map headers) { 24 | this.headers = headers; 25 | return this; 26 | } 27 | 28 | public ResponseBuilder body(String body) { 29 | this.body = body; 30 | return this; 31 | } 32 | 33 | public ResponseBuilder ok() { 34 | this.statusCode = 200; 35 | return this; 36 | } 37 | 38 | public ResponseBuilder badRequest(String body) { 39 | this.body = buildErrorMsg(body); 40 | this.statusCode = 400; 41 | return this; 42 | } 43 | 44 | private String buildErrorMsg(String body) { 45 | return "{\"message\": \"" + body + "\"}"; 46 | } 47 | 48 | public ResponseBuilder originHeader(String domain) { 49 | headers.put(ACCESS_CONTROL_ALLOW_ORIGIN, domain); 50 | return this; 51 | } 52 | 53 | private void initDefaultHeaders() { 54 | headers.put(ACCESS_CONTROL_ALLOW_HEADERS, "Origin, Access-Control-Allow-Headers, X-Requested-With, Content-Type, Accept"); 55 | } 56 | 57 | public ClickLogResponse build() { 58 | this.initDefaultHeaders(); 59 | return new ClickLogResponse(statusCode, headers, body); 60 | } 61 | } -------------------------------------------------------------------------------- /terraform/templates/firehose.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | resource "aws_kinesis_firehose_delivery_stream" "click_logger_firehose_delivery_stream" { 6 | name = "${var.app_prefix}-${var.stage_name}-firehose-delivery-stream" 7 | depends_on = [aws_s3_bucket.click_logger_firehose_delivery_s3_bucket] 8 | 9 | destination = "extended_s3" 10 | 11 | extended_s3_configuration { 12 | role_arn = aws_iam_role.click_logger_stream_consumer_firehose_role.arn 13 | bucket_arn = aws_s3_bucket.click_logger_firehose_delivery_s3_bucket.arn 14 | buffering_size = 64 15 | buffering_interval = 60 16 | cloudwatch_logging_options { 17 | enabled = true 18 | log_group_name = aws_cloudwatch_log_group.click_logger_firehose_delivery_stream_log_group.name 19 | log_stream_name = aws_cloudwatch_log_stream.click_logger_firehose_delivery_stream.name 20 | } 21 | compression_format = "UNCOMPRESSED" 22 | prefix = "clicklog/data=!{timestamp:yyyy}-!{timestamp:MM}-!{timestamp:dd}/" 23 | error_output_prefix = "clicklog_error/error=!{firehose:error-output-type}data=!{timestamp:yyyy}-!{timestamp:MM}-!{timestamp:dd}/" 24 | 25 | 26 | data_format_conversion_configuration { 27 | enabled = true 28 | 29 | input_format_configuration { 30 | deserializer { 31 | open_x_json_ser_de { 32 | case_insensitive = true 33 | } 34 | } 35 | } 36 | 37 | output_format_configuration { 38 | serializer { 39 | parquet_ser_de { 40 | compression = "SNAPPY" 41 | } 42 | } 43 | } 44 | 45 | schema_configuration { 46 | database_name = aws_glue_catalog_database.aws_glue_click_logger_database.name 47 | role_arn = aws_iam_role.click_logger_stream_consumer_firehose_role.arn 48 | table_name = aws_glue_catalog_table.aws_glue_click_logger_catalog_table.name 49 | region = data.aws_region.current.id 50 | } 51 | } 52 | 53 | } 54 | } 
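A producer can also put click-log records onto this delivery stream directly; the following minimal Java sketch (illustrative only, not part of this repository) assumes the default clicklogger/dev naming from configs.tf, and its JSON keys match the columns of the Glue table in glue.tf so the Parquet conversion above can map every field:

import java.nio.ByteBuffer;
import com.amazonaws.services.kinesisfirehose.AmazonKinesisFirehose;
import com.amazonaws.services.kinesisfirehose.AmazonKinesisFirehoseClientBuilder;
import com.amazonaws.services.kinesisfirehose.model.PutRecordRequest;
import com.amazonaws.services.kinesisfirehose.model.Record;

public class FirehosePutExample {
    public static void main(String[] args) {
        // Stream name follows "${var.app_prefix}-${var.stage_name}-firehose-delivery-stream" (default: clicklogger-dev)
        String streamName = "clicklogger-dev-firehose-delivery-stream";

        // Keys mirror the Glue table columns; values are sample data in the style of exec.sh
        String clickLog = "{\"requestid\":\"OAP-guid-001\",\"contextid\":\"OAP-ctxt-001\","
                + "\"callerid\":\"OrderingApplication\",\"component\":\"login\",\"action\":\"load\","
                + "\"type\":\"webpage\",\"clientip\":\"192.0.2.10\",\"createdtime\":\"2022-07-18 10:00:00\"}";

        AmazonKinesisFirehose firehose = AmazonKinesisFirehoseClientBuilder.standard()
                .withRegion("us-east-1")
                .build();

        PutRecordRequest request = new PutRecordRequest()
                .withDeliveryStreamName(streamName)
                .withRecord(new Record().withData(ByteBuffer.wrap(clickLog.getBytes())));

        // Firehose buffers (64 MB / 60 s as configured above) before the Parquet output lands in the delivery bucket
        System.out.println("Record id: " + firehose.putRecord(request).getRecordId());
    }
}

In the deployed flow the ingestion Lambda defined in lambda.tf performs this putRecord call after validating the incoming request, so client applications invoke the Lambda rather than writing to Firehose directly.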
-------------------------------------------------------------------------------- /source/clicklogger/src/main/java/com/clicklogs/model/ClickLogRequest.java: -------------------------------------------------------------------------------- 1 | package com.clicklogs.model; 2 | 3 | /* 4 | * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 5 | * 6 | * SPDX-License-Identifier: MIT-0 7 | */ 8 | 9 | public class ClickLogRequest { 10 | 11 | public String requestid; 12 | public String contextid; 13 | public String callerid; 14 | public String type; 15 | public String component; 16 | public String action; 17 | public String user; 18 | public String clientip; 19 | public String createdtime; 20 | 21 | public String getRequestid() { 22 | return requestid; 23 | } 24 | 25 | public String getContextid() { 26 | return contextid; 27 | } 28 | 29 | public String getCallerid() { 30 | return callerid; 31 | } 32 | 33 | public String getType() { 34 | return type; 35 | } 36 | 37 | public String getComponent() { 38 | return component; 39 | } 40 | 41 | public String getAction() { 42 | return action; 43 | } 44 | 45 | public String getCreatedtime() { 46 | return createdtime; 47 | } 48 | 49 | public String getUser() { 50 | return user; 51 | } 52 | 53 | public String getClientip() { 54 | return clientip; 55 | } 56 | 57 | public void setRequestid(String requestid) { 58 | this.requestid = requestid; 59 | } 60 | 61 | public void setContextid(String contextid) { 62 | this.contextid = contextid; 63 | } 64 | 65 | public void setCallerid(String callerid) { 66 | this.callerid = callerid; 67 | } 68 | 69 | public void setType(String type) { 70 | this.type = type; 71 | } 72 | 73 | 74 | public void setComponent(String component) { 75 | this.component = component; 76 | } 77 | 78 | public void setAction(String action) { 79 | this.action = action; 80 | } 81 | 82 | public void setCreatedtime(String createdtime) { 83 | this.createdtime = createdtime; 84 | } 85 | 86 | public void setUser(String user) { 87 | this.user = user; 88 | } 89 | 90 | public void setClientip(String clientip) { 91 | this.clientip = clientip; 92 | } 93 | 94 | } -------------------------------------------------------------------------------- /terraform/templates/s3.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | resource "aws_s3_bucket" "click_log_loggregator_source_s3_bucket" { 6 | bucket = "${data.aws_region.current.id}-${var.app_prefix}-${var.stage_name}-loggregator-source-${data.aws_caller_identity.current.account_id}" 7 | 8 | tags = { 9 | Name = "Loggregator Source S3 Delivery bucket" 10 | Environment = var.stage_name 11 | } 12 | } 13 | 14 | 15 | resource "aws_s3_object" "click_log_loggregator_source_s3_bucket_object" { 16 | bucket = aws_s3_bucket.click_log_loggregator_source_s3_bucket.bucket 17 | 18 | key = var.loggregator_jar 19 | source = var.emr_source_zip_path 20 | etag = filemd5(var.emr_source_zip_path) 21 | } 22 | 23 | 24 | resource "aws_s3_bucket" "click_log_loggregator_output_s3_bucket" { 25 | bucket = "${data.aws_region.current.id}-${var.app_prefix}-${var.stage_name}-loggregator-output-${data.aws_caller_identity.current.account_id}" 26 | 27 | tags = { 28 | Name = "Loggregator Output S3 Delivery bucket" 29 | Environment = var.stage_name 30 | } 31 | } 32 | 33 | resource "aws_s3_bucket" "click_log_loggregator_emr_serverless_logs_s3_bucket" { 34 | bucket = "${data.aws_region.current.id}-${var.app_prefix}-${var.stage_name}-emr-logs-${data.aws_caller_identity.current.account_id}" 35 | 36 | tags = { 37 | Name = "Loggregator EMR Logs S3 Delivery bucket" 38 | Environment = var.stage_name 39 | } 40 | } 41 | 42 | 43 | resource "aws_s3_bucket" "click_logger_firehose_delivery_s3_bucket" { 44 | bucket = "${data.aws_region.current.id}-${var.app_prefix}-${var.stage_name}-firehose-delivery-${data.aws_caller_identity.current.account_id}" 45 | 46 | tags = { 47 | Name = "Firehose S3 Delivery bucket" 48 | Environment = var.stage_name 49 | } 50 | } 51 | 52 | resource "aws_s3_bucket" "click_logger_emr_studio_bucket" { 53 | bucket = "${data.aws_region.current.id}-${var.app_prefix}-${var.stage_name}-emr-studio-${data.aws_caller_identity.current.account_id}" 54 | 55 | tags = { 56 | Name = "EMR studio bucket" 57 | Environment = var.stage_name 58 | } 59 | } 60 | 61 | output "S3FirehoseDeliveryBucket" { 62 | value = aws_s3_bucket.click_logger_firehose_delivery_s3_bucket 63 | } 64 | 65 | output "S3EMRSourceBucket" { 66 | value = aws_s3_bucket.click_log_loggregator_source_s3_bucket 67 | } 68 | -------------------------------------------------------------------------------- /terraform/templates/glue.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | resource "aws_glue_catalog_database" "aws_glue_click_logger_database" { 6 | name = "${var.app_prefix}${var.stage_name}database" 7 | description = "Click logger Glue database" 8 | } 9 | 10 | resource "aws_glue_catalog_table" "aws_glue_click_logger_catalog_table" { 11 | name = "${var.app_prefix}${var.stage_name}-table" 12 | database_name = "${var.app_prefix}${var.stage_name}database" 13 | depends_on = [aws_glue_catalog_database.aws_glue_click_logger_database, aws_s3_bucket.click_logger_firehose_delivery_s3_bucket] 14 | 15 | table_type = "EXTERNAL_TABLE" 16 | 17 | parameters = { 18 | EXTERNAL = "TRUE" 19 | "parquet.compression" = "SNAPPY" 20 | } 21 | 22 | retention = 0 23 | 24 | storage_descriptor { 25 | location = aws_s3_bucket.click_logger_firehose_delivery_s3_bucket.arn 26 | input_format = "org.apache.hadoop.mapred.TextInputFormat" 27 | output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat" 28 | compressed = false 29 | parameters = { 30 | "crawler_schema_serializer_version" = "1.0" 31 | "crawler_schema_deserializer_version" = "1.0" 32 | "compression_type" = "none" 33 | "classification" = "json" 34 | "type_of_data" = "file" 35 | } 36 | ser_de_info { 37 | name = "${var.app_prefix}table" 38 | serialization_library = "org.openx.data.jsonserde.JsonSerDe" 39 | 40 | parameters = { 41 | "serialization.format" = 1 42 | } 43 | } 44 | 45 | columns { 46 | name = "requestid" 47 | type = "string" 48 | } 49 | 50 | columns { 51 | name = "contextid" 52 | type = "string" 53 | } 54 | 55 | columns { 56 | name = "callerid" 57 | type = "string" 58 | comment = "" 59 | } 60 | 61 | columns { 62 | name = "component" 63 | type = "string" 64 | comment = "" 65 | } 66 | 67 | columns { 68 | name = "action" 69 | type = "string" 70 | comment = "" 71 | } 72 | 73 | columns { 74 | name = "type" 75 | type = "string" 76 | comment = "" 77 | } 78 | 79 | columns { 80 | name = "clientip" 81 | type = "string" 82 | comment = "" 83 | } 84 | 85 | columns { 86 | name = "createdtime" 87 | type = "string" 88 | comment = "" 89 | } 90 | } 91 | } -------------------------------------------------------------------------------- /terraform/templates/lambda.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | resource "aws_lambda_function" "lambda_clicklogger_ingest" { 6 | description = "Lambda to ingest data." 
7 | filename = var.lambda_source_zip_path 8 | function_name = "${var.app_prefix}-${var.stage_name}-ingestion-lambda" 9 | role = aws_iam_role.click_logger_lambda_role.arn 10 | handler = "com.clicklogs.Handlers.ClickLoggerHandler::handleRequest" 11 | runtime = "java11" 12 | memory_size = 2048 13 | timeout = 300 14 | 15 | source_code_hash = filebase64sha256(var.lambda_source_zip_path) 16 | depends_on = [ 17 | aws_iam_role.click_logger_lambda_role, aws_kinesis_firehose_delivery_stream.click_logger_firehose_delivery_stream 18 | ] 19 | 20 | environment { 21 | variables = { 22 | STREAM_NAME = aws_kinesis_firehose_delivery_stream.click_logger_firehose_delivery_stream.name 23 | REGION = data.aws_region.current.id 24 | } 25 | } 26 | 27 | vpc_config { 28 | subnet_ids = [aws_subnet.click_logger_emr_private_subnet1.id] 29 | security_group_ids = [aws_security_group.click_logger_emr_security_group.id] 30 | } 31 | } 32 | 33 | resource "aws_lambda_function" "lambda_clicklogger_emr_job_status" { 34 | description = "Lambda to check status of job on EMR Serverless cluster." 35 | filename = var.lambda_source_zip_path 36 | function_name = "${var.app_prefix}-${var.stage_name}-emr-job-status-lambda" 37 | role = aws_iam_role.click_logger_emr_lambda_role.arn 38 | handler = "com.clicklogs.Handlers.ClickLoggerEMRJobHandler::handleRequest" 39 | runtime = "java11" 40 | memory_size = 2048 41 | timeout = 600 42 | 43 | source_code_hash = filebase64sha256(var.lambda_source_zip_path) 44 | depends_on = [aws_iam_role.click_logger_emr_lambda_role] 45 | 46 | environment { 47 | variables = { 48 | APPLICATION_ID = aws_emrserverless_application.click_log_loggregator_emr_serverless.id 49 | LOGS_OUTPUT_PATH = "s3://${aws_s3_bucket.click_log_loggregator_emr_serverless_logs_s3_bucket.id}" 50 | REGION = data.aws_region.current.id 51 | EMR_GET_SLEEP_TIME = 5000 52 | } 53 | } 54 | 55 | vpc_config { 56 | subnet_ids = [aws_subnet.click_logger_emr_private_subnet1.id] 57 | security_group_ids = [aws_security_group.click_logger_emr_security_group.id] 58 | } 59 | } 60 | 61 | output "lambda-clicklogger-ingest" { 62 | value = aws_lambda_function.lambda_clicklogger_ingest 63 | } 64 | 65 | output "lambda-clicklogger-emr-job" { 66 | value = aws_lambda_function.lambda_clicklogger_emr_job_status 67 | } -------------------------------------------------------------------------------- /exec.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | echo $PWD 4 | APP_DIR=$PWD 5 | APP_PREFIX=clicklogger 6 | STAGE_NAME=dev 7 | REGION=us-east-1 8 | ACCOUNT_ID=$(aws sts get-caller-identity | jq -r '.Account') 9 | 10 | echo 'Building Source Lambda Jar' 11 | cd $APP_DIR/source/clicklogger 12 | mvn clean package 13 | echo 'Building Source EMR Jar' 14 | cd $APP_DIR/source/loggregator 15 | # Build Scala JAR with Java 11 (Minimum) 16 | sbt clean 17 | sbt compile 18 | sbt package 19 | 20 | echo 'Deploying Terraform Resources' 21 | cd $APP_DIR/terraform/workspaces/$REGION 22 | 23 | terraform init 24 | terraform plan 25 | terraform apply --auto-approve 26 | # shellcheck disable=SC2103 27 | 28 | cd $APP_DIR 29 | echo 'Deployed Successfully!' 
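# Optional, illustrative: the bucket and Lambda names exposed in output.tf can be inspected at this point, e.g.
#   terraform -chdir=$APP_DIR/terraform/workspaces/$REGION output S3FirehoseDeliveryBucket
#   terraform -chdir=$APP_DIR/terraform/workspaces/$REGION output lambda-clicklogger-ingest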
30 | 31 | echo 'Inserting Sample Data' 32 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-001","contextid":"OAP-ctxt-001","callerid":"OrderingApplication","component":"login","action":"load","type":"webpage"}' out 33 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-002","contextid":"OAP-ctxt-002","callerid":"OrderingApplication","component":"login","action":"load","type":"webpage"}' out 34 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-003","contextid":"OAP-ctxt-003","callerid":"OrderingApplication","component":"products","action":"show","type":"webpage"}' out 35 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-004","contextid":"OAP-ctxt-004","callerid":"OrderingApplication","component":"products","action":"show","type":"webpage"}' out 36 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-005","contextid":"OAP-ctxt-005","callerid":"OrderingApplication","component":"checkout","action":"show","type":"webpage"}' out 37 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-006","contextid":"OAP-ctxt-006","callerid":"OrderingApplication","component":"checkout","action":"show","type":"webpage"}' out 38 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-007","contextid":"OAP-ctxt-007","callerid":"OrderingApplication","component":"submitorder","action":"backend","type":"process"}' out 39 | aws lambda invoke --function-name $APP_PREFIX-$STAGE_NAME-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-008","contextid":"OAP-ctxt-008","callerid":"OrderingApplication","component":"submitorder","action":"backend","type":"process"}' out 40 | 41 | 42 | echo 'All process completed successfully!!' 43 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | -------------------------------------------------------------------------------- /source/clicklogger/src/test/java/com/clicklogs/ClickLoggerHandlerTest.java: -------------------------------------------------------------------------------- 1 | package com.clicklogs; 2 | 3 | import com.amazonaws.services.kinesisfirehose.AmazonKinesisFirehose; 4 | import com.amazonaws.services.kinesisfirehose.model.PutRecordRequest; 5 | import com.amazonaws.services.kinesisfirehose.model.PutRecordResult; 6 | import com.amazonaws.services.kinesisfirehose.model.Record; 7 | import com.amazonaws.services.lambda.runtime.Context; 8 | import com.clicklogs.Handlers.ClickLoggerHandler; 9 | import com.clicklogs.model.ClickLogRequest; 10 | 11 | import org.junit.Assert; 12 | import java.nio.ByteBuffer; 13 | import com.google.gson.Gson; 14 | import org.junit.Before; 15 | import org.junit.Test; 16 | import org.junit.runner.RunWith; 17 | import org.mockito.Mockito; 18 | import org.mockito.junit.MockitoJUnitRunner; 19 | 20 | import static org.mockito.ArgumentMatchers.any; 21 | import static org.mockito.Mockito.when; 22 | 23 | /* 24 | * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 25 | * 26 | * SPDX-License-Identifier: MIT-0 27 | */ 28 | 29 | @RunWith(MockitoJUnitRunner.class) 30 | public class ClickLoggerHandlerTest { 31 | 32 | private static String deliveryStreamName = "delivery-stream"; 33 | Gson gson = new Gson(); 34 | ClickLogRequest clickLogRequest = new ClickLogRequest(); 35 | PutRecordRequest putRecordRequest = new PutRecordRequest(); 36 | Record record = new Record(); 37 | PutRecordResult putRecordResult = new PutRecordResult(); 38 | 39 | Context context = Mockito.mock(Context.class); 40 | 41 | protected AmazonKinesisFirehose amazonKinesisFirehoseClient = Mockito.mock(AmazonKinesisFirehose.class); 42 | ClickLoggerHandler clickLoggerHandler = Mockito.mock(ClickLoggerHandler.class); 43 | 44 | @Before 45 | public void setup() { 46 | clickLogRequest = new ClickLogRequest(); 47 | clickLogRequest.setAction("ACTION"); 48 | clickLogRequest.setCallerid("CALLERID"); 49 | clickLogRequest.setClientip("CLIENTIP"); 50 | clickLogRequest.setComponent("COMPONENT"); 51 | clickLogRequest.setContextid("CONTEXTID"); 52 | clickLogRequest.setCreatedtime("CREATEDTIME"); 53 | clickLogRequest.setRequestid("REQUESTID"); 54 | clickLogRequest.setType("TYPE"); 55 | clickLogRequest.setUser("USER"); 56 | 57 | putRecordResult = new PutRecordResult(); 58 | putRecordResult.setRecordId("SUCCESS_RECORD_ID"); 59 | 60 | // Stub the mocked Firehose client: any putRecord call returns the canned result above 61 | when(amazonKinesisFirehoseClient.putRecord(any(PutRecordRequest.class))).thenReturn(putRecordResult); 62 | } 63 | 64 | @Test 65 | public void invokeTest() { 66 | // The handler field is a Mockito mock, so this call is a no-op; the test exercises the Firehose record flow 67 | clickLoggerHandler.handleRequest(clickLogRequest, context); 68 | 69 | putRecordRequest.setDeliveryStreamName(deliveryStreamName); 70 | 71 | String messageJson = gson.toJson(clickLogRequest); 72 | System.out.println("gson - " + messageJson); 73 | record = new Record().withData(ByteBuffer.wrap(messageJson.getBytes())); 74 | putRecordRequest.setRecord(record); 75 | 76 | // Use the client stubbed in setup(); re-mocking it here would discard the stubbing 77 | PutRecordResult result = amazonKinesisFirehoseClient.putRecord(putRecordRequest); 78 | Assert.assertEquals("SUCCESS_RECORD_ID", result.getRecordId()); 79 | } 80 | 81 | }
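This test runs as part of mvn clean package (the step exec.sh uses to build the Lambda jar), via the Surefire plugin declared in pom.xml; it can also be run on its own with mvn test from the source/clicklogger directory.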
-------------------------------------------------------------------------------- /terraform/templates/stepfunction.tf: -------------------------------------------------------------------------------- 1 | ## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved 2 | ## 3 | ### SPDX-License-Identifier: MIT-0 4 | 5 | resource "aws_sfn_state_machine" "sfn_state_machine" { 6 | name = "${var.app_prefix}-${var.stage_name}-state-machine" 7 | role_arn = aws_iam_role.lambda_clicklogger_emr_sfn_start_job_role.arn 8 | 9 | definition = < { 27 | 28 | Gson gson = new GsonBuilder().setPrettyPrinting().create(); 29 | LambdaLogger logger = null; 30 | private String applicationId = ""; 31 | private String logsOutputPath = ""; 32 | private Integer emrJobTimout = 5000; 33 | 34 | @Override 35 | public ClickLogResponse handleRequest(final JobStatusRequest jobStatusRequest, final Context context) { 36 | logger = context.getLogger(); 37 | final String success_response = new String("200 OK"); 38 | final String fail_response = new String("400 ERROR"); 39 | 40 | ResponseBuilder responseBuilder = new ResponseBuilder(); 41 | ClickLogResponse response = responseBuilder.badRequest(fail_response).build(); 42 | 43 | logger.log("Incoming request - " + gson.toJson(jobStatusRequest)); 44 | 45 | String envAppId = System.getenv("APPLICATION_ID"); 46 | if (!StringUtils.isBlank(envAppId)) { 47 | applicationId = envAppId; 48 | } 49 | 50 | String envLogsOutputBucket = System.getenv("LOGS_OUTPUT_PATH"); 51 | if (!StringUtils.isBlank(envLogsOutputBucket)) { 52 | logsOutputPath = envLogsOutputBucket; 53 | } 54 | 55 | emrJobTimout = Integer.parseInt(System.getenv("EMR_GET_SLEEP_TIME")); 56 | 57 | String jobRunId = ""; 58 | if (!StringUtils.isBlank(jobStatusRequest.getJobRunId())) { 59 | jobRunId = jobStatusRequest.getJobRunId(); 60 | } 61 | 62 | logger.log("Checking EMR Serverless job status for Job Run ID: " + jobRunId); 63 | 64 | try { 65 | CheckEMRJobStatus(applicationId, emrJobTimout, jobRunId); 66 | } catch (InterruptedException e) { 67 | logger.log("Error occurred checking EMR Job status."); 68 | throw new ClickLoggerException("Error occurred checking EMR Job status."); 69 | } 70 | logger.log("Stopping application"); 71 | stopApplication(applicationId); 72 | logger.log(success_response); 73 | responseBuilder = new ResponseBuilder(); 74 | responseBuilder.ok(); 75 | response = responseBuilder.originHeader("*").build(); 76 | logger.log("Returning response " + gson.toJson(response)); 77 | return response; 78 | } 79 | 80 | private void CheckEMRJobStatus(String applicationId, Integer emrJobTimout, String jobRunId) throws InterruptedException { 81 | EmrServerlessClient client = getClient(); 82 | GetJobRunRequest getJobRunRequest = GetJobRunRequest.builder() 83 | .applicationId(applicationId) 84 | .jobRunId(jobRunId) 85 | .build(); 86 | GetJobRunResponse jobRunResponse = client.getJobRun(getJobRunRequest); 87 | 88 | while (true) { 89 | Thread.sleep(emrJobTimout); 90 | jobRunResponse = client.getJobRun(getJobRunRequest); 91 | 92 | if (jobRunResponse != null) { 93 | JobRunState jobState = jobRunResponse.jobRun().state(); 94 | if (jobState.name().equals("SUCCESS") || jobState.name().equals("FAILED") || 95 | jobState.name().equals("CANCELLING") || jobState.name().equals("CANCELLED")) { 96 | logger.log("Job Completed!!"); 97 | break; 98 | } 99 | } 100 | } 101 | } 102 | 103 | private StopApplicationResponse stopApplication(String applicationId) { 104 | EmrServerlessClient client = getClient(); 105 | StopApplicationRequest stopApp = 
StopApplicationRequest.builder().applicationId(applicationId).build(); 106 | return client.stopApplication(stopApp); 107 | } 108 | 109 | private EmrServerlessClient getClient() { 110 | return EmrServerlessClient.builder() 111 | .credentialsProvider(DefaultCredentialsProvider.create()) 112 | .httpClient(UrlConnectionHttpClient.builder().build()) 113 | .build(); 114 | } 115 | } -------------------------------------------------------------------------------- /source/clicklogger/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.clicklogs 5 | clicklogger 6 | jar 7 | 1.0-SNAPSHOT 8 | clicklogger 9 | 10 | UTF-8 11 | 1.8 12 | 1.8 13 | 14 | 15 | 16 | 17 | software.amazon.awssdk 18 | bom 19 | 2.17.202 20 | pom 21 | import 22 | 23 | 24 | 25 | 26 | 27 | com.amazonaws 28 | aws-lambda-java-core 29 | 1.2.1 30 | 31 | 32 | com.amazonaws 33 | aws-lambda-java-events 34 | 2.2.8 35 | 36 | 37 | com.amazonaws 38 | aws-java-sdk-kinesis 39 | 1.11.774 40 | 41 | 42 | software.amazon.awssdk 43 | regions 44 | 45 | 46 | software.amazon.awssdk 47 | aws-core 48 | 49 | 50 | software.amazon.awssdk 51 | aws-json-protocol 52 | 53 | 54 | software.amazon.awssdk 55 | url-connection-client 56 | 57 | 58 | software.amazon.awssdk 59 | emrserverless 60 | 61 | 62 | org.apache.commons 63 | commons-lang3 64 | 3.10 65 | 66 | 67 | com.google.code.gson 68 | gson 69 | 2.8.9 70 | 71 | 72 | org.apache.logging.log4j 73 | log4j-api 74 | 2.17.1 75 | test 76 | 77 | 78 | org.apache.logging.log4j 79 | log4j-core 80 | 2.17.1 81 | test 82 | 83 | 84 | org.apache.logging.log4j 85 | log4j-slf4j18-impl 86 | 2.13.0 87 | test 88 | 89 | 90 | org.junit.jupiter 91 | junit-jupiter-api 92 | 5.6.0 93 | test 94 | 95 | 96 | org.junit.jupiter 97 | junit-jupiter-engine 98 | 5.6.0 99 | test 100 | 101 | 107 | 108 | org.mockito 109 | mockito-core 110 | 2.22.0 111 | test 112 | 113 | 114 | junit 115 | junit 116 | 4.13.1 117 | 118 | 119 | 120 | 121 | 122 | 123 | maven-surefire-plugin 124 | 2.22.2 125 | 126 | 127 | org.apache.maven.plugins 128 | maven-shade-plugin 129 | 3.2.2 130 | 131 | false 132 | 133 | 134 | 135 | package 136 | 137 | shade 138 | 139 | 140 | 141 | 142 | 143 | org.apache.maven.plugins 144 | maven-compiler-plugin 145 | 3.8.1 146 | 147 | 1.8 148 | 1.8 149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /source/loggregator/src/main/scala/com/examples/clicklogger/Loggregator.scala: -------------------------------------------------------------------------------- 1 | package com.examples.clicklogger 2 | 3 | import org.apache.spark.{SparkConf, SparkContext, sql} 4 | import org.apache.spark.sql.{DataFrameReader, SQLContext, SparkSession} 5 | import java.lang.Boolean 6 | import java.util 7 | import java.io.IOException 8 | import java.text.Format 9 | import java.text.SimpleDateFormat 10 | import java.util.Date 11 | 12 | import com.amazonaws.AmazonServiceException 13 | import com.amazonaws.SdkClientException 14 | import com.amazonaws.services.s3.AmazonS3 15 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 16 | import org.apache.commons.lang3.StringUtils 17 | 18 | 19 | /* 20 | * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
21 | * 22 | * SPDX-License-Identifier: MIT-0 23 | */ 24 | 25 | object Loggregator { 26 | 27 | private var sourceBucketName = "" 28 | private var outputBucketName = "" 29 | private var region = "" 30 | 31 | private var configMap = new util.HashMap[String, String] 32 | 33 | def main(args: Array[String]): Unit = { 34 | 35 | if (args.length != 3) { 36 | System.out.println("Invalid no of Arguments!!!") 37 | System.exit(-1) 38 | } 39 | 40 | val date = args(0) 41 | sourceBucketName = args(1) 42 | outputBucketName = args(2) 43 | 44 | System.out.println("Input Date " + date) 45 | region = scala.util.Properties.envOrElse("REGION", "us-east-1") 46 | 47 | System.out.format("values receive bucket_name %s, output_bucket_name %s \n", sourceBucketName, outputBucketName) 48 | 49 | val dataFrame = processFiles(date, sourceBucketName) 50 | System.out.println("") 51 | writeOutputToS3(date, dataFrame, outputBucketName) 52 | 53 | System.out.println("Completed successfully!") 54 | } 55 | 56 | private def writeOutputToS3(date: String, dataFrame: sql.DataFrame, outputBucketName: String): String = { 57 | var outputResponse: String = "|*createdTime*|*callerid*|*component*|*count*\n" 58 | 59 | if (dataFrame != null) { 60 | outputResponse = outputResponse + "|------------|-----------------------|-----------|-------\n" 61 | for (row <- dataFrame.rdd.collect) { 62 | val createdTime = row.mkString(",").split(",")(0) 63 | val callerid = row.mkString(",").split(",")(1) 64 | val component = row.mkString(",").split(",")(2) 65 | val count = row.mkString(",").split(",")(3) 66 | outputResponse = outputResponse + "*" + createdTime + "*|" + callerid + "|" + component + "|" + count + "\n" 67 | } 68 | } 69 | System.out.println("printing output schema from data frame") 70 | System.out.println(outputResponse) 71 | 72 | var f = new SimpleDateFormat("yyyy") 73 | var year = f.format(new Date()) 74 | f = new SimpleDateFormat("MM") 75 | var month = f.format(new Date()) 76 | f = new SimpleDateFormat("dd") 77 | var onlydate = f.format(new Date()) 78 | 79 | // 2020-07-18 80 | if (date.length == 10) { 81 | val dateArr = date.split("-") 82 | year = dateArr(0) 83 | month = dateArr(1) 84 | onlydate = dateArr(2) 85 | } 86 | 87 | var fileObjKeyName = year + "/" + month + "/" + onlydate + "/" 88 | if (date.equalsIgnoreCase("ALL")) { 89 | fileObjKeyName = "ALL/" + year + "/" + month + "/" + onlydate + "/" 90 | } 91 | val fileName = "response.md" 92 | 93 | System.out.println("fileObjKeyName " + fileObjKeyName + " fileName " + fileName) 94 | try { 95 | val s3Client = AmazonS3ClientBuilder.standard.build 96 | s3Client.putObject(outputBucketName, fileObjKeyName + fileName, outputResponse) 97 | } catch { 98 | case e: AmazonServiceException => 99 | System.out.println(e.getMessage) 100 | case e: SdkClientException => 101 | System.out.println(e.getMessage) 102 | } 103 | 104 | return outputResponse 105 | } 106 | 107 | 108 | def processFiles(date: String, bucket: String): sql.DataFrame = { 109 | System.out.println("processing a date - " + date + " from bucket - " + bucket) 110 | var s3Path = String.format("s3a://%s/clicklog/data=%s/", bucket, date) 111 | var spark = getSparkSession() 112 | val s3FolderDF = spark.read.parquet(s3Path) 113 | s3FolderDF.createOrReplaceTempView("ClickLoggerTable") 114 | return getClickLoggerDataFrame(spark) 115 | } 116 | 117 | def getClickLoggerDataFrame(spark: SparkSession): sql.DataFrame = { 118 | val sql = "select substring(createdTime,0, 10) as createdTime, callerid, component from ClickLoggerTable" 119 | 120 | val 
clickLoggerDF = spark.sql(sql) 121 | clickLoggerDF.groupBy("createdTime", "callerid", "component").count() 122 | .orderBy("createdTime", "callerid", "component") 123 | .show() 124 | 125 | // DO not printSchema in production 126 | clickLoggerDF.printSchema() 127 | 128 | System.out.println("DataFrame for date completed successfully ---------") 129 | return clickLoggerDF.groupBy("createdTime", "callerid", "component").count() 130 | .orderBy("createdTime", "callerid", "component") 131 | } 132 | 133 | def getSparkSession(): SparkSession = { 134 | System.out.println("starting spark session -------------") 135 | 136 | val sparkConfig = new SparkConf() 137 | //.setMaster("local[*]") 138 | //.setAppName("ClickLogger") 139 | val spark: SparkSession = SparkSession.builder() 140 | .config(conf = sparkConfig) 141 | .getOrCreate() 142 | val sparkContext: SparkContext = spark.sparkContext 143 | val hadoopConf = sparkContext.hadoopConfiguration 144 | hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 145 | hadoopConf.set("fs.s3a.path.style.access", "true") 146 | hadoopConf.set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com") 147 | return spark; 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /source/clicklogger/src/main/java/com/clicklogs/Handlers/ClickLoggerHandler.java: -------------------------------------------------------------------------------- 1 | package com.clicklogs.Handlers; 2 | 3 | import com.amazonaws.services.kinesisfirehose.model.Record; 4 | import com.amazonaws.services.kinesisfirehose.AmazonKinesisFirehose; 5 | import com.amazonaws.services.kinesisfirehose.AmazonKinesisFirehoseClientBuilder; 6 | import com.amazonaws.services.kinesisfirehose.model.PutRecordRequest; 7 | import com.amazonaws.services.kinesisfirehose.model.PutRecordResult; 8 | 9 | import com.amazonaws.services.lambda.runtime.Context; 10 | import com.amazonaws.services.lambda.runtime.RequestHandler; 11 | import com.amazonaws.services.lambda.runtime.LambdaLogger; 12 | 13 | import com.google.gson.Gson; 14 | import com.google.gson.GsonBuilder; 15 | import org.apache.commons.lang3.StringUtils; 16 | 17 | import java.nio.ByteBuffer; 18 | import java.text.Format; 19 | import java.text.SimpleDateFormat; 20 | import java.util.Date; 21 | 22 | import com.clicklogs.model.ClickLogRequest; 23 | import com.clicklogs.model.ClickLogResponse; 24 | import com.clicklogs.model.ResponseBuilder; 25 | 26 | /* 27 | * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
28 | * 29 | * SPDX-License-Identifier: MIT-0 30 | */ 31 | 32 | public class ClickLoggerHandler implements RequestHandler { 33 | 34 | private String stream_name = "click-logger-firehose-delivery-stream"; 35 | private String region = "us-east-1"; 36 | 37 | Gson gson = new GsonBuilder().setPrettyPrinting().create(); 38 | 39 | @Override 40 | public ClickLogResponse handleRequest(final ClickLogRequest clickLogRequest, final Context context) { 41 | final LambdaLogger logger = context.getLogger(); 42 | final String success_response = new String("200 OK"); 43 | final String fail_response = new String("400 ERROR"); 44 | 45 | ResponseBuilder responseBuilder = new ResponseBuilder(); 46 | ClickLogResponse response = responseBuilder.badRequest(fail_response).build(); 47 | 48 | 49 | String env_stream_name = System.getenv("STREAM_NAME"); 50 | if (!StringUtils.isBlank(env_stream_name)) { 51 | stream_name = env_stream_name; 52 | } 53 | 54 | String env_region = System.getenv("REGION"); 55 | logger.log("Environment region name - " + env_region); 56 | if (!StringUtils.isBlank(env_region)) { 57 | region = env_region; 58 | } 59 | if (clickLogRequest != null) { 60 | String req = clickLogRequest.getRequestid() + " - " + clickLogRequest.getCallerid() + " - " 61 | + clickLogRequest.getComponent() + " - " + clickLogRequest.getType() + " - " + clickLogRequest.getAction() 62 | + " - " + clickLogRequest.getUser() + " - " + clickLogRequest.getClientip() + " - " 63 | + clickLogRequest.getCreatedtime(); 64 | logger.log("Incoming request variables - " + req); 65 | 66 | if (validateRequest(clickLogRequest, logger, response)) return response; 67 | } 68 | 69 | System.out.println("Calling updateclicklogs method for the received clicklogrequest"); 70 | 71 | updateClickLogRequestToStream(clickLogRequest); 72 | logger.log(success_response); 73 | responseBuilder = new ResponseBuilder(); 74 | responseBuilder.ok(); 75 | response = responseBuilder.originHeader("*").build(); 76 | return response; 77 | } 78 | 79 | private boolean validateRequest(ClickLogRequest clickLogRequest, LambdaLogger logger, ClickLogResponse response) { 80 | logger.log("Validating inputs"); 81 | if (StringUtils.isBlank(clickLogRequest.getRequestid())) { 82 | logger.log("error occurred - requestid missing"); 83 | return true; 84 | } 85 | if (StringUtils.isBlank(clickLogRequest.getContextid())) { 86 | logger.log("error occurred - contextid missing"); 87 | return true; 88 | } 89 | if (StringUtils.isBlank(clickLogRequest.getCallerid())) { 90 | logger.log("error occurred - caller missing"); 91 | return true; 92 | } 93 | if (StringUtils.isBlank(clickLogRequest.getType())) { 94 | logger.log("error occurred - type missing"); 95 | return true; 96 | } 97 | if (StringUtils.isBlank(clickLogRequest.getAction())) { 98 | logger.log("error occurred - action missing"); 99 | return true; 100 | } 101 | if (StringUtils.isBlank(clickLogRequest.getComponent())) { 102 | logger.log("error occurred - component missing"); 103 | return true; 104 | } 105 | 106 | String user = "GUEST"; 107 | if (StringUtils.isBlank(clickLogRequest.getUser())) { 108 | logger.log("setting default user"); 109 | clickLogRequest.setUser(user); 110 | } 111 | 112 | String clientip = "APIGWY"; 113 | if (StringUtils.isBlank(clickLogRequest.getClientip())) { 114 | logger.log("setting default clientip"); 115 | clickLogRequest.setClientip(clientip); 116 | } 117 | 118 | String datetime = ""; 119 | if (StringUtils.isBlank(clickLogRequest.getCreatedtime())) { 120 | logger.log("setting default createdtime"); 121 | Format f 
= new SimpleDateFormat("MM-dd-yyyy hh:mm:ss"); 122 | datetime = f.format(new Date()); 123 | clickLogRequest.setCreatedtime(datetime); 124 | } 125 | logger.log("Validated inputs"); 126 | return false; 127 | } 128 | 129 | private Boolean updateClickLogRequestToStream(ClickLogRequest clickLogRequest) { 130 | System.out.println("Inside updateClickLogRequestToStream method for the input"); 131 | try { 132 | 133 | AmazonKinesisFirehose amazonKinesisFirehoseClient = AmazonKinesisFirehoseClientBuilder.standard().withRegion(region).build(); 134 | 135 | PutRecordRequest putRecordRequest = new PutRecordRequest(); 136 | putRecordRequest.setDeliveryStreamName(stream_name); 137 | Gson gson = new Gson(); 138 | String messageJson = gson.toJson(clickLogRequest); 139 | System.out.println("gson - " + messageJson); 140 | Record record = new Record().withData(ByteBuffer.wrap(messageJson.toString().getBytes())); 141 | putRecordRequest.setRecord(record); 142 | PutRecordResult putRecordResult = amazonKinesisFirehoseClient.putRecord(putRecordRequest); 143 | System.out.println("updated the stream for recordid - " + putRecordResult.getRecordId()); 144 | return true; 145 | } catch (Exception e) { 146 | System.out.println("Error occurred - " + e.getMessage()); 147 | } 148 | return false; 149 | } 150 | 151 | } -------------------------------------------------------------------------------- /source/clicklogger/mvnw.cmd: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Licensed to the Apache Software Foundation (ASF) under one 3 | @REM or more contributor license agreements. See the NOTICE file 4 | @REM distributed with this work for additional information 5 | @REM regarding copyright ownership. The ASF licenses this file 6 | @REM to you under the Apache License, Version 2.0 (the 7 | @REM "License"); you may not use this file except in compliance 8 | @REM with the License. You may obtain a copy of the License at 9 | @REM 10 | @REM https://www.apache.org/licenses/LICENSE-2.0 11 | @REM 12 | @REM Unless required by applicable law or agreed to in writing, 13 | @REM software distributed under the License is distributed on an 14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | @REM KIND, either express or implied. See the License for the 16 | @REM specific language governing permissions and limitations 17 | @REM under the License. 18 | @REM ---------------------------------------------------------------------------- 19 | 20 | @REM ---------------------------------------------------------------------------- 21 | @REM Maven Start Up Batch script 22 | @REM 23 | @REM Required ENV vars: 24 | @REM JAVA_HOME - location of a JDK home dir 25 | @REM 26 | @REM Optional ENV vars 27 | @REM M2_HOME - location of maven2's installed home dir 28 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands 29 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending 30 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven 31 | @REM e.g. 
to debug Maven itself, use 32 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 33 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files 34 | @REM ---------------------------------------------------------------------------- 35 | 36 | @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' 37 | @echo off 38 | @REM set title of command window 39 | title %0 40 | @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on' 41 | @if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% 42 | 43 | @REM set %HOME% to equivalent of $HOME 44 | if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") 45 | 46 | @REM Execute a user defined script before this one 47 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre 48 | @REM check for pre script, once with legacy .bat ending and once with .cmd ending 49 | if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat" 50 | if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd" 51 | :skipRcPre 52 | 53 | @setlocal 54 | 55 | set ERROR_CODE=0 56 | 57 | @REM To isolate internal variables from possible post scripts, we use another setlocal 58 | @setlocal 59 | 60 | @REM ==== START VALIDATION ==== 61 | if not "%JAVA_HOME%" == "" goto OkJHome 62 | 63 | echo. 64 | echo Error: JAVA_HOME not found in your environment. >&2 65 | echo Please set the JAVA_HOME variable in your environment to match the >&2 66 | echo location of your Java installation. >&2 67 | echo. 68 | goto error 69 | 70 | :OkJHome 71 | if exist "%JAVA_HOME%\bin\java.exe" goto init 72 | 73 | echo. 74 | echo Error: JAVA_HOME is set to an invalid directory. >&2 75 | echo JAVA_HOME = "%JAVA_HOME%" >&2 76 | echo Please set the JAVA_HOME variable in your environment to match the >&2 77 | echo location of your Java installation. >&2 78 | echo. 79 | goto error 80 | 81 | @REM ==== END VALIDATION ==== 82 | 83 | :init 84 | 85 | @REM Find the project base dir, i.e. the directory that contains the folder ".mvn". 86 | @REM Fallback to current working directory if not found. 87 | 88 | set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% 89 | IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir 90 | 91 | set EXEC_DIR=%CD% 92 | set WDIR=%EXEC_DIR% 93 | :findBaseDir 94 | IF EXIST "%WDIR%"\.mvn goto baseDirFound 95 | cd .. 96 | IF "%WDIR%"=="%CD%" goto baseDirNotFound 97 | set WDIR=%CD% 98 | goto findBaseDir 99 | 100 | :baseDirFound 101 | set MAVEN_PROJECTBASEDIR=%WDIR% 102 | cd "%EXEC_DIR%" 103 | goto endDetectBaseDir 104 | 105 | :baseDirNotFound 106 | set MAVEN_PROJECTBASEDIR=%EXEC_DIR% 107 | cd "%EXEC_DIR%" 108 | 109 | :endDetectBaseDir 110 | 111 | IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig 112 | 113 | @setlocal EnableExtensions EnableDelayedExpansion 114 | for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! 
%%a 115 | @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% 116 | 117 | :endReadAdditionalConfig 118 | 119 | SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" 120 | set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" 121 | set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain 122 | 123 | set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 124 | 125 | FOR /F "tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO ( 126 | IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B 127 | ) 128 | 129 | @REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central 130 | @REM This allows using the maven wrapper in projects that prohibit checking in binary data. 131 | if exist %WRAPPER_JAR% ( 132 | if "%MVNW_VERBOSE%" == "true" ( 133 | echo Found %WRAPPER_JAR% 134 | ) 135 | ) else ( 136 | if not "%MVNW_REPOURL%" == "" ( 137 | SET DOWNLOAD_URL="%MVNW_REPOURL%/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 138 | ) 139 | if "%MVNW_VERBOSE%" == "true" ( 140 | echo Couldn't find %WRAPPER_JAR%, downloading it ... 141 | echo Downloading from: %DOWNLOAD_URL% 142 | ) 143 | 144 | powershell -Command "&{"^ 145 | "$webclient = new-object System.Net.WebClient;"^ 146 | "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^ 147 | "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^ 148 | "}"^ 149 | "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^ 150 | "}" 151 | if "%MVNW_VERBOSE%" == "true" ( 152 | echo Finished downloading %WRAPPER_JAR% 153 | ) 154 | ) 155 | @REM End of extension 156 | 157 | @REM Provide a "standardized" way to retrieve the CLI args that will 158 | @REM work with both Windows and non-Windows executions. 159 | set MAVEN_CMD_LINE_ARGS=%* 160 | 161 | %MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* 162 | if ERRORLEVEL 1 goto error 163 | goto end 164 | 165 | :error 166 | set ERROR_CODE=1 167 | 168 | :end 169 | @endlocal & set ERROR_CODE=%ERROR_CODE% 170 | 171 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost 172 | @REM check for post script, once with legacy .bat ending and once with .cmd ending 173 | if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat" 174 | if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd" 175 | :skipRcPost 176 | 177 | @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' 178 | if "%MAVEN_BATCH_PAUSE%" == "on" pause 179 | 180 | if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE% 181 | 182 | exit /B %ERROR_CODE% 183 | -------------------------------------------------------------------------------- /source/clicklogger/mvnw: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. 
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # https://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # ---------------------------------------------------------------------------- 20 | 21 | # ---------------------------------------------------------------------------- 22 | # Maven Start Up Batch script 23 | # 24 | # Required ENV vars: 25 | # ------------------ 26 | # JAVA_HOME - location of a JDK home dir 27 | # 28 | # Optional ENV vars 29 | # ----------------- 30 | # M2_HOME - location of maven2's installed home dir 31 | # MAVEN_OPTS - parameters passed to the Java VM when running Maven 32 | # e.g. to debug Maven itself, use 33 | # set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 34 | # MAVEN_SKIP_RC - flag to disable loading of mavenrc files 35 | # ---------------------------------------------------------------------------- 36 | 37 | if [ -z "$MAVEN_SKIP_RC" ] ; then 38 | 39 | if [ -f /etc/mavenrc ] ; then 40 | . /etc/mavenrc 41 | fi 42 | 43 | if [ -f "$HOME/.mavenrc" ] ; then 44 | . "$HOME/.mavenrc" 45 | fi 46 | 47 | fi 48 | 49 | # OS specific support. $var _must_ be set to either true or false. 50 | cygwin=false; 51 | darwin=false; 52 | mingw=false 53 | case "`uname`" in 54 | CYGWIN*) cygwin=true ;; 55 | MINGW*) mingw=true;; 56 | Darwin*) darwin=true 57 | # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home 58 | # See https://developer.apple.com/library/mac/qa/qa1170/_index.html 59 | if [ -z "$JAVA_HOME" ]; then 60 | if [ -x "/usr/libexec/java_home" ]; then 61 | export JAVA_HOME="`/usr/libexec/java_home`" 62 | else 63 | export JAVA_HOME="/Library/Java/Home" 64 | fi 65 | fi 66 | ;; 67 | esac 68 | 69 | if [ -z "$JAVA_HOME" ] ; then 70 | if [ -r /etc/gentoo-release ] ; then 71 | JAVA_HOME=`java-config --jre-home` 72 | fi 73 | fi 74 | 75 | if [ -z "$M2_HOME" ] ; then 76 | ## resolve links - $0 may be a link to maven's home 77 | PRG="$0" 78 | 79 | # need this for relative symlinks 80 | while [ -h "$PRG" ] ; do 81 | ls=`ls -ld "$PRG"` 82 | link=`expr "$ls" : '.*-> \(.*\)$'` 83 | if expr "$link" : '/.*' > /dev/null; then 84 | PRG="$link" 85 | else 86 | PRG="`dirname "$PRG"`/$link" 87 | fi 88 | done 89 | 90 | saveddir=`pwd` 91 | 92 | M2_HOME=`dirname "$PRG"`/.. 
93 | 94 | # make it fully qualified 95 | M2_HOME=`cd "$M2_HOME" && pwd` 96 | 97 | cd "$saveddir" 98 | # echo Using m2 at $M2_HOME 99 | fi 100 | 101 | # For Cygwin, ensure paths are in UNIX format before anything is touched 102 | if $cygwin ; then 103 | [ -n "$M2_HOME" ] && 104 | M2_HOME=`cygpath --unix "$M2_HOME"` 105 | [ -n "$JAVA_HOME" ] && 106 | JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 107 | [ -n "$CLASSPATH" ] && 108 | CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 109 | fi 110 | 111 | # For Mingw, ensure paths are in UNIX format before anything is touched 112 | if $mingw ; then 113 | [ -n "$M2_HOME" ] && 114 | M2_HOME="`(cd "$M2_HOME"; pwd)`" 115 | [ -n "$JAVA_HOME" ] && 116 | JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" 117 | fi 118 | 119 | if [ -z "$JAVA_HOME" ]; then 120 | javaExecutable="`which javac`" 121 | if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then 122 | # readlink(1) is not available as standard on Solaris 10. 123 | readLink=`which readlink` 124 | if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then 125 | if $darwin ; then 126 | javaHome="`dirname \"$javaExecutable\"`" 127 | javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" 128 | else 129 | javaExecutable="`readlink -f \"$javaExecutable\"`" 130 | fi 131 | javaHome="`dirname \"$javaExecutable\"`" 132 | javaHome=`expr "$javaHome" : '\(.*\)/bin'` 133 | JAVA_HOME="$javaHome" 134 | export JAVA_HOME 135 | fi 136 | fi 137 | fi 138 | 139 | if [ -z "$JAVACMD" ] ; then 140 | if [ -n "$JAVA_HOME" ] ; then 141 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 142 | # IBM's JDK on AIX uses strange locations for the executables 143 | JAVACMD="$JAVA_HOME/jre/sh/java" 144 | else 145 | JAVACMD="$JAVA_HOME/bin/java" 146 | fi 147 | else 148 | JAVACMD="`which java`" 149 | fi 150 | fi 151 | 152 | if [ ! -x "$JAVACMD" ] ; then 153 | echo "Error: JAVA_HOME is not defined correctly." >&2 154 | echo " We cannot execute $JAVACMD" >&2 155 | exit 1 156 | fi 157 | 158 | if [ -z "$JAVA_HOME" ] ; then 159 | echo "Warning: JAVA_HOME environment variable is not set." 160 | fi 161 | 162 | CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher 163 | 164 | # traverses directory structure from process work directory to filesystem root 165 | # first directory with .mvn subdirectory is considered project base directory 166 | find_maven_basedir() { 167 | 168 | if [ -z "$1" ] 169 | then 170 | echo "Path not specified to find_maven_basedir" 171 | return 1 172 | fi 173 | 174 | basedir="$1" 175 | wdir="$1" 176 | while [ "$wdir" != '/' ] ; do 177 | if [ -d "$wdir"/.mvn ] ; then 178 | basedir=$wdir 179 | break 180 | fi 181 | # workaround for JBEAP-8937 (on Solaris 10/Sparc) 182 | if [ -d "${wdir}" ]; then 183 | wdir=`cd "$wdir/.."; pwd` 184 | fi 185 | # end of workaround 186 | done 187 | echo "${basedir}" 188 | } 189 | 190 | # concatenates all lines of a file 191 | concat_lines() { 192 | if [ -f "$1" ]; then 193 | echo "$(tr -s '\n' ' ' < "$1")" 194 | fi 195 | } 196 | 197 | BASE_DIR=`find_maven_basedir "$(pwd)"` 198 | if [ -z "$BASE_DIR" ]; then 199 | exit 1; 200 | fi 201 | 202 | ########################################################################################## 203 | # Extension to allow automatically downloading the maven-wrapper.jar from Maven-central 204 | # This allows using the maven wrapper in projects that prohibit checking in binary data. 
205 | ########################################################################################## 206 | if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then 207 | if [ "$MVNW_VERBOSE" = true ]; then 208 | echo "Found .mvn/wrapper/maven-wrapper.jar" 209 | fi 210 | else 211 | if [ "$MVNW_VERBOSE" = true ]; then 212 | echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." 213 | fi 214 | if [ -n "$MVNW_REPOURL" ]; then 215 | jarUrl="$MVNW_REPOURL/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 216 | else 217 | jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 218 | fi 219 | while IFS="=" read key value; do 220 | case "$key" in (wrapperUrl) jarUrl="$value"; break ;; 221 | esac 222 | done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" 223 | if [ "$MVNW_VERBOSE" = true ]; then 224 | echo "Downloading from: $jarUrl" 225 | fi 226 | wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" 227 | if $cygwin; then 228 | wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` 229 | fi 230 | 231 | if command -v wget > /dev/null; then 232 | if [ "$MVNW_VERBOSE" = true ]; then 233 | echo "Found wget ... using wget" 234 | fi 235 | if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then 236 | wget "$jarUrl" -O "$wrapperJarPath" 237 | else 238 | wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" 239 | fi 240 | elif command -v curl > /dev/null; then 241 | if [ "$MVNW_VERBOSE" = true ]; then 242 | echo "Found curl ... using curl" 243 | fi 244 | if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then 245 | curl -o "$wrapperJarPath" "$jarUrl" -f 246 | else 247 | curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f 248 | fi 249 | 250 | else 251 | if [ "$MVNW_VERBOSE" = true ]; then 252 | echo "Falling back to using Java to download" 253 | fi 254 | javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" 255 | # For Cygwin, switch paths to Windows format before running javac 256 | if $cygwin; then 257 | javaClass=`cygpath --path --windows "$javaClass"` 258 | fi 259 | if [ -e "$javaClass" ]; then 260 | if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then 261 | if [ "$MVNW_VERBOSE" = true ]; then 262 | echo " - Compiling MavenWrapperDownloader.java ..." 263 | fi 264 | # Compiling the Java class 265 | ("$JAVA_HOME/bin/javac" "$javaClass") 266 | fi 267 | if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then 268 | # Running the downloader 269 | if [ "$MVNW_VERBOSE" = true ]; then 270 | echo " - Running MavenWrapperDownloader.java ..." 
271 | fi 272 | ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") 273 | fi 274 | fi 275 | fi 276 | fi 277 | ########################################################################################## 278 | # End of extension 279 | ########################################################################################## 280 | 281 | export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} 282 | if [ "$MVNW_VERBOSE" = true ]; then 283 | echo $MAVEN_PROJECTBASEDIR 284 | fi 285 | MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" 286 | 287 | # For Cygwin, switch paths to Windows format before running java 288 | if $cygwin; then 289 | [ -n "$M2_HOME" ] && 290 | M2_HOME=`cygpath --path --windows "$M2_HOME"` 291 | [ -n "$JAVA_HOME" ] && 292 | JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 293 | [ -n "$CLASSPATH" ] && 294 | CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 295 | [ -n "$MAVEN_PROJECTBASEDIR" ] && 296 | MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` 297 | fi 298 | 299 | # Provide a "standardized" way to retrieve the CLI args that will 300 | # work with both Windows and non-Windows executions. 301 | MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" 302 | export MAVEN_CMD_LINE_ARGS 303 | 304 | WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain 305 | 306 | exec "$JAVACMD" \ 307 | $MAVEN_OPTS \ 308 | -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ 309 | "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ 310 | ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" 311 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Running a Data Processing Job on EMR Serverless with AWS Step Functions and AWS Lambda using Terraform (By HashiCorp) 2 | 3 | *Update Feb 2023* – AWS Step Functions adds direct integration for 35 services including Amazon EMR Serverless. In the current version of this blog, we are able to submit an EMR Serverless job by invoking the APIs directly from a Step Functions workflow. We are using the Lambda only for polling the status of the job in EMR. Read more about this feature enhancement [here](https://aws.amazon.com/about-aws/whats-new/2023/02/aws-step-functions-integration-35-services-emr-serverless/). 4 | 5 | In this blog we showcase how to build and orchestrate a [Scala](https://www.scala-lang.org/) Spark Application using [Amazon EMR Serverless](https://aws.amazon.com/emr/serverless/) , AWS Step Functions and [Terraform By HashiCorp](https://www.terraform.io/). In this end to end solution we execute a Spark job on EMR Serverless which processes sample click-stream data in Amazon S3 bucket and stores the aggregation results in Amazon S3. 6 | 7 | With EMR Serverless, customers don’t have to configure, optimize, secure, or operate clusters to run applications. You will continue to get the benefits of [Amazon EMR](https://aws.amazon.com/emr/), such as open source compatibility, concurrency, and optimized runtime performance for popular data frameworks. EMR Serverless is suitable for customers who want ease in operating applications using open-source frameworks. It offers quick job startup, automatic capacity management, and straightforward cost controls. 8 | 9 | There are several ‘infrastructure as code’ frameworks available today, to help customers define their infrastructure, such as the AWS CDK or Terraform. 
Terraform, an AWS Partner Network (APN) Advanced Technology Partner and member of the AWS DevOps Competency, is an infrastructure as code tool similar to AWS CloudFormation that allows you to create, update, and version your AWS infrastructure. Terraform provides a friendly syntax (similar to [AWS CloudFormation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/Welcome.html)) along with other features such as planning (visibility into changes before they actually happen), graphing, and the ability to split infrastructure configurations into smaller, reusable templates that are easier to maintain. We will leverage the capabilities and features of Terraform to build an API-based ingestion process into AWS. Let's get started!

We provide the Terraform infrastructure definition and the source code for an AWS Lambda function that ingests sample customer click events from an online website into an [Amazon Kinesis Data Firehose](https://aws.amazon.com/kinesis/data-firehose/) delivery stream. The solution leverages Firehose's capability to convert the incoming data into a Parquet file (an open-source file format for Hadoop) before pushing it to [Amazon S3](https://aws.amazon.com/s3/), using an [AWS Glue](https://aws.amazon.com/glue/) catalog for the schema. The generated Parquet log files in S3 are then processed by an EMR Serverless job, which writes a report of aggregate click-stream statistics to an S3 bucket. The EMR Serverless job is triggered using [AWS Step Functions](https://aws.amazon.com/step-functions). The sample architecture and code are spun up as shown below.

The provided samples contain the source code and the Terraform definitions for building the infrastructure and running the Amazon EMR Serverless application. Setup scripts are provided to create the sample ingestion using AWS Lambda for incoming application logs. A similar ingestion pattern was built with Terraform in an earlier [blog](https://aws.amazon.com/blogs/developer/provision-aws-infrastructure-using-terraform-by-hashicorp-an-example-of-web-application-logging-customer-data/).

Overview of the steps and the AWS services used in this solution:

* Java source build – the provided application code is packaged and built using Apache Maven
* Terraform commands are used to deploy the infrastructure in AWS.
* [Amazon EMR Serverless](https://aws.amazon.com/emr/serverless/) Application – provides the option to submit a Spark job.
* [AWS Lambda](https://aws.amazon.com/lambda/):
  * Ingestion Lambda – processes the incoming request and pushes the data into the Firehose delivery stream.
  * EMR Job Status Check Lambda – polls the status of the job that was submitted to EMR Serverless.
* [AWS Step Functions](https://aws.amazon.com/step-functions) – submits the data processing job to an EMR Serverless application and triggers a Lambda that polls the status of the submitted job.
* [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3)
  * Firehose Delivery Bucket – stores the ingested application logs in Parquet file format
  * Loggregator Source Bucket – stores the Scala code/jar for the EMR job execution
  * Loggregator Output Bucket – the EMR-processed output is stored in this bucket
  * EMR Serverless Logs Bucket – stores the EMR process application logs
* Sample AWS CLI invoke commands (run as part of the initial setup) insert data using the ingestion Lambda; the Firehose stream converts the incoming records into a Parquet file that is stored in an S3 bucket

![Alt text](assets/emr-serverless-click-logs-from-web-application.drawio.png?raw=true "Title")

### Prerequisites

* [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) – version 2.7.18 was used at the time of writing this article. It is required to run aws emr-serverless CLI commands from your local machine. Optionally, all the AWS services used in this blog can also be viewed and operated from the AWS Console.
* Make sure [Java](https://www.java.com/en/download/) is installed and JDK/JRE 8 is set in the environment path of your machine. For instructions, see [Java Development Kit](https://www.java.com/en/download/)
* [Apache Maven](https://maven.apache.org/download.cgi) – the Java Lambdas are built using mvn package and are deployed into AWS using Terraform
* [Scala Build Tool](https://www.scala-sbt.org/download.html) (sbt) – version 1.4.7 was used at the time of this article. Download and install the build appropriate for your operating system.
* Set up [Terraform](https://www.terraform.io/downloads). For steps, see Terraform downloads. Version 1.2.5 was used at the time of this article.
* An [AWS Account](https://aws.amazon.com/free/)

### Design Decisions

* We use AWS Step Functions and its support for SDK integrations with EMR Serverless to submit the data processing job to the EMR Serverless application (a CLI sketch of an equivalent manual job submission follows this list).
* The AWS Lambda code and the EMR Serverless log-aggregation code are developed in Java and Scala respectively.
* AWS CLI v2 is required for querying Amazon EMR Serverless applications from the command line. They can also be viewed from the AWS Console. A sample CLI command is provided in the "Testing" section below.
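The first design decision above is worth seeing concretely: the state machine submits the Spark job by calling the EMR Serverless StartJobRun API. The sketch below shows an equivalent manual submission from the AWS CLI. It is an illustration only – the application ID, execution role ARN, job name, bucket names, jar path, and date are placeholders to be replaced with values from your own deployment – but the three job arguments mirror what the Scala Loggregator job expects: a date, the source bucket, and the output bucket.

```
# Sketch only -- every <...> value is a placeholder from your own deployment.
APPLICATION_ID=<emr-serverless-application-id>
JOB_ROLE_ARN=<emr-serverless-job-execution-role-arn>

aws emr-serverless start-job-run \
  --application-id "$APPLICATION_ID" \
  --execution-role-arn "$JOB_ROLE_ARN" \
  --name clicklogger-manual-run \
  --job-driver '{
    "sparkSubmit": {
      "entryPoint": "s3://<loggregator-source-bucket>/<loggregator-jar>",
      "entryPointArguments": ["2025-07-13", "<firehose-delivery-bucket>", "<loggregator-output-bucket>"],
      "sparkSubmitParameters": "--class com.examples.clicklogger.Loggregator"
    }
  }'
```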
### Steps

Clone [this repository](https://github.com/aws-samples/aws-emr-serverless-using-terraform) and execute the commands below to spin up the infrastructure and the application.
The provided "exec.sh" shell script builds the Java application jar (for the ingestion Lambda) and the Scala application jar (for the EMR processing), and deploys the AWS infrastructure needed for this use case.

Execute the below commands:

```
$ chmod +x exec.sh
$ ./exec.sh
```

To run the commands individually:

Set the application deployment region and account number. An example is below; modify as needed.

```
$ APP_DIR=$PWD
$ APP_PREFIX=clicklogger
$ STAGE_NAME=dev
$ REGION=us-east-1
$ ACCOUNT_ID=$(aws sts get-caller-identity | jq -r '.Account')
```

Build the AWS Lambda application jar with Maven and the Scala application package with sbt:

```
$ cd $APP_DIR/source/clicklogger
$ mvn clean package
$ cd $APP_DIR/source/loggregator
$ sbt reload
$ sbt compile
$ sbt package
```

Deploy the AWS infrastructure using Terraform:

```
$ terraform init
$ terraform plan
$ terraform apply --auto-approve
```

### Testing

Once the application is built and deployed, you can insert sample data for the EMR processing. An example is shown below. Note that exec.sh runs multiple sample insertions against the ingestion Lambda; the ingested logs are used by the EMR Serverless application job.

The sample AWS CLI invoke command below inserts sample data for the application logs:

```
aws lambda invoke --function-name clicklogger-dev-ingestion-lambda --cli-binary-format raw-in-base64-out --payload '{"requestid":"OAP-guid-001","contextid":"OAP-ctxt-001","callerid":"OrderingApplication","component":"login","action":"load","type":"webpage"}' out
```

Validate the Deployments

* Output – once the Lambda is successfully executed, you should see the output in the S3 buckets as shown below
* Validate the saved ingested data as follows:
  * Navigate to the bucket created as part of the stack.
  * Select the file and view it from the "Select From" sub-tab.
  * You should see that the ingested stream was converted into a Parquet file.
  * Select the file and view the data. A sample is shown below

![Alt text](assets/s3_source_parquet_files.png?raw=true "Title")

* Run the AWS Step Functions state machine to validate the serverless application (a CLI sketch for checking the job run status directly follows this section):
  * Open AWS Console > AWS Step Functions > open "clicklogger-dev-state-machine".
  * The state machine shows the steps that ran to trigger the AWS Lambda and the job submission to the EMR Serverless application.
  * Start a new Step Functions execution to trigger the workflow with the sample input below. Set the date value to the date when the sample data was ingested to S3 with the ingestion Lambda.
```
{
    "InputDate": "2025-07-13"
}
```
  * You can run the same using the AWS CLI as below.
```
aws stepfunctions start-execution \
  --state-machine-arn "arn:aws:states:us-east-1:$(aws sts get-caller-identity --query Account --output text):stateMachine:clicklogger-dev-state-machine" \
  --name "test-execution-$(date +%s)" \
  --input '{"InputDate": "2025-07-13"}'
```
  * Once the AWS Step Functions execution is successful, you can view the output files using the AWS CLI:
```
# List all output files
aws s3 ls s3://us-east-1-clicklogger-dev-loggregator-output-$(aws sts get-caller-identity --query Account --output text)/ --recursive

# View the output file content
aws s3 cp s3://us-east-1-clicklogger-dev-loggregator-output-$(aws sts get-caller-identity --query Account --output text)/2025/07/13/response.md -
```
  * Or navigate to Amazon S3 > us-east-1-clicklogger-dev-loggregator-output- to see the output files.
  * These will be partitioned by year/month/date/response.md. A sample is shown below

![Alt text](assets/s3_output_response_file.png?raw=true "Title")
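The EMR job status check Lambda in this solution checks the state of the submitted job run. A minimal sketch of the same check from the AWS CLI is below; the application ID and job run ID are placeholders that you can copy from the Step Functions execution output or from the EMR Studio console.

```
# Sketch only -- replace the <...> placeholders with values from your deployment.
aws emr-serverless get-job-run \
  --application-id <emr-serverless-application-id> \
  --job-run-id <job-run-id> \
  --query 'jobRun.state' \
  --output text
# A healthy run typically moves through SUBMITTED/SCHEDULED/RUNNING and ends in SUCCESS.
```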
The AWS CLI can be used to check the deployed EMR Serverless application:

```
$ aws emr-serverless list-applications \
   | jq -r '.applications[] | select(.name=="clicklogger-dev-loggregrator-emr-").id'
```

![Alt text](assets/step_function_success.png?raw=true "Title")

EMR Studio

* Open the AWS Console and navigate to the "EMR" > "Serverless" tab on the left pane.
* Select "clicklogger-dev-studio" and click "Manage Applications".

![Alt text](assets/EMRStudioApplications.png?raw=true "Title")

![Alt text](assets/EMRServerlessApplication.png?raw=true "Title")

Reviewing the Serverless Application Output:

* Open the AWS Console and navigate to Amazon S3.
* Open the outputs S3 bucket. This will be like - us-east-1-clicklogger-dev-loggregator-output-
* The EMR Serverless application writes the output based on the date partition, for example
  * 2022/07/28/response.md
* The content of the file will look like below:

```
|*createdTime*|*callerid*|*component*|*count*
|------------|-----------|-----------|-------
*07-28-2022*|OrderingApplication|checkout|2
*07-28-2022*|OrderingApplication|login|2
*07-28-2022*|OrderingApplication|products|2
```

## Cleanup

The provided "./cleanup.sh" script deletes all the files from the Amazon S3 buckets that were created as part of this blog. The terraform destroy command cleans up the AWS infrastructure that was spun up as described above.

```
$ chmod +x cleanup.sh
$ ./cleanup.sh
```

* To do the steps manually:

The S3 buckets and created services can also be deleted using the CLI. Execute the below commands (an example; modify as needed):

```
# CLI Commands to delete the S3

APP_DIR=$PWD
APP_PREFIX=clicklogger
STAGE_NAME=dev
REGION=us-east-1

ACCOUNT_ID=$(aws sts get-caller-identity | jq -r '.Account')
echo $ACCOUNT_ID

aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-emr-logs-$ACCOUNT_ID --force
aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-firehose-delivery-$ACCOUNT_ID --force
aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-loggregator-output-$ACCOUNT_ID --force
aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-loggregator-source-$ACCOUNT_ID --force
aws s3 rb s3://$REGION-$APP_PREFIX-$STAGE_NAME-emr-studio-$ACCOUNT_ID --force

# Destroy the AWS Infrastructure
terraform destroy --auto-approve
```

## Conclusion

To recap, in this post we built, deployed, and ran a data processing Spark job on Amazon EMR Serverless that interacts with various AWS services. The post walked through deploying a Lambda packaged with Java using Maven and a Scala application for EMR Serverless, triggered with AWS Step Functions, all defined as infrastructure as code. You may use any combination of applicable programming languages to build your Lambda functions and EMR job applications.
EMR Serverless jobs can be triggered manually, on an automated schedule, or orchestrated using AWS services such as AWS Step Functions and Amazon Managed Workflows for Apache Airflow (MWAA).

We encourage you to test this example and see for yourself how this overall application design works within AWS. Then it is just a matter of replacing the code base with your own, packaging it, and letting Amazon EMR Serverless handle the processing efficiently.

If you implement this example and run into any issues, or have any questions or feedback about this blog, please provide your comments below!

## References

* [Terraform: Beyond the basics with AWS](https://aws.amazon.com/blogs/apn/terraform-beyond-the-basics-with-aws/)
* [Amazon EMR Serverless General Availability](https://aws.amazon.com/about-aws/whats-new/2022/06/amazon-emr-serverless-generally-available/)
* [Amazon EMR Serverless Now Generally Available – Run Big Data Applications without Managing Servers](https://aws.amazon.com/blogs/aws/amazon-emr-serverless-now-generally-available-run-big-data-applications-without-managing-servers/)
* [Provision AWS infrastructure using Terraform (By HashiCorp): an example of web application logging customer data](https://aws.amazon.com/blogs/developer/provision-aws-infrastructure-using-terraform-by-hashicorp-an-example-of-web-application-logging-customer-data/)

-------------------------------------------------------------------------------- /terraform/templates/roles.tf: --------------------------------------------------------------------------------

## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved
##
### SPDX-License-Identifier: MIT-0

data "aws_iam_policy_document" "AWSLambdaTrustPolicy" {
  statement {
    actions = ["sts:AssumeRole"]
    effect  = "Allow"
    principals {
      type        = "Service"
      identifiers = ["lambda.amazonaws.com"]
    }
  }
}

resource "aws_iam_role" "click_logger_emr_lambda_role" {
  name               = "${var.app_prefix}-${var.stage_name}-lambda-emr-role"
  assume_role_policy = data.aws_iam_policy_document.AWSLambdaTrustPolicy.json
}

resource "aws_iam_role_policy_attachment" "click_logger_emr_lambda_policy" {
  role       = aws_iam_role.click_logger_emr_lambda_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
}

resource "aws_iam_role_policy_attachment" "click_logger_emr_lambda_iam_role_policy_attachment_vpc_access_execution" {
  role       = aws_iam_role.click_logger_emr_lambda_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
}

resource "aws_iam_role_policy" "click_logger_emr_lambda_inline_policy" {
  name   = "${var.app_prefix}-${var.stage_name}-emr-lambda-inline_policy"
  role   = aws_iam_role.click_logger_emr_lambda_role.id
  policy = <