├── .github
│   └── PULL_REQUEST_TEMPLATE.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── build_and_push_to_s3.sh
├── cloudformation-templates
│   ├── allsteps_cf.template
│   ├── step1_vpc.template
│   ├── step2_iam.template
│   ├── step3_firehose.template
│   ├── step4_kinesisstream.template
│   ├── step5_emr.template
│   └── step6_ec2_instance.template
├── kinesis-lambda
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── optimize
│                       └── downstream
│                           ├── additionaldata
│                           │   ├── AdditionalIOTData.java
│                           │   └── TestAdditonalData.java
│                           └── lambda
│                               └── ProcessKinesisRecords.java
├── pom.xml
├── sample-kinesis-producer
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── optimize
│                       └── downstream
│                           ├── datagenerator
│                           │   ├── GenerateDataMain.java
│                           │   └── GenerateDataWorker.java
│                           ├── entry
│                           │   └── Main.java
│                           ├── samplekinesisproducer
│                           │   ├── IOTDevice.java
│                           │   ├── IOTDeviceConsumerFromBlockingQueueToKinesisStreams.java
│                           │   ├── IOTDeviceProducerToBlockingQueue.java
│                           │   └── SampleEvent.java
│                           └── sensors
│                               ├── Accelerometer.java
│                               ├── GPS.java
│                               ├── Illuminance.java
│                               └── TemperatureSensor.java
└── spark-process
    ├── pom.xml
    └── src
        └── main
            └── scala
                └── com
                    └── optimize
                        └── downstream
                            └── process
                                └── ProcessFilesFromS3AndConvertToParquet.scala
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 |
3 | *Description of changes:*
4 |
5 |
6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
7 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check [existing open](https://github.com/aws-samples/amazon-emr-optimize-data-processing/issues), or [recently closed](https://github.com/aws-samples/amazon-emr-optimize-data-processing/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/amazon-emr-optimize-data-processing/labels/help%20wanted) issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](https://github.com/aws-samples/amazon-emr-optimize-data-processing/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
62 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Optimizing downstream data processing with Amazon Kinesis Data Firehose and Amazon EMR running Apache Spark
2 |
3 | This repository contains the sample code and AWS CloudFormation templates for the AWS Big Data Blog post "Optimizing downstream data processing with Amazon Kinesis Data Firehose and Amazon EMR running Apache Spark".
4 |
5 | ## License Summary
6 |
7 | This sample code is made available under the MIT-0 license. See the LICENSE file.
8 |
9 | ## Solution Overview
10 |
11 | ### AWS Blog link
12 | ##### For the complete walkthrough, see the AWS Big Data Blog post @ ""
13 | ### The steps we follow in this blog post are (a sample CLI launch command follows this list):
14 | ##### 1. Create a virtual private cloud (VPC) and an Amazon S3 bucket.
15 | ##### 2. Provision a Kinesis data stream, and an AWS Lambda function to process the messages from the Kinesis data stream.
16 | ##### 3. Provision Kinesis Data Firehose to deliver the messages sent from the Lambda function in step 2 to Amazon S3. This step also provisions an Amazon EMR cluster to process the data in Amazon S3.
17 | ##### 4. Generate test data with custom code running on an Amazon EC2 instance.
18 | ##### 5. Run a sample Spark program from the Amazon EMR cluster's master instance to read the files from Amazon S3, convert them into Parquet format, and write them back to an Amazon S3 destination.
19 |
20 |
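The entire environment can be provisioned in one step with `allsteps_cf.template`, either from the CloudFormation console or from the AWS CLI. Below is a minimal sketch of a CLI launch; the stack name and the values in angle brackets are placeholders, while the parameter keys and the template URL come from this repository.

```bash
# Minimal sketch: launch the one-step (nested) CloudFormation stack in us-east-1.
# Parameter keys match the Parameters section of allsteps_cf.template;
# values in angle brackets are placeholders you must replace.
aws cloudformation create-stack \
  --stack-name optimize-downstream-data-processing \
  --region us-east-1 \
  --capabilities CAPABILITY_NAMED_IAM \
  --template-url https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/allsteps_cf.template \
  --parameters \
    ParameterKey=S3BucketName,ParameterValue=<your-unique-bucket-name> \
    ParameterKey=ClientIP,ParameterValue=<your-ip>/32 \
    ParameterKey=EMRClusterName,ParameterValue=<your-emr-cluster-name> \
    ParameterKey=KeyName,ParameterValue=<your-ec2-key-pair> \
    ParameterKey=InstanceType,ParameterValue=r4.xlarge
```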
--------------------------------------------------------------------------------
/build_and_push_to_s3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Change JAVA_HOME as per your java home location
4 | JAVA_HOME="/Library/Java/JavaVirtualMachines/jdk1.8.0_202.jdk/Contents/Home"
5 | S3_PATH="aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing"
6 |
7 |
8 | if [[ ! -d "${JAVA_HOME}" ]]; then
9 | echo "JAVA_HOME path not found. Check the path : ${JAVA_HOME}"
10 | exit 99;
11 | fi
12 | export JAVA_HOME=${JAVA_HOME}
13 | echo "Starting project build"
14 | mvn clean compile assembly:single
15 | echo "Project build completed"
16 |
17 | # Copy to east-1 region - blog assumes everything is created in us-east-1 (N.VIRGINIA) region.
18 |
19 | echo "Copying Files to S3 bucket path : ${S3_PATH}"
20 | # Kinesis Producer Jar file
21 | aws s3 cp sample-kinesis-producer/target/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar s3://${S3_PATH}/appjars/ --acl public-read
22 |
23 | # Lambda Jar file
24 | aws s3 cp kinesis-lambda/target/kinesis-lambda-1.0-SNAPSHOT-jar-with-dependencies.jar s3://${S3_PATH}/appjars/ --acl public-read
25 |
26 | # Spark scala code to process files in S3.
27 | aws s3 cp spark-process/target/spark-process-1.0-SNAPSHOT-jar-with-dependencies.jar s3://${S3_PATH}/appjars/ --acl public-read
28 |
29 | # Cloudformation templates
30 | aws s3 cp cloudformation-templates/allsteps_cf.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read
31 | aws s3 cp cloudformation-templates/step1_vpc.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read
32 | aws s3 cp cloudformation-templates/step2_iam.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read
33 | aws s3 cp cloudformation-templates/step3_firehose.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read
34 | aws s3 cp cloudformation-templates/step4_kinesisstream.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read
35 | aws s3 cp cloudformation-templates/step5_emr.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read
36 | aws s3 cp cloudformation-templates/step6_ec2_instance.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read
37 |
--------------------------------------------------------------------------------
/cloudformation-templates/allsteps_cf.template:
--------------------------------------------------------------------------------
1 | {
2 | "AWSTemplateFormatVersion": "2010-09-09",
3 | "Description": "AWS BLOGS - Root Template - One Step Solution",
4 | "Parameters": {
5 | "S3BucketName": {
6 | "Type": "String",
7 | "Description": "S3 Bucket Name that will be created in your account"
8 | },
9 | "ClientIP": {
10 | "Description": "The IP address range that can be used to connect to the EC2 instance from your local machine.It must be a valid IP CIDR range of the form x.x.x.x/x.Pls get your address using checkip.amazonaws.com or whatsmyip.org",
11 | "Type": "String",
12 | "MinLength": "9",
13 | "MaxLength": "18",
14 | "Default": "0.0.0.0/0",
15 | "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})",
16 | "ConstraintDescription": "It must be a valid IP CIDR range of the form x.x.x.x/x. Suggest to enable access to your IP address only. Pls get your address using checkip.amazonaws.com or whatsmyip.org."
17 | },
18 | "FirehoseDeliveryStreamName": {
19 | "Type": "String",
20 | "Description": "Name of the Amazon Firehose delivery stream. Default value is set to 'AWSBlogs-LambdaToFireHose'",
21 | "Default": "AWSBlogs-LambdaToFireHose",
22 | "AllowedValues": [
23 | "AWSBlogs-LambdaToFireHose"
24 | ]
25 | },
26 | "KinesisStreamName": {
27 | "Type": "String",
28 | "Description" : "Name of the Amazon Kinesis stream. Default value is set to 'AWS-Blog-BaseKinesisStream'",
29 | "Default": "AWS-Blog-BaseKinesisStream",
30 | "AllowedValues": [
31 | "AWS-Blog-BaseKinesisStream"
32 | ]
33 | },
34 | "Region": {
35 | "Description": "AWS Region - Select us-east-1 by default.",
36 | "Type": "String",
37 | "Default": "us-east-1",
38 | "AllowedValues": [
39 | "us-east-1"
40 | ]
41 | },
42 | "EMRClusterName": {
43 | "Type": "String",
44 | "Description": "ClusterName"
45 | },
46 | "KeyName": {
47 | "Description": "Name of an existing EC2 key pair to access the Amazon EMR cluster",
48 | "Type": "AWS::EC2::KeyPair::KeyName"
49 | },
50 | "InstanceType": {
51 | "Description": "EC2 instance specs configuration",
52 | "Type": "String",
53 | "Default": "r4.xlarge",
54 | "AllowedValues": [
55 | "r4.xlarge",
56 | "r4.2xlarge",
57 | "r4.4xlarge"
58 | ]
59 | }
60 | },
61 | "Resources": {
62 | "STEP1": {
63 | "Type": "AWS::CloudFormation::Stack",
64 | "Properties": {
65 | "TemplateURL": {
66 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step1_vpc.template"
67 | },
68 | "Parameters": {
69 | "S3BucketName": {
70 | "Ref": "S3BucketName"
71 | },
72 | "ClientIP": {
73 | "Ref": "ClientIP"
74 | }
75 | }
76 | }
77 | },
78 | "STEP2": {
79 | "Type": "AWS::CloudFormation::Stack",
80 | "DependsOn":"STEP1",
81 | "Properties": {
82 | "TemplateURL": {
83 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step2_iam.template"
84 | }
85 | }
86 | },
87 | "STEP3": {
88 | "Type": "AWS::CloudFormation::Stack",
89 | "DependsOn":"STEP2",
90 | "Properties": {
91 | "TemplateURL": {
92 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step3_firehose.template"
93 | },
94 | "Parameters": {
95 | "Role": {
96 | "Fn::GetAtt": [
97 | "STEP2",
98 | "Outputs.FirehoseRoleArn"
99 | ]
100 | },
101 | "S3BucketARN": {
102 | "Fn::GetAtt": [
103 | "STEP1",
104 | "Outputs.S3BucketARN"
105 | ]
106 | },
107 | "FirehoseDeliveryStreamName": {
108 | "Ref": "FirehoseDeliveryStreamName"
109 | }
110 | }
111 | }
112 | },
113 | "STEP4": {
114 | "Type": "AWS::CloudFormation::Stack",
115 | "DependsOn":"STEP3",
116 | "Properties": {
117 | "TemplateURL": {
118 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step4_kinesisstream.template"
119 | },
120 | "Parameters": {
121 | "Role": {
122 | "Fn::GetAtt": [
123 | "STEP2",
124 | "Outputs.LambdaRoleArn"
125 | ]
126 | },
127 | "Region": {
128 | "Ref": "Region"
129 | },
130 | "S3Bucket": {
131 | "Ref": "S3BucketName"
132 | },
133 | "KinesisStreamName": {
134 | "Ref": "KinesisStreamName"
135 | }
136 | }
137 | }
138 | },
139 | "STEP5": {
140 | "Type": "AWS::CloudFormation::Stack",
141 | "DependsOn":"STEP4",
142 | "Properties": {
143 | "TemplateURL": {
144 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step5_emr.template"
145 | },
146 | "Parameters": {
147 | "EMRClusterName": {
148 | "Ref": "EMRClusterName"
149 | },
150 | "ClusterSecurityGroup": {
151 | "Fn::GetAtt": [
152 | "STEP1",
153 | "Outputs.SecurityGroup"
154 | ]
155 | },
156 | "ClusterSubnetID": {
157 | "Fn::GetAtt": [
158 | "STEP1",
159 | "Outputs.SubnetID"
160 | ]
161 | },
162 | "KeyName": {
163 | "Ref": "KeyName"
164 | }
165 | }
166 | }
167 | },
168 | "STEP6": {
169 | "Type": "AWS::CloudFormation::Stack",
170 | "DependsOn":"STEP5",
171 | "Properties": {
172 | "TemplateURL": {
173 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step6_ec2_instance.template"
174 | },
175 | "Parameters": {
176 | "EC2SecurityGroupId": {
177 | "Fn::GetAtt": [
178 | "STEP1",
179 | "Outputs.SecurityGroup"
180 | ]
181 | },
182 | "KeyName": {
183 | "Ref": "KeyName"
184 | },
185 | "EC2Subnet": {
186 | "Fn::GetAtt": [
187 | "STEP1",
188 | "Outputs.SubnetID"
189 | ]
190 | },
191 | "InstanceType": {
192 | "Ref": "InstanceType"
193 | }
194 | }
195 | }
196 | }
197 | }
198 | }
--------------------------------------------------------------------------------
/cloudformation-templates/step1_vpc.template:
--------------------------------------------------------------------------------
1 | {
2 | "AWSTemplateFormatVersion": "2010-09-09",
3 | "Description": "AWS BLOGS - This template creates an Amazon VPC and subnet with the required configuration.",
4 | "Parameters": {
5 | "S3BucketName": {
6 | "Type": "String"
7 | },
8 | "ClientIP": {
9 | "Description": "The IP address range that can be used to connect to the EC2 instance from your local machine.It must be a valid IP CIDR range of the form x.x.x.x/x.Pls get your address using checkip.amazonaws.com or whatsmyip.org",
10 | "Type": "String",
11 | "MinLength": "9",
12 | "MaxLength": "18",
13 | "Default": "0.0.0.0/0",
14 | "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})",
15 | "ConstraintDescription": "It must be a valid IP CIDR range of the form x.x.x.x/x. Suggest to enable access to your IP address only. Pls get your address using checkip.amazonaws.com or whatsmyip.org."
16 | }
17 | },
18 | "Resources": {
19 | "VPC": {
20 | "Type": "AWS::EC2::VPC",
21 | "Properties": {
22 | "CidrBlock": "10.0.0.0/16",
23 | "EnableDnsSupport": true,
24 | "EnableDnsHostnames": true,
25 | "InstanceTenancy": "default",
26 | "Tags": [{
27 | "Key": "Name",
28 | "Value": "awsblog-small-files-vpc"
29 | }]
30 | }
31 | },
32 | "PublicSubnet": {
33 | "Type": "AWS::EC2::Subnet",
34 | "Properties": {
35 | "VpcId": {
36 | "Ref": "VPC"
37 | },
38 | "CidrBlock": "10.0.1.0/24",
39 | "MapPublicIpOnLaunch": "True",
40 | "Tags": [{
41 | "Key": "Name",
42 | "Value": "awsblog-small-files-subnet"
43 | }]
44 | }
45 | },
46 | "InternetGateway": {
47 | "Type": "AWS::EC2::InternetGateway",
48 | "Properties": {
49 | "Tags": [{
50 | "Key": "Name",
51 | "Value": "awsblog-small-files-gateway"
52 | }]
53 | }
54 | },
55 | "MyGatewayAttachment": {
56 | "Type": "AWS::EC2::VPCGatewayAttachment",
57 | "Properties": {
58 | "InternetGatewayId": {
59 | "Ref": "InternetGateway"
60 | },
61 | "VpcId": {
62 | "Ref": "VPC"
63 | }
64 | }
65 | },
66 | "PublicRouteTable": {
67 | "Type": "AWS::EC2::RouteTable",
68 | "Properties": {
69 | "VpcId": {
70 | "Ref": "VPC"
71 | }
72 | }
73 | },
74 | "PublicRoute": {
75 | "Type": "AWS::EC2::Route",
76 | "Properties": {
77 | "RouteTableId": {
78 | "Ref": "PublicRouteTable"
79 | },
80 | "DestinationCidrBlock": "0.0.0.0/0",
81 | "GatewayId": {
82 | "Ref": "InternetGateway"
83 | }
84 | },
85 | "DependsOn": [
86 | "MyGatewayAttachment"
87 | ]
88 | },
89 | "PublicSubnetRouteAssociation": {
90 | "Type": "AWS::EC2::SubnetRouteTableAssociation",
91 | "Properties": {
92 | "RouteTableId": {
93 | "Ref": "PublicRouteTable"
94 | },
95 | "SubnetId": {
96 | "Ref": "PublicSubnet"
97 | }
98 | }
99 | },
100 | "InstanceSecurityGroup": {
101 | "Type": "AWS::EC2::SecurityGroup",
102 | "Properties": {
103 | "GroupDescription": "CloudFormationGroup",
104 | "VpcId": {
105 | "Ref": "VPC"
106 | },
107 | "SecurityGroupIngress": [{
108 | "IpProtocol": "tcp",
109 | "CidrIp": { "Ref" : "ClientIP"},
110 | "FromPort": "22",
111 | "ToPort": "22"
112 | }],
113 | "SecurityGroupEgress": [
114 | {
115 | "CidrIp": "0.0.0.0/0",
116 | "IpProtocol": "-1",
117 | "FromPort": -1,
118 | "ToPort": -1
119 | }
120 | ],
121 | "Tags": [{
122 | "Key": "Name",
123 | "Value": "awsblog-small-files-securitygroup"
124 | }]
125 | }
126 | },
127 | "VPCDefaultSecurityGroupIngress": {
128 | "Type": "AWS::EC2::SecurityGroupIngress",
129 | "Properties": {
130 | "GroupId": {
131 | "Fn::GetAtt": ["InstanceSecurityGroup", "GroupId"]
132 | },
133 | "IpProtocol": "-1",
134 | "FromPort": "-1",
135 | "ToPort": "-1",
136 | "SourceSecurityGroupId": {
137 | "Fn::GetAtt": [
138 | "InstanceSecurityGroup",
139 | "GroupId"
140 | ]
141 | }
142 | }
143 | },
144 | "S3Bucket": {
145 | "Type": "AWS::S3::Bucket",
146 | "Properties": {
147 | "BucketName": {
148 | "Ref": "S3BucketName"
149 | }
150 | }
151 | }
152 | },
153 | "Outputs": {
154 | "StackName": {
155 | "Value": {
156 | "Ref": "AWS::StackName"
157 | }
158 | },
159 | "SubnetID": {
160 | "Description": "Use this subnet ID for your other AWS resources",
161 | "Value": {
162 | "Ref": "PublicSubnet"
163 | }
164 | },
165 | "SecurityGroup": {
166 | "Description": "Use this security group ID for your other AWS resources.",
167 | "Value": {
168 | "Fn::GetAtt": ["InstanceSecurityGroup", "GroupId"]
169 | }
170 | },
171 | "VPCID": {
172 | "Description": "Use this VPC ID for your other AWS resources..",
173 | "Value": {
174 | "Ref": "VPC"
175 | }
176 | },
177 | "S3BucketDomain": {
178 | "Description": "S3 Bucket Domain that was created",
179 | "Value": {
180 | "Fn::GetAtt": ["S3Bucket", "DomainName" ]
181 | }
182 | },
183 | "S3BucketARN": {
184 | "Description": "S3 Bucket ARN that was created",
185 | "Value": {
186 | "Fn::GetAtt": ["S3Bucket", "Arn" ]
187 | }
188 | }
189 | }
190 | }
191 |
--------------------------------------------------------------------------------
/cloudformation-templates/step2_iam.template:
--------------------------------------------------------------------------------
1 | {
2 | "AWSTemplateFormatVersion": "2010-09-09",
3 | "Description": "AWS BLOGS - IAM Roles required for the AWS Services",
4 | "Resources": {
5 | "LambdaRole": {
6 | "Type": "AWS::IAM::Role",
7 | "Properties": {
8 | "RoleName": "small-files-lambdarole",
9 | "AssumeRolePolicyDocument": {
10 | "Version": "2012-10-17",
11 | "Statement": [
12 | {
13 | "Effect": "Allow",
14 | "Principal": {
15 | "Service": "lambda.amazonaws.com"
16 | },
17 | "Action": "sts:AssumeRole"
18 | }
19 | ]
20 | },
21 | "Path": "/",
22 | "Policies": [
23 | {
24 | "PolicyName": "root",
25 | "PolicyDocument": {
26 | "Version": "2012-10-17",
27 | "Statement": {
28 | "Effect": "Allow",
29 | "Action": [
30 | "ec2:*",
31 | "logs:*",
32 | "kinesis:*",
33 | "firehose:*",
34 | "s3:*"
35 | ],
36 | "Resource": "*"
37 | }
38 | }
39 | }
40 | ]
41 | }
42 | },
43 | "FirehoseRole": {
44 | "Type": "AWS::IAM::Role",
45 | "Properties": {
46 | "RoleName": "small-files-firehoserole",
47 | "AssumeRolePolicyDocument": {
48 | "Version": "2012-10-17",
49 | "Statement": [
50 | {
51 | "Effect": "Allow",
52 | "Principal": {
53 | "Service": "firehose.amazonaws.com"
54 | },
55 | "Action": "sts:AssumeRole"
56 | }
57 | ]
58 | },
59 | "Path": "/",
60 | "Policies": [
61 | {
62 | "PolicyName": "root",
63 | "PolicyDocument": {
64 | "Version": "2012-10-17",
65 | "Statement": {
66 | "Effect": "Allow",
67 | "Action": [
68 | "s3:*"
69 | ],
70 | "Resource": "*"
71 | }
72 | }
73 | }
74 | ]
75 | }
76 | }
77 | },
78 | "Outputs": {
79 | "FirehoseRoleArn": {
80 | "Value": {
81 | "Fn::GetAtt": [
82 | "FirehoseRole",
83 | "Arn"
84 | ]
85 | }
86 | },
87 | "LambdaRoleArn": {
88 | "Value": {
89 | "Fn::GetAtt": [
90 | "LambdaRole",
91 | "Arn"
92 | ]
93 | }
94 | }
95 | }
96 | }
--------------------------------------------------------------------------------
/cloudformation-templates/step3_firehose.template:
--------------------------------------------------------------------------------
1 | {
2 | "AWSTemplateFormatVersion": "2010-09-09",
3 | "Description": "AWS BLOGS - Creating Amazon Kinesis Firehose Delivery Stream",
4 | "Parameters": {
5 | "Role": {
6 | "Type": "String",
7 | "Description" : "Fire Hose IAM Role ARN that was created as part of the Cloudformation template 2."
8 |
9 | },
10 | "S3BucketARN": {
11 | "Type": "String",
12 | "Description" : "S3 Bucket ARN that was created as part of the Cloudformation template 1."
13 | }
14 | ,
15 | "FirehoseDeliveryStreamName": {
16 | "Type": "String",
17 | "Description" : "Name of the Amazon Firehose delivery stream. Default value is set to 'AWSBlogs-LambdaToFireHose'",
18 | "Default": "AWSBlogs-LambdaToFireHose",
19 | "AllowedValues": [
20 | "AWSBlogs-LambdaToFireHose"
21 | ]
22 | }
23 | },
24 | "Resources": {
25 | "KinesisDeliveryStreamFromLambda": {
26 | "Type": "AWS::KinesisFirehose::DeliveryStream",
27 | "Properties": {
28 | "DeliveryStreamName": {"Ref": "FirehoseDeliveryStreamName"},
29 | "DeliveryStreamType": "DirectPut",
30 | "ExtendedS3DestinationConfiguration": {
31 | "BufferingHints": {
32 | "IntervalInSeconds": 300,
33 | "SizeInMBs": 128
34 | },
35 | "CompressionFormat": "UNCOMPRESSED",
36 | "BucketARN": {"Ref":"S3BucketARN"},
37 | "Prefix": "fromfirehose/",
38 | "RoleARN": {"Ref":"Role"}
39 | }
40 | }
41 | }
42 | }
43 | }
--------------------------------------------------------------------------------
/cloudformation-templates/step4_kinesisstream.template:
--------------------------------------------------------------------------------
1 | {
2 | "AWSTemplateFormatVersion": "2010-09-09",
3 | "Description": "AWS BLOGS - Kinesis Stream and a Lambda Function",
4 | "Parameters": {
5 | "Role": {
6 | "Description": "IAM Role created for Lambda Function as part of the 2nd CF template. Get the value from the output of 2nd CF template.",
7 | "Type": "String"
8 | },
9 | "Region": {
10 | "Description": "AWS Region - Select us-east-1 by default.",
11 | "Type": "String",
12 | "Default": "us-east-1",
13 | "AllowedValues": [
14 | "us-east-1"
15 | ]
16 | },
17 | "KinesisStreamName": {
18 | "Type": "String",
19 | "Description" : "Name of the Amazon Kinesis stream. Default value is set to 'AWS-Blog-BaseKinesisStream'",
20 | "Default": "AWS-Blog-BaseKinesisStream",
21 | "AllowedValues": [
22 | "AWS-Blog-BaseKinesisStream"
23 | ]
24 | },
25 | "S3Bucket": {
26 | "Description": "Existing S3 Bucket name that was created using 1st CF template. Do not use the domain name, just provide the bucket name.",
27 | "Type": "String"
28 | }
29 | },
30 | "Resources": {
31 | "BaseKinesisStream": {
32 | "Type": "AWS::Kinesis::Stream",
33 | "Properties": {
34 | "Name": {"Ref": "KinesisStreamName"},
35 | "ShardCount": "10"
36 | }
37 | },
38 | "LambdaProcessKinesisRecords": {
39 | "Type": "AWS::Lambda::Function",
40 | "DependsOn": "BaseKinesisStream",
41 | "Properties": {
42 | "Code": {
43 | "S3Bucket": "aws-bigdata-blog",
44 | "S3Key": {
45 | "Fn::Sub": "artifacts/aws-blog-avoid-small-files/appjars/kinesis-lambda-1.0-SNAPSHOT-jar-with-dependencies.jar"
46 | }
47 | },
48 | "Description": "AWS BLOGS - Processing Incoming Kinesis Records",
49 | "FunctionName": "LambdaForProcessingKinesisRecords",
50 | "Handler": "com.awsblogs.smallfiles.lambda.ProcessKinesisRecords",
51 | "Role": {
52 | "Ref": "Role"
53 | },
54 | "Runtime": "java8",
55 | "MemorySize": 1920,
56 | "Timeout": 300,
57 | "Environment": {
58 | "Variables": {
59 | "kinesis_region": {
60 | "Ref": "Region"
61 | },
62 | "kinesis_stream_name": {
63 | "Ref": "BaseKinesisStream"
64 | },
65 | "s3region": {
66 | "Ref": "Region"
67 | },
68 | "s3bucketName": {
69 | "Ref": "S3Bucket"
70 | },
71 | "s3directorySub": "raw-from-firehose/",
72 | "kinesisfirehosestream" : "AWSBlogs-LambdaToFireHose"
73 | }
74 | }
75 | }
76 | },
77 | "KinesisLambdaEventTrigger": {
78 | "Type" : "AWS::Lambda::EventSourceMapping",
79 | "DependsOn": [
80 | "BaseKinesisStream", "LambdaProcessKinesisRecords"
81 | ],
82 | "Properties" : {
83 | "BatchSize" : 100,
84 | "Enabled" : true,
85 | "EventSourceArn" : {
86 | "Fn::GetAtt": ["BaseKinesisStream", "Arn" ]
87 | },
88 | "FunctionName" : {"Ref":"LambdaProcessKinesisRecords"},
89 | "StartingPosition" : "TRIM_HORIZON"
90 | }
91 | }
92 | }
93 | }
--------------------------------------------------------------------------------
/cloudformation-templates/step5_emr.template:
--------------------------------------------------------------------------------
1 | {
2 | "AWSTemplateFormatVersion": "2010-09-09",
3 | "Description": "AWS BLOGS - Creating an EMR cluster",
4 | "Parameters": {
5 | "EMRClusterName": {
6 | "Type": "String",
7 | "Description": "ClusterName"
8 | },
9 | "ClusterSecurityGroup": {
10 | "Description": "ID of an existing security-group for the for the Amazon EMR cluster",
11 | "Type": "AWS::EC2::SecurityGroup::Id"
12 | },
13 | "ClusterSubnetID": {
14 | "Description": "ID of an existing subnet for the Amazon EMR cluster",
15 | "Type": "AWS::EC2::Subnet::Id"
16 | },
17 | "KeyName": {
18 | "Description": "Name of an existing EC2 key pair to access the Amazon EMR cluster",
19 | "Type": "AWS::EC2::KeyPair::KeyName"
20 | }
21 | },
22 | "Resources": {
23 | "EMRCluster": {
24 | "Properties":
25 | {
26 | "Name": {
27 | "Ref": "EMRClusterName"
28 | },
29 | "Applications": [{
30 | "Name": "Spark"
31 | },
32 | {
33 | "Name": "Ganglia"
34 | },
35 | {
36 | "Name": "hive"
37 | }
38 | ],
39 | "Configurations": [{
40 | "Classification": "spark",
41 | "ConfigurationProperties": {
42 | "maximizeResourceAllocation": "true"
43 | }
44 | },
45 | {
46 | "Classification": "spark-hive-site",
47 | "ConfigurationProperties": {
48 | "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
49 | }
50 | }
51 | ],
52 | "Instances": {
53 | "CoreInstanceGroup": {
54 | "InstanceCount": 2,
55 | "InstanceType": "r4.xlarge",
56 | "Name": "Core Instance Group"
57 | },
58 | "Ec2KeyName": {
59 | "Ref": "KeyName"
60 | },
61 | "Ec2SubnetId": {"Ref": "ClusterSubnetID"},
62 | "MasterInstanceGroup": {
63 | "InstanceCount": 1,
64 | "InstanceType": "r4.xlarge",
65 | "Name": "Master Instance Group"
66 | },
67 | "AdditionalMasterSecurityGroups": [{
68 | "Ref": "ClusterSecurityGroup"
69 | }],
70 | "AdditionalSlaveSecurityGroups": [{
71 | "Ref": "ClusterSecurityGroup"
72 | }]
73 | },
74 | "JobFlowRole": "EMR_EC2_DefaultRole",
75 | "ServiceRole": "EMR_DefaultRole",
76 | "ReleaseLabel": "emr-5.16.0",
77 | "VisibleToAllUsers": "true"
78 | },
79 | "Type": "AWS::EMR::Cluster"
80 | }
81 | },
82 | "Outputs": {
83 | "EMRClusterMaster": {
84 | "Description": "SSH Connection String to EMR Master Instance",
85 | "Value" : {
86 | "Fn::Join" : [
87 | "",
88 | [
89 | "ssh hadoop@",
90 | {
91 | "Fn::GetAtt" : [
92 | "EMRCluster",
93 | "MasterPublicDNS"
94 | ]
95 | },
96 | " -i ",
97 | {
98 | "Ref" : "KeyName"
99 | },
100 | ".pem"
101 | ]
102 | ]
103 | }
104 | }
105 | }
106 | }
--------------------------------------------------------------------------------
/cloudformation-templates/step6_ec2_instance.template:
--------------------------------------------------------------------------------
1 | {
2 | "AWSTemplateFormatVersion" : "2010-09-09",
3 | "Description": "AWS BLOGS - Creating Amazon EC2 Instance For test dataset generation and loading into Kinesis.",
4 | "Parameters":
5 | {
6 | "EC2SecurityGroupId": {
7 | "Description": "Existing security Group",
8 | "Type":"AWS::EC2::SecurityGroup::Id"
9 | },
10 | "KeyName": {
11 | "Description": "Name of an existing EC2 KeyPair to enable SSH access to the instance",
12 | "Type": "AWS::EC2::KeyPair::KeyName",
13 | "ConstraintDescription": "must be the name of an existing EC2 KeyPair."
14 | },
15 | "EC2Subnet": {
16 | "Description": "Subnet List - Choose the Subnet",
17 | "Type": "AWS::EC2::Subnet::Id"
18 | },
19 | "InstanceType" : {
20 | "Description" : "EC2 instance specs configuration",
21 | "Type" : "String",
22 | "Default" : "r4.xlarge",
23 | "AllowedValues" : ["r4.xlarge","r4.2xlarge", "r4.4xlarge"]
24 | }
25 | },
26 | "Mappings": {
27 | "AWSInstanceType2Arch": {
28 | "r4.xlarge": {
29 | "Arch": "HVM64"
30 | },
31 | "r4.2xlarge": {
32 | "Arch": "HVM64"
33 | },
34 | "r4.4xlarge": {
35 | "Arch": "HVM64"
36 | }
37 | },
38 | "AWSRegionArch2AMI": {
39 | "us-west-2": {
40 | "HVM64": "ami-6cd6f714"
41 | },
42 | "us-east-1": {
43 | "HVM64": "ami-1853ac65"
44 | }
45 | }
46 | },
47 | "Resources" : {
48 | "EC2IAMRole": {
49 | "Type": "AWS::IAM::Role",
50 | "Properties": {
51 | "RoleName": "small-files-ec2role",
52 | "AssumeRolePolicyDocument": {
53 | "Version": "2012-10-17",
54 | "Statement": [
55 | {
56 | "Effect": "Allow",
57 | "Principal": {
58 | "Service": "ec2.amazonaws.com"
59 | },
60 | "Action": "sts:AssumeRole"
61 | }
62 | ]
63 | },
64 | "Path": "/",
65 | "Policies": [
66 | {
67 | "PolicyName": "root",
68 | "PolicyDocument": {
69 | "Version": "2012-10-17",
70 | "Statement": {
71 | "Effect": "Allow",
72 | "Action": [
73 | "ec2:*",
74 | "logs:*",
75 | "kinesis:*",
76 | "firehose:*",
77 | "s3:*"
78 | ],
79 | "Resource": "*"
80 | }
81 | }
82 | }
83 | ]
84 | }
85 | },
86 | "EC2InstanceProfile" : {
87 | "Type" : "AWS::IAM::InstanceProfile",
88 | "DependsOn": [
89 | "EC2IAMRole"
90 | ],
91 | "Properties" : {
92 | "Path" : "/",
93 | "Roles" : [
94 | {
95 | "Ref" : "EC2IAMRole"
96 | }
97 | ]
98 | }
99 | },
100 | "EC2InstanceForDataLoadingIntoKinesis" : {
101 | "Type" : "AWS::EC2::Instance",
102 | "DependsOn": [
103 | "EC2InstanceProfile"
104 | ],
105 | "Properties" : {
106 | "KeyName" : { "Ref" : "KeyName" },
107 | "InstanceType" : { "Ref" : "InstanceType" },
108 | "ImageId" : { "Fn::FindInMap" : [ "AWSRegionArch2AMI", { "Ref" : "AWS::Region" },
109 | { "Fn::FindInMap" : [ "AWSInstanceType2Arch", { "Ref" : "InstanceType" }, "Arch" ] } ] },
110 | "SecurityGroupIds" : [
111 | {
112 | "Ref" : "EC2SecurityGroupId"
113 | }
114 | ],
115 | "SubnetId": {"Ref": "EC2Subnet"},
116 | "IamInstanceProfile": {"Ref": "EC2InstanceProfile"},
117 | "Tags" : [
118 | {
119 | "Key" : "Name",
120 | "Value" : "AWS-BLOGs-Small-Files-EC2-For-DataLoading"
121 | }
122 | ],
123 | "UserData" : {"Fn::Base64" : { "Fn::Join" : ["",[
124 | "#!/bin/bash -ex","\n",
125 | "\n","sudo yum install -y java-1.8.0-openjdk-devel.x86_64","\n",
126 | "\n","aws s3 cp s3://aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/appjars/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar .","\n",
127 | "\n","sudo cp /sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar /home/ec2-user/","\n",
128 | "\n","sudo chown -R ec2-user:ec2-user /home/ec2-user/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar","\n",
129 | "\n","sudo chmod -R 755 /home/ec2-user/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar","\n"
130 | ]]}
131 | }
132 | }
133 | }
134 | },
135 | "Outputs" : {
136 | "EC2Instance" : {
137 | "Description" : "EC2 IP address",
138 | "Value" : {
139 | "Fn::Join" : [
140 | "",
141 | [
142 | "ssh ec2-user@",
143 | {
144 | "Fn::GetAtt" : [
145 | "EC2InstanceForDataLoadingIntoKinesis",
146 | "PublicIp"
147 | ]
148 | },
149 | " -i ",
150 | {
151 | "Ref" : "KeyName"
152 | },
153 | ".pem"
154 | ]
155 | ]
156 | }
157 | }
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/kinesis-lambda/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <version>1.0-SNAPSHOT</version>
7 |         <groupId>com.optimize.downstream</groupId>
8 |         <artifactId>data-processing</artifactId>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 | 
12 |     <artifactId>kinesis-lambda</artifactId>
13 |     <packaging>jar</packaging>
14 | 
15 | 
30 | 
31 |     <properties>
32 |         <skip.assembly>false</skip.assembly>
33 |     </properties>
34 | 
35 |     <dependencies>
36 |         <dependency>
37 |             <groupId>com.amazonaws</groupId>
38 |             <artifactId>aws-lambda-java-core</artifactId>
39 |             <version>1.2.0</version>
40 |         </dependency>
41 | 
42 |         <dependency>
43 |             <groupId>com.amazonaws</groupId>
44 |             <artifactId>aws-lambda-java-events</artifactId>
45 |             <version>2.2.2</version>
46 |         </dependency>
47 | 
48 |         <dependency>
49 |             <groupId>com.amazonaws</groupId>
50 |             <artifactId>aws-java-sdk-core</artifactId>
51 |             <version>1.11.362</version>
52 |         </dependency>
53 | 
54 |         <dependency>
55 |             <groupId>com.amazonaws</groupId>
56 |             <artifactId>aws-java-sdk-kinesis</artifactId>
57 |             <version>1.11.362</version>
58 |         </dependency>
59 | 
60 |         <dependency>
61 |             <groupId>com.google.code.gson</groupId>
62 |             <artifactId>gson</artifactId>
63 |             <version>2.8.0</version>
64 |         </dependency>
65 | 
66 | 
67 |         <dependency>
68 |             <groupId>org.json</groupId>
69 |             <artifactId>json</artifactId>
70 |             <version>20180813</version>
71 |         </dependency>
72 | 
73 |     </dependencies>
74 | 
75 | 
76 | </project>
77 | 
--------------------------------------------------------------------------------
/kinesis-lambda/src/main/java/com/optimize/downstream/additionaldata/AdditionalIOTData.java:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.additionaldata;
2 |
3 | import java.util.*;
4 |
5 | public class AdditionalIOTData
6 | {
7 | private String hashDeviceId;
8 | private String processedDate;
9 | private List<TestAdditonalData> testAdditonalDataList;
10 | private Random random = new Random();
11 |
12 | public AdditionalIOTData(String hashDeviceId,
13 | String processedDate,
14 | List<TestAdditonalData> testAdditonalDataList)
15 | {
16 | this.hashDeviceId = hashDeviceId;
17 | this.processedDate = processedDate;
18 | this.testAdditonalDataList = testAdditonalDataList;
19 | }
20 |
21 | public String getHashDeviceId() {
22 | return hashDeviceId;
23 | }
24 |
25 | public String getProcessedDate() {
26 | return processedDate;
27 | }
28 |
29 | public List<TestAdditonalData> getTestAdditonalDataList() {
30 | return testAdditonalDataList;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/kinesis-lambda/src/main/java/com/optimize/downstream/additionaldata/TestAdditonalData.java:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.additionaldata;
2 |
3 | public class TestAdditonalData
4 | {
5 | private int dimension_X;
6 | private int dimension_Y;
7 | private int dimension_Z;
8 |
9 | public void setDimension_Y(int dimension_Y) {
10 | this.dimension_Y = dimension_Y;
11 | }
12 |
13 | public void setDimension_Z(int dimension_Z) {
14 | this.dimension_Z = dimension_Z;
15 | }
16 |
17 | public void setDimension_X(int dimension_X)
18 | {
19 | this.dimension_X = dimension_X;
20 | }
21 |
22 | public int getDimension_X() {
23 | return dimension_X;
24 | }
25 |
26 | public int getDimension_Y() {
27 | return dimension_Y;
28 | }
29 |
30 | public int getDimension_Z() {
31 | return dimension_Z;
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/kinesis-lambda/src/main/java/com/optimize/downstream/lambda/ProcessKinesisRecords.java:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.lambda;
2 |
3 | import com.amazonaws.ClientConfiguration;
4 | import com.amazonaws.regions.Region;
5 | import com.amazonaws.regions.Regions;
6 | import com.amazonaws.services.kinesisfirehose.AmazonKinesisFirehoseClient;
7 | import com.amazonaws.services.kinesisfirehose.model.PutRecordRequest;
8 | import com.amazonaws.services.kinesisfirehose.model.PutRecordResult;
9 | import com.amazonaws.services.kinesisfirehose.model.Record;
10 | import com.amazonaws.services.lambda.runtime.Context;
11 | import com.amazonaws.services.lambda.runtime.RequestHandler;
12 | import com.amazonaws.services.lambda.runtime.events.KinesisEvent;
13 | import com.amazonaws.services.lambda.runtime.events.KinesisEvent.KinesisEventRecord;
14 | import com.optimize.downstream.additionaldata.AdditionalIOTData;
15 | import com.optimize.downstream.additionaldata.TestAdditonalData;
16 | import com.google.gson.Gson;
17 | import org.json.JSONObject;
18 | import java.io.ByteArrayOutputStream;
19 | import java.io.IOException;
20 | import java.nio.ByteBuffer;
21 | import java.nio.charset.Charset;
22 | import java.text.DateFormat;
23 | import java.text.SimpleDateFormat;
24 | import java.util.*;
25 | import java.util.zip.GZIPOutputStream;
26 |
27 | public class ProcessKinesisRecords implements RequestHandler<KinesisEvent, Void> {
28 | private static Charset charset = Charset.forName("UTF-8");
29 | private String hashDeviceId;
30 | private String processedDate;
31 | private Random random = new Random();
32 | private Gson gson;
33 | private AmazonKinesisFirehoseClient kinesisFirehoseClient;
34 | private String firehoseStreamName = System.getenv("kinesisfirehosestream");
35 | private String REGION=System.getenv("kinesis_region");
36 |
37 | @Override
38 | public Void handleRequest(KinesisEvent event, Context context)
39 | {
40 | //System.out.print("In Handle Request");
41 | gson = new Gson();
42 | ClientConfiguration config = new ClientConfiguration();
43 | config.setMaxErrorRetry(5);
44 | config.setSocketTimeout(100);
45 | kinesisFirehoseClient = new AmazonKinesisFirehoseClient(config);
46 | kinesisFirehoseClient.setRegion(Region.getRegion(Regions.fromName(REGION)));
47 | String mergedJsonString = "";
48 | String recordId;
49 | try {
50 | for (KinesisEventRecord rec : event.getRecords())
51 | {
52 | //System.out.println(new String(rec.getKinesis().getData().array()));
53 | String jsonMessage = new String(rec.getKinesis().getData().array());
54 | //System.out.println("Kinesis JSON Message is ::: ");
55 | //System.out.println(jsonMessage);
56 | AdditionalIOTData additionalDeviceMessage = generateAdditionalIOTDeviceData();
57 | String addJson = gson.toJson(additionalDeviceMessage);
58 | //System.out.println("Additional JSON Is :: " + addJson);
59 |
60 | try {
61 | mergedJsonString = mergeJsonStrings(jsonMessage, addJson);
62 | System.out.println(mergedJsonString);
63 | System.out.println("Sending record to Firehose");
64 | recordId = sendToFireHose(mergedJsonString);
65 | System.out.println("Record sent to Firehose. Result Record Id is : " + recordId);
66 | }catch (Exception e)
67 | {
68 | e.printStackTrace();
69 | }
70 | }
71 | }
72 | catch (Exception ie)
73 | {
74 | ie.getStackTrace();
75 | }
76 | return null;
77 | }
78 |
79 | private String mergeJsonStrings(String kinJsonMessage, String addJson)
80 | {
81 | JSONObject kinesisJsonObject;
82 | JSONObject addJsonObject;
83 | kinesisJsonObject = new JSONObject(kinJsonMessage);
84 | addJsonObject = new JSONObject(addJson);
85 | JSONObject mergedJson = new JSONObject();
86 | if (kinesisJsonObject.length()>0){
87 | mergedJson = new JSONObject(kinesisJsonObject, JSONObject.getNames(kinesisJsonObject));
88 | }
89 | if (addJsonObject.length()>0){
90 | for(String key : JSONObject.getNames(addJsonObject))
91 | {
92 | mergedJson.put(key, addJsonObject.get(key));
93 | }
94 | }
95 | return mergedJson.toString();
96 | }
97 |
98 | private static byte[] compressMessage(byte[] inputDataMessage) throws IOException
99 | {
100 | ByteArrayOutputStream array = new ByteArrayOutputStream();
101 | GZIPOutputStream output = new GZIPOutputStream(array);
102 | try
103 | {
104 | output.write(inputDataMessage);
105 | output.finish();
106 | output.close();
107 | array.close();
108 | }
109 | catch (Exception e)
110 | {
111 | e.printStackTrace();
112 | }
113 | return array.toByteArray();
114 | }
115 |
116 | private AdditionalIOTData generateAdditionalIOTDeviceData()
117 | {
118 | UUID uuid = UUID.randomUUID();
119 | hashDeviceId = uuid.toString();
120 | DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
121 | Date date = new Date();
122 | processedDate = dateFormat.format(date);
123 | //AdditionalIOTData additionalIOTData = new AdditionalIOTData(hashDeviceId, processedDate, getAdditionalData());
124 | return (new AdditionalIOTData(hashDeviceId, processedDate, getAdditionalData()));
125 | }
126 |
127 | private ArrayList<TestAdditonalData> getAdditionalData()
128 | {
129 | ArrayList<TestAdditonalData> additionalIOTDataArrayList = new ArrayList<>();
130 | 
131 | // Adding extra content to make the message size more than 1 MB.
132 | // The data generated below will be appended/merged with the message coming from the Kinesis stream.
133 | // It is just to demonstrate that if the message size is more than 1 MB, we can gzip the message and send it to Kinesis Data Firehose.
134 | for(int i =0;i <5000; i++)
135 | {
136 | TestAdditonalData t = new TestAdditonalData();
137 | t.setDimension_X(getRandomInt(10,1));
138 | t.setDimension_Y(getRandomInt(10,1));
139 | t.setDimension_Z(getRandomInt(10,1));
140 | additionalIOTDataArrayList.add(t);
141 | }
142 | return additionalIOTDataArrayList;
143 | }
144 |
145 | private String sendToFireHose(String mergedJsonString)
146 | {
147 | PutRecordResult res = null;
148 | try {
149 | //To Firehose -
150 | System.out.println("MESSAGE SIZE BEFORE COMPRESSION IS : " + mergedJsonString.toString().getBytes(charset).length);
151 | System.out.println("MESSAGE SIZE AFTER GZIP COMPRESSION IS : " + compressMessage(mergedJsonString.toString().getBytes(charset)).length);
152 | PutRecordRequest req = new PutRecordRequest()
153 | .withDeliveryStreamName(firehoseStreamName);
154 |
155 | // Without compression - Send to Firehose
156 | //Record record = new Record().withData(ByteBuffer.wrap((mergedJsonString.toString() + "\r\n").getBytes()));
157 |
158 | // With compression - send to Firehose
159 | Record record = new Record().withData(ByteBuffer.wrap(compressMessage((mergedJsonString.toString() + "\r\n").getBytes())));
160 | req.setRecord(record);
161 | res = kinesisFirehoseClient.putRecord(req);
162 | }
163 | catch (IOException ie) {
164 | ie.printStackTrace();
165 | }
166 | return res.getRecordId();
167 | }
168 | private int getRandomInt(int max, int min)
169 | {
170 | return random.nextInt(max - min + 1) + min;
171 | }
172 | }
173 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 | 
7 |     <groupId>com.optimize.downstream</groupId>
8 |     <artifactId>data-processing</artifactId>
9 |     <packaging>pom</packaging>
10 |     <version>1.0-SNAPSHOT</version>
11 | 
12 |     <modules>
13 |         <module>kinesis-lambda</module>
14 |         <module>spark-process</module>
15 |         <module>sample-kinesis-producer</module>
16 |     </modules>
17 | 
18 |     <properties>
19 |         <skip.assembly>true</skip.assembly>
20 |     </properties>
21 | 
22 |     <build>
23 |         <plugins>
24 |             <plugin>
25 |                 <artifactId>maven-assembly-plugin</artifactId>
26 |                 <version>2.6</version>
27 |                 <configuration>
28 |                     <descriptorRefs>
29 |                         <descriptorRef>jar-with-dependencies</descriptorRef>
30 |                     </descriptorRefs>
31 |                     <skipAssembly>${skip.assembly}</skipAssembly>
32 |                 </configuration>
33 |                 <executions>
34 |                     <execution>
35 |                         <id>make-assembly</id>
36 |                         <phase>package</phase>
37 |                         <goals>
38 |                             <goal>single</goal>
39 |                         </goals>
40 |                     </execution>
41 |                 </executions>
42 |             </plugin>
43 |         </plugins>
44 |     </build>
45 | 
46 | 
92 | 
93 | </project>
--------------------------------------------------------------------------------
/sample-kinesis-producer/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <groupId>com.optimize.downstream</groupId>
7 |         <artifactId>data-processing</artifactId>
8 |         <version>1.0-SNAPSHOT</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 | 
12 | 
13 |     <artifactId>sample-kinesis-producer</artifactId>
14 |     <packaging>jar</packaging>
15 | 
16 |     <properties>
17 |         <skip.assembly>false</skip.assembly>
18 |     </properties>
19 | 
20 |     <dependencies>
21 |         <dependency>
22 |             <groupId>com.amazonaws</groupId>
23 |             <artifactId>aws-lambda-java-core</artifactId>
24 |             <version>1.2.0</version>
25 |         </dependency>
26 | 
27 |         <dependency>
28 |             <groupId>com.amazonaws</groupId>
29 |             <artifactId>aws-lambda-java-events</artifactId>
30 |             <version>2.2.2</version>
31 |         </dependency>
32 | 
33 |         <dependency>
34 |             <groupId>com.amazonaws</groupId>
35 |             <artifactId>aws-java-sdk-core</artifactId>
36 |             <version>1.11.362</version>
37 |         </dependency>
38 | 
39 |         <dependency>
40 |             <groupId>com.amazonaws</groupId>
41 |             <artifactId>aws-java-sdk-kinesis</artifactId>
42 |             <version>1.11.362</version>
43 |         </dependency>
44 | 
45 |         <dependency>
46 |             <groupId>com.amazonaws</groupId>
47 |             <artifactId>amazon-kinesis-producer</artifactId>
48 |             <version>0.12.9</version>
49 |         </dependency>
50 | 
51 |         <dependency>
52 |             <groupId>com.googlecode.json-simple</groupId>
53 |             <artifactId>json-simple</artifactId>
54 |             <version>1.1.1</version>
55 |         </dependency>
56 | 
57 | 
58 |         <dependency>
59 |             <groupId>org.fluttercode.datafactory</groupId>
60 |             <artifactId>datafactory</artifactId>
61 |             <version>0.8</version>
62 |         </dependency>
63 | 
64 |         <dependency>
65 |             <groupId>org.apache.commons</groupId>
66 |             <artifactId>commons-lang3</artifactId>
67 |             <version>3.5</version>
68 |         </dependency>
69 | 
70 |         <dependency>
71 |             <groupId>com.google.code.gson</groupId>
72 |             <artifactId>gson</artifactId>
73 |             <version>2.8.0</version>
74 |         </dependency>
75 | 
76 |     </dependencies>
77 | 
78 | 
79 | </project>
80 | 
--------------------------------------------------------------------------------
/sample-kinesis-producer/src/main/java/com/optimize/downstream/datagenerator/GenerateDataMain.java:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.datagenerator;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | import java.util.UUID;
6 |
7 |
8 | public class GenerateDataMain
9 | {
10 |
11 | public ArrayList<String> getDeviceIDS(int totalDeviceIds)
12 | {
13 | ArrayList<String> listOfUUIDS = new ArrayList<>();
14 | for(int i =0; i< totalDeviceIds; i++)
15 | {
16 | UUID uuid = UUID.randomUUID();
17 | listOfUUIDS.add(uuid.toString());
18 | }
19 | return listOfUUIDS;
20 | }
21 |
22 | public void kickAllWorkers()
23 | {
24 | ArrayList<String> allDeviceIds = getDeviceIDS(5);
25 | List workers = new ArrayList();
26 |
27 | for (int i=0; i inputQueue = new LinkedBlockingDeque();
29 |
30 | Thread[] consumerThread = new Thread[numberOfThreads];
31 | Thread producerThread = new Thread(new IOTDeviceProducerToBlockingQueue(inputQueue, numberOfMessages, numberOfSamplesInEachMessage));
32 | System.out.println("Starting producer and consumer.....");
33 | producerThread.start();
34 |
35 | for (int i = 0; i < numberOfThreads; i++) {
36 | consumerThread[i] = new Thread(new IOTDeviceConsumerFromBlockingQueueToKinesisStreams(inputQueue));
37 | consumerThread[i].start();
38 | }
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/sample-kinesis-producer/src/main/java/com/optimize/downstream/samplekinesisproducer/IOTDevice.java:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.samplekinesisproducer;
2 |
3 | import com.optimize.downstream.sensors.Accelerometer;
4 | import com.optimize.downstream.sensors.GPS;
5 | import com.optimize.downstream.sensors.Illuminance;
6 | import com.optimize.downstream.sensors.TemperatureSensor;
7 |
8 | import java.io.Serializable;
9 | import java.util.List;
10 |
11 | public class IOTDevice implements Serializable
12 | {
13 | private String deviceId;
14 | private String currentDate;
15 | private List<Accelerometer> accelerometerSensorList;
16 | private List<GPS> gpsSensorList;
17 | private List<TemperatureSensor> tempSensorList;
18 | private List<Illuminance> illuminancesSensorList;
19 | 
20 | IOTDevice(String deviceId,
21 | String currentDate,
22 | List<Accelerometer> accelerometerArrayList,
23 | List<GPS> gpsArrayList,
24 | List<TemperatureSensor> temperatureSensorArrayList,
25 | List<Illuminance> illuminanceArrayList
26 | )
27 | {
28 | this.deviceId = deviceId;
29 | this.currentDate = currentDate;
30 | this.accelerometerSensorList = accelerometerArrayList;
31 | this.gpsSensorList = gpsArrayList;
32 | this.tempSensorList = temperatureSensorArrayList;
33 | this.illuminancesSensorList = illuminanceArrayList;
34 |
35 | }
36 |
37 | public String getDeviceId()
38 | {
39 | return deviceId;
40 | }
41 |
42 | public String getCurrentDate()
43 | {
44 | return currentDate;
45 | }
46 |
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/sample-kinesis-producer/src/main/java/com/optimize/downstream/samplekinesisproducer/IOTDeviceConsumerFromBlockingQueueToKinesisStreams.java:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.samplekinesisproducer;
2 |
3 | import java.nio.ByteBuffer;
4 | import java.nio.charset.Charset;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.concurrent.BlockingQueue;
8 | import com.amazonaws.regions.Regions;
9 | import com.amazonaws.services.kinesis.AmazonKinesis;
10 | import com.amazonaws.services.kinesis.AmazonKinesisClient;
11 | import com.amazonaws.services.kinesis.model.PutRecordsRequest;
12 | import com.amazonaws.services.kinesis.model.PutRecordsRequestEntry;
13 | import com.google.gson.Gson;
14 |
15 | public class IOTDeviceConsumerFromBlockingQueueToKinesisStreams implements Runnable
16 | {
17 | private AmazonKinesis kinesis;
18 | private List<PutRecordsRequestEntry> entries;
19 | private BlockingQueue<IOTDevice> inputQueue;
20 | private int dataSize;
21 | Gson gson;
22 | private final String STREAM_NAME = "AWS-Blog-BaseKinesisStream";
23 | private final String REGION = "us-east-1";
24 | private static Charset charset = Charset.forName("UTF-8");
25 |
26 | public IOTDeviceConsumerFromBlockingQueueToKinesisStreams(BlockingQueue<IOTDevice> inputQueue)
27 | {
28 | gson = new Gson();
29 | this.inputQueue = inputQueue;
30 | kinesis = new AmazonKinesisClient().withRegion(Regions.fromName(REGION));
31 | entries = new ArrayList<>();
32 | dataSize = 0;
33 | }
34 |
35 | @Override
36 | public void run()
37 | {
38 | long threadId = Thread.currentThread().getId();
39 | System.out.println("Thread # " + threadId + " is doing this task");
40 | while(!inputQueue.isEmpty())
41 | {
42 | try {
43 | IOTDevice deviceMessage = inputQueue.take();
44 | String partitionKey = deviceMessage.getDeviceId();
45 |
46 | String json = gson.toJson(deviceMessage);
47 |
48 | //System.out.println("================= JSON String IS ================");
49 | //System.out.println(json);
50 | //System.out.println("Partition Key / Device Id before inserting into Kinesis stream is : " + partitionKey);
51 |
52 | //System.out.println("SRIKANTH : SIZE IS : " + json.getBytes(charset).length);
53 | //ByteBuffer data = ByteBuffer.wrap(SerializationUtils.serialize(deviceMessage));
54 | ByteBuffer data = ByteBuffer.wrap(json.getBytes());
55 | pushToKinesis(new PutRecordsRequestEntry().withPartitionKey(partitionKey).withData(data));
56 | } catch (Exception e) {
57 | e.printStackTrace();
58 | }
59 | }
60 | System.exit(0);
61 | }
62 | private void flush() {
63 | System.out.println("Sending a record to Kinesis Stream with " + entries.size() + " messages grouped together.");
64 | kinesis.putRecords(new PutRecordsRequest()
65 | .withStreamName(STREAM_NAME)
66 | .withRecords(entries));
67 | entries.clear();
68 |
69 | }
70 | private void pushToKinesis(PutRecordsRequestEntry entry)
71 | {
72 | /*System.out.println("===================================================================");
73 | System.out.println("Data Size is : " + dataSize);
74 | System.out.println("Remaining Data is : " + entry.getData().remaining());
75 | System.out.println("Partition Key length is : " + entry.getPartitionKey().length());*/
76 |
77 | int newDataSize = dataSize + entry.getData().remaining() +
78 | entry.getPartitionKey().length();
79 | if (newDataSize <= 5 * 1024 * 1024 && entries.size() < 500)
80 | {
81 | dataSize = newDataSize;
82 | entries.add(entry);
83 | //System.out.println("Data size is : " + dataSize );
84 | }
85 | else {
86 | //System.out.println("In Else : Entries size is : " + entries.size() + " --- New Data size is ::: " + newDataSize);
87 | //System.out.println("Sending records to Kinesis Stream... Size is ::: " + dataSize);
88 | /*kinesis.putRecords(new PutRecordsRequest()
89 | .withStreamName(STREAM_NAME)
90 | .withRecords(entry));*/
91 | flush();
92 | System.out.println("Record sent to Kinesis Stream. Record size is ::: " + dataSize + " KB");
93 | dataSize = 0;
94 | pushToKinesis(entry);
95 | }
96 | }
97 |
98 | /*private String generateJSONObject(IOTDevice deviceMessage)
99 | {
100 | GPS gps;
101 | TemperatureSensor tempSensor;
102 |
103 | JSONObject mainObj = new JSONObject();
104 | mainObj.put("deviceid", deviceMessage.getDeviceId());
105 | mainObj.put("currentDate", deviceMessage.getCurrentDate());
106 | mainObj.put("accelerometerreadings", getAccelerometerReadings(deviceMessage.accelerometerSensor));
107 | mainObj.put("gpsreadings", getGPSReadings(deviceMessage.gpsSensor));
108 | mainObj.put("temperaturereadings", getTemperatureReadings(deviceMessage.tempSensor));
109 | mainObj.put("illuminancereadings", getIlluminanceReadings(deviceMessage.illuminancesSensor));
110 |
111 | return mainObj.toJSONString();
112 | }
113 |
114 | private JSONArray getAccelerometerReadings(ArrayList acc)
115 | {
116 | JSONArray accelerometerReadings = new JSONArray();
117 | for (Accelerometer a : acc)
118 | {
119 | JSONObject accelerometerObj = new JSONObject();
120 |
121 | accelerometerObj.put("accelerometer_X", a.getAccelerometer_X());
122 |
123 | accelerometerObj.put("accelerometer_Y", a.getAccelerometer_Y());
124 | accelerometerObj.put("accelerometer_Z", a.getAccelerometer_Z());
125 |
126 | accelerometerObj.put("linearAccelerationSensor_X", a.getLinearAccelerationSensor_X());
127 | accelerometerObj.put("linearAccelerationSensor_Y", a.getLinearAccelerationSensor_Y());
128 | accelerometerObj.put("linearAccelerationSensor_Z", a.getLinearAccelerationSensor_Z());
129 |
130 | accelerometerObj.put("gravitySensor_X", a.getGravitySensor_X());
131 | accelerometerObj.put("gravitySensor_Y", a.getGravitySensor_Y());
132 | accelerometerObj.put("gravitySensor_Z", a.getGravitySensor_Z());
133 |
134 | accelerometerReadings.add(accelerometerObj);
135 | }
136 | return accelerometerReadings;
137 | }
138 |
139 | private JSONArray getTemperatureReadings(ArrayList temp)
140 | {
141 | JSONArray temperatureReadings = new JSONArray();
142 | for (TemperatureSensor t : temp)
143 | {
144 | JSONObject accelerometerObj = new JSONObject();
145 | accelerometerObj.put("celcius", t.getCelsius());
146 | accelerometerObj.put("fahrenheit", t.getFahrenheit());
147 | accelerometerObj.put("kelvin", t.getKelvin());
148 |
149 | temperatureReadings.add(accelerometerObj);
150 | }
151 | return temperatureReadings;
152 | }
153 |
154 | private JSONArray getGPSReadings(ArrayList gps)
155 | {
156 | JSONArray gpsReadings = new JSONArray();
157 | int gpsLength = gps.size();
158 |
159 | for (GPS g : gps)
160 | {
161 | JSONObject gpsObj = new JSONObject();
162 | gpsObj.put("altitude", g.getAltitude());
163 | gpsObj.put("heading", g.getHeading());
164 | gpsObj.put("lat", g.getLatitude());
165 | gpsObj.put("long", g.getLongitude());
166 |
167 | gpsReadings.add(gpsObj);
168 | }
169 | return gpsReadings;
170 | }
171 |
172 | private JSONArray getIlluminanceReadings(ArrayList illuminances)
173 | {
174 | JSONArray illuminancesReadings = new JSONArray();
175 | int gpsLength = illuminances.size();
176 |
177 | for (Illuminance i : illuminances)
178 | {
179 | JSONObject illuminancesObj = new JSONObject();
180 | illuminancesObj.put("illuminance", i.getIlluminance());
181 | illuminancesReadings.add(illuminancesObj);
182 | }
183 | return illuminancesReadings;
184 | }*/
185 | }
186 |
--------------------------------------------------------------------------------
/sample-kinesis-producer/src/main/java/com/optimize/downstream/samplekinesisproducer/IOTDeviceProducerToBlockingQueue.java:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.samplekinesisproducer;
2 |
3 | import com.optimize.downstream.sensors.Accelerometer;
4 | import com.optimize.downstream.sensors.GPS;
5 | import com.optimize.downstream.sensors.Illuminance;
6 | import com.optimize.downstream.sensors.TemperatureSensor;
7 |
8 |
9 | import java.text.DateFormat;
10 | import java.text.SimpleDateFormat;
11 | import java.util.ArrayList;
12 | import java.util.Date;
13 | import java.util.Random;
14 | import java.util.UUID;
15 | import java.util.concurrent.BlockingQueue;
16 | import java.util.concurrent.atomic.AtomicLong;
17 |
18 | public class IOTDeviceProducerToBlockingQueue implements Runnable
19 | {
20 | private final BlockingQueue<IOTDevice> inputQueue;
21 | private volatile boolean shutdown = false;
22 | private final AtomicLong recordsPut = new AtomicLong(0);
23 | private Random random = new Random();
24 | private int numberOfSamplesInEachMessage;
25 | private int numberOfMessages;
26 |
27 | public IOTDeviceProducerToBlockingQueue(BlockingQueue<IOTDevice> inputQueue, int numberOfMessages, int numberOfSamplesInEachMessage)
28 | {
29 | this.inputQueue = inputQueue;
30 | this.numberOfSamplesInEachMessage = numberOfSamplesInEachMessage;
31 | this.numberOfMessages = numberOfMessages;
32 | }
33 |
34 | public void run() {
35 | long threadId = Thread.currentThread().getId();
36 | //System.out.println("Thread # " + threadId + " is doing this task");
37 |
38 | //while (!shutdown) {
39 | for(int i=0;i getAccelerometerList()
84 | {
85 | ArrayList<Accelerometer> accelerometerArrayList = new ArrayList<>();
86 |
87 | for(int i =0;i getGpsArrayList()
109 | {
110 | ArrayList<GPS> gpsArrayList = new ArrayList<>();
111 |
112 | for(int i =0; i< numberOfSamplesInEachMessage; i++)
113 | {
114 | GPS g = new GPS();
115 | g.setAltitude(getRandomDouble(1,10));
116 | g.setHeading(getRandomDouble(1,10));
117 | g.setLatitude(Math.random() * Math.PI * 2);
118 | g.setLongitude(Math.acos(Math.random() * 2 - 1));
119 |
120 | gpsArrayList.add(g);
121 | }
122 |
123 | return gpsArrayList;
124 | }
125 |
126 | private ArrayList getIlluminanceArrayList()
127 | {
128 | ArrayList<Illuminance> illuminanceArrayList = new ArrayList<>();
129 |
130 | for(int i =0; i< numberOfSamplesInEachMessage; i++)
131 | {
132 | Illuminance il = new Illuminance();
133 | il.setIlluminance(getRandomDouble(1,100));
134 | illuminanceArrayList.add(il);
135 | }
136 |
137 | return illuminanceArrayList;
138 | }
139 |
140 | private ArrayList getTemperatureSensorArrayList()
141 | {
142 | ArrayList<TemperatureSensor> temperatureSensorArrayList = new ArrayList<>();
143 |
144 | for (int i=0;i
--------------------------------------------------------------------------------
/spark-process/pom.xml:
--------------------------------------------------------------------------------
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <groupId>com.optimize.downstream</groupId>
7 |         <artifactId>data-processing</artifactId>
8 |         <version>1.0-SNAPSHOT</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 | 
12 |     <artifactId>spark-process</artifactId>
13 |     <packaging>jar</packaging>
14 | 
15 | 
59 | 
60 |     <properties>
61 |         <skip.assembly>false</skip.assembly>
62 |     </properties>
63 | 
64 | 
72 | 
73 | 
74 |     <dependencies>
75 | 
76 |         <dependency>
77 |             <groupId>org.apache.spark</groupId>
78 |             <artifactId>spark-core_2.11</artifactId>
79 |             <version>2.3.1</version>
80 |         </dependency>
81 | 
82 | 
83 |         <dependency>
84 |             <groupId>org.apache.spark</groupId>
85 |             <artifactId>spark-sql_2.11</artifactId>
86 |             <version>2.3.1</version>
87 |         </dependency>
88 | 
89 | 
90 |         <dependency>
91 |             <groupId>org.scala-lang</groupId>
92 |             <artifactId>scala-reflect</artifactId>
93 |             <version>2.11.8</version>
94 |         </dependency>
95 | 
96 | 
97 |         <dependency>
98 |             <groupId>com.amazonaws</groupId>
99 |             <artifactId>aws-java-sdk</artifactId>
100 |             <version>1.11.390</version>
101 |         </dependency>
102 | 
103 | 
104 |         <dependency>
105 |             <groupId>commons-io</groupId>
106 |             <artifactId>commons-io</artifactId>
107 |             <version>2.7</version>
108 |         </dependency>
109 | 
110 |     </dependencies>
111 | 
112 |     <build>
113 | 
114 |         <sourceDirectory>src/main/scala</sourceDirectory>
115 |         <testSourceDirectory>src/test/scala</testSourceDirectory>
116 |         <plugins>
117 |             <plugin>
118 |                 <groupId>org.scala-tools</groupId>
119 |                 <artifactId>maven-scala-plugin</artifactId>
120 |                 <executions>
121 |                     <execution>
122 |                         <goals>
123 |                             <goal>compile</goal>
124 |                             <goal>testCompile</goal>
125 |                         </goals>
126 |                     </execution>
127 |                 </executions>
128 |                 <configuration>
129 |                     <scalaVersion>2.11.8</scalaVersion>
130 |                 </configuration>
131 |             </plugin>
132 |         </plugins>
133 |     </build>
134 | </project>
--------------------------------------------------------------------------------
/spark-process/src/main/scala/com/optimize/downstream/process/ProcessFilesFromS3AndConvertToParquet.scala:
--------------------------------------------------------------------------------
1 | package com.optimize.downstream.process
2 |
3 | import java.io.InputStream
4 | import java.util.zip.GZIPInputStream
5 | import com.amazonaws.auth.BasicAWSCredentials
6 | import com.amazonaws.services.s3.AmazonS3Client
7 | import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing}
8 | import org.apache.spark.sql._
9 |
10 | import scala.collection.JavaConversions.{collectionAsScalaIterable => asScala}
11 | import scala.io.Source
12 |
13 | object ProcessFilesFromS3AndConvertToParquet {
14 |
15 | val pageLength = 1000
16 | def main(args: Array[String]) =
17 | {
18 | if(args.length != 3)
19 | {
20 | println("Requires 3 parameters")
21 | println("Usage: ")
22 | System.exit(-1)
23 | }
24 | val s3BucketName = args(0)
25 | val s3InputLocation = args(1)
26 | val s3OutputLocation = args(2)
27 |
28 | //def s3Client = new AmazonS3Client(new BasicAWSCredentials(accesskeyID, secretAccessKey))
29 | def s3Client = new AmazonS3Client()
30 |
31 | val spark = SparkSession
32 | .builder()
33 | .appName("AWS-Small-Blogs-Job")
34 | .getOrCreate()
35 |
36 | val request = new ListObjectsRequest()
37 | request.setBucketName(s3BucketName)
38 | request.setPrefix(s3InputLocation) //Get the prefix part only
39 | request.setMaxKeys(pageLength)
40 |
41 |
42 | var objs= new ObjectListing()
43 | objs = s3Client.listObjects(request)
44 | val s3ObjectKeys = objs.getObjectSummaries.map(x => x.getKey).toList
45 | println("Printing the keys")
46 | s3ObjectKeys.foreach { println }
47 |
48 | val allLinesRDD = spark.sparkContext.parallelize(s3ObjectKeys).flatMap
49 | { key => Source.fromInputStream(new GZIPInputStream(s3Client.getObject(s3BucketName, key).getObjectContent: InputStream)).getLines }
50 |
51 | var finalDF = spark.read.json(allLinesRDD).toDF()
52 |
53 | while(objs.isTruncated())
54 | {
55 | objs = s3Client.listNextBatchOfObjects(objs)
56 | val s3ObjectKeys = objs.getObjectSummaries.map(x => x.getKey).toList
57 | //println("Printing the keys")
58 | s3ObjectKeys.foreach { println }
59 | val allLinesRDD = spark.sparkContext.parallelize(s3ObjectKeys).flatMap
60 | { key => Source.fromInputStream(new GZIPInputStream(s3Client.getObject(s3BucketName, key).getObjectContent: InputStream)).getLines }
61 |
62 | val allLines = spark.read.json(allLinesRDD).toDF()
63 | finalDF = finalDF.union(allLines)
64 | }
65 | finalDF.write
66 | .mode("append")
67 | .parquet("s3://" + s3BucketName + "/" + s3OutputLocation)
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
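The Spark job above takes three positional arguments: the S3 bucket name, the input prefix where Kinesis Data Firehose delivered the gzipped records (the step3 template uses the prefix `fromfirehose/`), and an output prefix for the Parquet files. Below is a minimal sketch of submitting it from the EMR master node; the bucket name and output prefix in angle brackets are placeholders, and the jar name matches what build_and_push_to_s3.sh builds and uploads.

```bash
# Minimal sketch: copy the job jar to the EMR master node and submit it.
# The main class and the three arguments come from ProcessFilesFromS3AndConvertToParquet.scala;
# values in angle brackets are placeholders you must replace.
aws s3 cp s3://aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/appjars/spark-process-1.0-SNAPSHOT-jar-with-dependencies.jar .
spark-submit \
  --class com.optimize.downstream.process.ProcessFilesFromS3AndConvertToParquet \
  spark-process-1.0-SNAPSHOT-jar-with-dependencies.jar \
  <your-s3-bucket-name> \
  fromfirehose/ \
  <your-output-prefix>
```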