├── .github └── PULL_REQUEST_TEMPLATE.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── build_and_push_to_s3.sh ├── cloudformation-templates ├── allsteps_cf.template ├── step1_vpc.template ├── step2_iam.template ├── step3_firehose.template ├── step4_kinesisstream.template ├── step5_emr.template └── step6_ec2_instance.template ├── kinesis-lambda ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── optimize │ └── downstream │ ├── additionaldata │ ├── AdditionalIOTData.java │ └── TestAdditonalData.java │ └── lambda │ └── ProcessKinesisRecords.java ├── pom.xml ├── sample-kinesis-producer ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── optimize │ └── downstream │ ├── datagenerator │ ├── GenerateDataMain.java │ └── GenerateDataWorker.java │ ├── entry │ └── Main.java │ ├── samplekinesisproducer │ ├── IOTDevice.java │ ├── IOTDeviceConsumerFromBlockingQueueToKinesisStreams.java │ ├── IOTDeviceProducerToBlockingQueue.java │ └── SampleEvent.java │ └── sensors │ ├── Accelerometer.java │ ├── GPS.java │ ├── Illuminance.java │ └── TemperatureSensor.java └── spark-process ├── pom.xml └── src └── main └── scala └── com └── optimize └── downstream └── process └── ProcessFilesFromS3AndConvertToParquet.scala /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws-samples/amazon-emr-optimize-data-processing/issues), or [recently closed](https://github.com/aws-samples/amazon-emr-optimize-data-processing/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/amazon-emr-optimize-data-processing/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/amazon-emr-optimize-data-processing/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Optimizing downstream data processing with Amazon Kinesis Data Firehose and Amazon EMR running Apache Spark 2 | 3 | Optimizing downstream data processing with Amazon Kinesis Data Firehose and Amazon EMR running Apache Spark 4 | 5 | ## License Summary 6 | 7 | This sample code is made available under the MIT-0 license. See the LICENSE file. 8 | 9 | ## Solution Overview 10 | 11 | ### AWS Blog link 12 | ##### For the complete details, check the AWS blog @ "" 13 | ### The steps we follow in this blog post are: 14 | ##### 1. Create a virtual private cloud (VPC) and an Amazon S3 bucket. 15 | ##### 2. Provision a Kinesis data stream and an AWS Lambda function to process the messages from the Kinesis data stream. 16 | ##### 3. Provision Kinesis Data Firehose to deliver the messages sent from the Lambda function in step 2 to Amazon S3. This step also provisions an Amazon EMR cluster to process the data in Amazon S3. 17 | ##### 4. Generate test data with custom code running on an Amazon EC2 instance. 18 | ##### 5. Run a sample Spark program from the Amazon EMR cluster’s master instance to read the files from Amazon S3, convert them into Parquet format, and write them back to an Amazon S3 destination. 19 | 20 | -------------------------------------------------------------------------------- /build_and_push_to_s3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Change JAVA_HOME to match your local Java installation 4 | JAVA_HOME="/Library/Java/JavaVirtualMachines/jdk1.8.0_202.jdk/Contents/Home" 5 | S3_PATH="aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing" 6 | 7 | 8 | if [[ ! -d "${JAVA_HOME}" ]]; then 9 | echo "JAVA_HOME path not found. Check the path : ${JAVA_HOME}" 10 | exit 99; 11 | fi 12 | export JAVA_HOME=${JAVA_HOME} 13 | echo "Starting project build" 14 | mvn clean compile assembly:single 15 | echo "Project build completed" 16 | 17 | # Copy to the us-east-1 region - the blog assumes everything is created in the us-east-1 (N. Virginia) Region.
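# S3_PATH above points to the aws-bigdata-blog bucket used by the original blog post, which you cannot write to.
# Change S3_PATH to a bucket you own before running the copies below, and drop --acl public-read if that bucket blocks public ACLs.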
18 | 19 | echo "Copying Files to S3 bucket path : ${S3_PATH}" 20 | # Kinesis Producer Jar file 21 | aws s3 cp sample-kinesis-producer/target/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar s3://${S3_PATH}/appjars/ --acl public-read 22 | 23 | # Lambda Jar file 24 | aws s3 cp kinesis-lambda/target/kinesis-lambda-1.0-SNAPSHOT-jar-with-dependencies.jar s3://${S3_PATH}/appjars/ --acl public-read 25 | 26 | # Spark scala code to process files in S3. 27 | aws s3 cp spark-process/target/spark-process-1.0-SNAPSHOT-jar-with-dependencies.jar s3://${S3_PATH}/appjars/ --acl public-read 28 | 29 | # Cloudformation templates 30 | aws s3 cp cloudformation-templates/allsteps_cf.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read 31 | aws s3 cp cloudformation-templates/step1_vpc.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read 32 | aws s3 cp cloudformation-templates/step2_iam.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read 33 | aws s3 cp cloudformation-templates/step3_firehose.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read 34 | aws s3 cp cloudformation-templates/step4_kinesisstream.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read 35 | aws s3 cp cloudformation-templates/step5_emr.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read 36 | aws s3 cp cloudformation-templates/step6_ec2_instance.template s3://${S3_PATH}/cloudformation-templates/ --acl public-read 37 | -------------------------------------------------------------------------------- /cloudformation-templates/allsteps_cf.template: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "AWS BLOGS - Root Template - One Step Solution", 4 | "Parameters": { 5 | "S3BucketName": { 6 | "Type": "String", 7 | "Description": "S3 Bucket Name that will be created in your account" 8 | }, 9 | "ClientIP": { 10 | "Description": "The IP address range that can be used to connect to the EC2 instance from your local machine.It must be a valid IP CIDR range of the form x.x.x.x/x.Pls get your address using checkip.amazonaws.com or whatsmyip.org", 11 | "Type": "String", 12 | "MinLength": "9", 13 | "MaxLength": "18", 14 | "Default": "0.0.0.0/0", 15 | "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", 16 | "ConstraintDescription": "It must be a valid IP CIDR range of the form x.x.x.x/x. Suggest to enable access to your IP address only. Pls get your address using checkip.amazonaws.com or whatsmyip.org." 17 | }, 18 | "FirehoseDeliveryStreamName": { 19 | "Type": "String", 20 | "Description": "Name of the Amazon Firehose delivery stream. Default value is set to 'AWSBlogs-LambdaToFireHose'", 21 | "Default": "AWSBlogs-LambdaToFireHose", 22 | "AllowedValues": [ 23 | "AWSBlogs-LambdaToFireHose" 24 | ] 25 | }, 26 | "KinesisStreamName": { 27 | "Type": "String", 28 | "Description" : "Name of the Amazon Kinesis stream. 
Default value is set to 'AWS-Blog-BaseKinesisStream'", 29 | "Default": "AWS-Blog-BaseKinesisStream", 30 | "AllowedValues": [ 31 | "AWS-Blog-BaseKinesisStream" 32 | ] 33 | }, 34 | "Region": { 35 | "Description": "AWS Region - Select us-east-1 by default.", 36 | "Type": "String", 37 | "Default": "us-east-1", 38 | "AllowedValues": [ 39 | "us-east-1" 40 | ] 41 | }, 42 | "EMRClusterName": { 43 | "Type": "String", 44 | "Description": "ClusterName" 45 | }, 46 | "KeyName": { 47 | "Description": "Name of an existing EC2 key pair to access the Amazon EMR cluster", 48 | "Type": "AWS::EC2::KeyPair::KeyName" 49 | }, 50 | "InstanceType": { 51 | "Description": "EC2 instance specs configuration", 52 | "Type": "String", 53 | "Default": "r4.xlarge", 54 | "AllowedValues": [ 55 | "r4.xlarge", 56 | "r4.2xlarge", 57 | "r4.4xlarge" 58 | ] 59 | } 60 | }, 61 | "Resources": { 62 | "STEP1": { 63 | "Type": "AWS::CloudFormation::Stack", 64 | "Properties": { 65 | "TemplateURL": { 66 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step1_vpc.template" 67 | }, 68 | "Parameters": { 69 | "S3BucketName": { 70 | "Ref": "S3BucketName" 71 | }, 72 | "ClientIP": { 73 | "Ref": "ClientIP" 74 | } 75 | } 76 | } 77 | }, 78 | "STEP2": { 79 | "Type": "AWS::CloudFormation::Stack", 80 | "DependsOn":"STEP1", 81 | "Properties": { 82 | "TemplateURL": { 83 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step2_iam.template" 84 | } 85 | } 86 | }, 87 | "STEP3": { 88 | "Type": "AWS::CloudFormation::Stack", 89 | "DependsOn":"STEP2", 90 | "Properties": { 91 | "TemplateURL": { 92 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step3_firehose.template" 93 | }, 94 | "Parameters": { 95 | "Role": { 96 | "Fn::GetAtt": [ 97 | "STEP2", 98 | "Outputs.FirehoseRoleArn" 99 | ] 100 | }, 101 | "S3BucketARN": { 102 | "Fn::GetAtt": [ 103 | "STEP1", 104 | "Outputs.S3BucketARN" 105 | ] 106 | }, 107 | "FirehoseDeliveryStreamName": { 108 | "Ref": "FirehoseDeliveryStreamName" 109 | } 110 | } 111 | } 112 | }, 113 | "STEP4": { 114 | "Type": "AWS::CloudFormation::Stack", 115 | "DependsOn":"STEP3", 116 | "Properties": { 117 | "TemplateURL": { 118 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step4_kinesisstream.template" 119 | }, 120 | "Parameters": { 121 | "Role": { 122 | "Fn::GetAtt": [ 123 | "STEP2", 124 | "Outputs.LambdaRoleArn" 125 | ] 126 | }, 127 | "Region": { 128 | "Ref": "Region" 129 | }, 130 | "S3Bucket": { 131 | "Ref": "S3BucketName" 132 | }, 133 | "KinesisStreamName": { 134 | "Ref": "KinesisStreamName" 135 | } 136 | } 137 | } 138 | }, 139 | "STEP5": { 140 | "Type": "AWS::CloudFormation::Stack", 141 | "DependsOn":"STEP4", 142 | "Properties": { 143 | "TemplateURL": { 144 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step5_emr.template" 145 | }, 146 | "Parameters": { 147 | "EMRClusterName": { 148 | "Ref": "EMRClusterName" 149 | }, 150 | "ClusterSecurityGroup": { 151 | "Fn::GetAtt": [ 152 | "STEP1", 153 | "Outputs.SecurityGroup" 154 | ] 155 | }, 156 | "ClusterSubnetID": { 157 | "Fn::GetAtt": [ 158 | "STEP1", 159 | "Outputs.SubnetID" 160 | ] 161 | }, 162 | "KeyName": { 163 | "Ref": "KeyName" 164 | } 165 | } 166 | } 167 | }, 168 | 
"STEP6": { 169 | "Type": "AWS::CloudFormation::Stack", 170 | "DependsOn":"STEP5", 171 | "Properties": { 172 | "TemplateURL": { 173 | "Fn::Sub": "https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/cloudformation-templates/step6_ec2_instance.template" 174 | }, 175 | "Parameters": { 176 | "EC2SecurityGroupId": { 177 | "Fn::GetAtt": [ 178 | "STEP1", 179 | "Outputs.SecurityGroup" 180 | ] 181 | }, 182 | "KeyName": { 183 | "Ref": "KeyName" 184 | }, 185 | "EC2Subnet": { 186 | "Fn::GetAtt": [ 187 | "STEP1", 188 | "Outputs.SubnetID" 189 | ] 190 | }, 191 | "InstanceType": { 192 | "Ref": "InstanceType" 193 | } 194 | } 195 | } 196 | } 197 | } 198 | } -------------------------------------------------------------------------------- /cloudformation-templates/step1_vpc.template: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "AWS BLOGS - This template creates an Amazon VPC and subnet with the required configuration.", 4 | "Parameters": { 5 | "S3BucketName": { 6 | "Type": "String" 7 | }, 8 | "ClientIP": { 9 | "Description": "The IP address range that can be used to connect to the EC2 instance from your local machine.It must be a valid IP CIDR range of the form x.x.x.x/x.Pls get your address using checkip.amazonaws.com or whatsmyip.org", 10 | "Type": "String", 11 | "MinLength": "9", 12 | "MaxLength": "18", 13 | "Default": "0.0.0.0/0", 14 | "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", 15 | "ConstraintDescription": "It must be a valid IP CIDR range of the form x.x.x.x/x. Suggest to enable access to your IP address only. Pls get your address using checkip.amazonaws.com or whatsmyip.org." 
16 | } 17 | }, 18 | "Resources": { 19 | "VPC": { 20 | "Type": "AWS::EC2::VPC", 21 | "Properties": { 22 | "CidrBlock": "10.0.0.0/16", 23 | "EnableDnsSupport": true, 24 | "EnableDnsHostnames": true, 25 | "InstanceTenancy": "default", 26 | "Tags": [{ 27 | "Key": "Name", 28 | "Value": "awsblog-small-files-vpc" 29 | }] 30 | } 31 | }, 32 | "PublicSubnet": { 33 | "Type": "AWS::EC2::Subnet", 34 | "Properties": { 35 | "VpcId": { 36 | "Ref": "VPC" 37 | }, 38 | "CidrBlock": "10.0.1.0/24", 39 | "MapPublicIpOnLaunch": "True", 40 | "Tags": [{ 41 | "Key": "Name", 42 | "Value": "awsblog-small-files-subnet" 43 | }] 44 | } 45 | }, 46 | "InternetGateway": { 47 | "Type": "AWS::EC2::InternetGateway", 48 | "Properties": { 49 | "Tags": [{ 50 | "Key": "Name", 51 | "Value": "awsblog-small-files-gateway" 52 | }] 53 | } 54 | }, 55 | "MyGatewayAttachment": { 56 | "Type": "AWS::EC2::VPCGatewayAttachment", 57 | "Properties": { 58 | "InternetGatewayId": { 59 | "Ref": "InternetGateway" 60 | }, 61 | "VpcId": { 62 | "Ref": "VPC" 63 | } 64 | } 65 | }, 66 | "PublicRouteTable": { 67 | "Type": "AWS::EC2::RouteTable", 68 | "Properties": { 69 | "VpcId": { 70 | "Ref": "VPC" 71 | } 72 | } 73 | }, 74 | "PublicRoute": { 75 | "Type": "AWS::EC2::Route", 76 | "Properties": { 77 | "RouteTableId": { 78 | "Ref": "PublicRouteTable" 79 | }, 80 | "DestinationCidrBlock": "0.0.0.0/0", 81 | "GatewayId": { 82 | "Ref": "InternetGateway" 83 | } 84 | }, 85 | "DependsOn": [ 86 | "MyGatewayAttachment" 87 | ] 88 | }, 89 | "PublicSubnetRouteAssociation": { 90 | "Type": "AWS::EC2::SubnetRouteTableAssociation", 91 | "Properties": { 92 | "RouteTableId": { 93 | "Ref": "PublicRouteTable" 94 | }, 95 | "SubnetId": { 96 | "Ref": "PublicSubnet" 97 | } 98 | } 99 | }, 100 | "InstanceSecurityGroup": { 101 | "Type": "AWS::EC2::SecurityGroup", 102 | "Properties": { 103 | "GroupDescription": "CloudFormationGroup", 104 | "VpcId": { 105 | "Ref": "VPC" 106 | }, 107 | "SecurityGroupIngress": [{ 108 | "IpProtocol": "tcp", 109 | "CidrIp": { "Ref" : "ClientIP"}, 110 | "FromPort": "22", 111 | "ToPort": "22" 112 | }], 113 | "SecurityGroupEgress": [ 114 | { 115 | "CidrIp": "0.0.0.0/0", 116 | "IpProtocol": "-1", 117 | "FromPort": -1, 118 | "ToPort": -1 119 | } 120 | ], 121 | "Tags": [{ 122 | "Key": "Name", 123 | "Value": "awsblog-small-files-securitygroup" 124 | }] 125 | } 126 | }, 127 | "VPCDefaultSecurityGroupIngress": { 128 | "Type": "AWS::EC2::SecurityGroupIngress", 129 | "Properties": { 130 | "GroupId": { 131 | "Fn::GetAtt": ["InstanceSecurityGroup", "GroupId"] 132 | }, 133 | "IpProtocol": "-1", 134 | "FromPort": "-1", 135 | "ToPort": "-1", 136 | "SourceSecurityGroupId": { 137 | "Fn::GetAtt": [ 138 | "InstanceSecurityGroup", 139 | "GroupId" 140 | ] 141 | } 142 | } 143 | }, 144 | "S3Bucket": { 145 | "Type": "AWS::S3::Bucket", 146 | "Properties": { 147 | "BucketName": { 148 | "Ref": "S3BucketName" 149 | } 150 | } 151 | } 152 | }, 153 | "Outputs": { 154 | "StackName": { 155 | "Value": { 156 | "Ref": "AWS::StackName" 157 | } 158 | }, 159 | "SubnetID": { 160 | "Description": "Use this subnet ID for your other AWS resources", 161 | "Value": { 162 | "Ref": "PublicSubnet" 163 | } 164 | }, 165 | "SecurityGroup": { 166 | "Description": "Use this security group ID for your other AWS resources.", 167 | "Value": { 168 | "Fn::GetAtt": ["InstanceSecurityGroup", "GroupId"] 169 | } 170 | }, 171 | "VPCID": { 172 | "Description": "Use this VPC ID for your other AWS resources..", 173 | "Value": { 174 | "Ref": "VPC" 175 | } 176 | }, 177 | "S3BucketDomain": { 178 | "Description": "S3 Bucket 
Domain that was created", 179 | "Value": { 180 | "Fn::GetAtt": ["S3Bucket", "DomainName" ] 181 | } 182 | }, 183 | "S3BucketARN": { 184 | "Description": "S3 Bucket ARN that was created", 185 | "Value": { 186 | "Fn::GetAtt": ["S3Bucket", "Arn" ] 187 | } 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /cloudformation-templates/step2_iam.template: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "AWS BLOGS - IAM Roles required for the AWS Services", 4 | "Resources": { 5 | "LambdaRole": { 6 | "Type": "AWS::IAM::Role", 7 | "Properties": { 8 | "RoleName": "small-files-lambdarole", 9 | "AssumeRolePolicyDocument": { 10 | "Version": "2012-10-17", 11 | "Statement": [ 12 | { 13 | "Effect": "Allow", 14 | "Principal": { 15 | "Service": "lambda.amazonaws.com" 16 | }, 17 | "Action": "sts:AssumeRole" 18 | } 19 | ] 20 | }, 21 | "Path": "/", 22 | "Policies": [ 23 | { 24 | "PolicyName": "root", 25 | "PolicyDocument": { 26 | "Version": "2012-10-17", 27 | "Statement": { 28 | "Effect": "Allow", 29 | "Action": [ 30 | "ec2:*", 31 | "logs:*", 32 | "kinesis:*", 33 | "firehose:*", 34 | "s3:*" 35 | ], 36 | "Resource": "*" 37 | } 38 | } 39 | } 40 | ] 41 | } 42 | }, 43 | "FirehoseRole": { 44 | "Type": "AWS::IAM::Role", 45 | "Properties": { 46 | "RoleName": "small-files-firehoserole", 47 | "AssumeRolePolicyDocument": { 48 | "Version": "2012-10-17", 49 | "Statement": [ 50 | { 51 | "Effect": "Allow", 52 | "Principal": { 53 | "Service": "firehose.amazonaws.com" 54 | }, 55 | "Action": "sts:AssumeRole" 56 | } 57 | ] 58 | }, 59 | "Path": "/", 60 | "Policies": [ 61 | { 62 | "PolicyName": "root", 63 | "PolicyDocument": { 64 | "Version": "2012-10-17", 65 | "Statement": { 66 | "Effect": "Allow", 67 | "Action": [ 68 | "s3:*" 69 | ], 70 | "Resource": "*" 71 | } 72 | } 73 | } 74 | ] 75 | } 76 | } 77 | }, 78 | "Outputs": { 79 | "FirehoseRoleArn": { 80 | "Value": { 81 | "Fn::GetAtt": [ 82 | "FirehoseRole", 83 | "Arn" 84 | ] 85 | } 86 | }, 87 | "LambdaRoleArn": { 88 | "Value": { 89 | "Fn::GetAtt": [ 90 | "LambdaRole", 91 | "Arn" 92 | ] 93 | } 94 | } 95 | } 96 | } -------------------------------------------------------------------------------- /cloudformation-templates/step3_firehose.template: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "AWS BLOGS - Creating Amazon Kinesis Firehose Delivery Stream", 4 | "Parameters": { 5 | "Role": { 6 | "Type": "String", 7 | "Description" : "Fire Hose IAM Role ARN that was created as part of the Cloudformation template 2." 8 | 9 | }, 10 | "S3BucketARN": { 11 | "Type": "String", 12 | "Description" : "S3 Bucket ARN that was created as part of the Cloudformation template 1." 13 | } 14 | , 15 | "FirehoseDeliveryStreamName": { 16 | "Type": "String", 17 | "Description" : "Name of the Amazon Firehose delivery stream. 
Default value is set to 'AWSBlogs-LambdaToFireHose'", 18 | "Default": "AWSBlogs-LambdaToFireHose", 19 | "AllowedValues": [ 20 | "AWSBlogs-LambdaToFireHose" 21 | ] 22 | } 23 | }, 24 | "Resources": { 25 | "KinesisDeliveryStreamFromLambda": { 26 | "Type": "AWS::KinesisFirehose::DeliveryStream", 27 | "Properties": { 28 | "DeliveryStreamName": {"Ref": "FirehoseDeliveryStreamName"}, 29 | "DeliveryStreamType": "DirectPut", 30 | "ExtendedS3DestinationConfiguration": { 31 | "BufferingHints": { 32 | "IntervalInSeconds": 300, 33 | "SizeInMBs": 128 34 | }, 35 | "CompressionFormat": "UNCOMPRESSED", 36 | "BucketARN": {"Ref":"S3BucketARN"}, 37 | "Prefix": "fromfirehose/", 38 | "RoleARN": {"Ref":"Role"} 39 | } 40 | } 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /cloudformation-templates/step4_kinesisstream.template: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "AWS BLOGS - Kinesis Stream and a Lambda Function", 4 | "Parameters": { 5 | "Role": { 6 | "Description": "IAM Role created for Lambda Function as part of the 2nd CF template. Get the value from the output of 2nd CF template.", 7 | "Type": "String" 8 | }, 9 | "Region": { 10 | "Description": "AWS Region - Select us-east-1 by default.", 11 | "Type": "String", 12 | "Default": "us-east-1", 13 | "AllowedValues": [ 14 | "us-east-1" 15 | ] 16 | }, 17 | "KinesisStreamName": { 18 | "Type": "String", 19 | "Description" : "Name of the Amazon Kinesis stream. Default value is set to 'AWS-Blog-BaseKinesisStream'", 20 | "Default": "AWS-Blog-BaseKinesisStream", 21 | "AllowedValues": [ 22 | "AWS-Blog-BaseKinesisStream" 23 | ] 24 | }, 25 | "S3Bucket": { 26 | "Description": "Existing S3 Bucket name that was created using 1st CF template. 
Do not use the domain name, just provide the bucket name.", 27 | "Type": "String" 28 | } 29 | }, 30 | "Resources": { 31 | "BaseKinesisStream": { 32 | "Type": "AWS::Kinesis::Stream", 33 | "Properties": { 34 | "Name": {"Ref": "KinesisStreamName"}, 35 | "ShardCount": "10" 36 | } 37 | }, 38 | "LambdaProcessKinesisRecords": { 39 | "Type": "AWS::Lambda::Function", 40 | "DependsOn": "BaseKinesisStream", 41 | "Properties": { 42 | "Code": { 43 | "S3Bucket": "aws-bigdata-blog", 44 | "S3Key": { 45 | "Fn::Sub": "artifacts/aws-blog-avoid-small-files/appjars/kinesis-lambda-1.0-SNAPSHOT-jar-with-dependencies.jar" 46 | } 47 | }, 48 | "Description": "AWS BLOGS - Processing Incoming Kinesis Records", 49 | "FunctionName": "LambdaForProcessingKinesisRecords", 50 | "Handler": "com.awsblogs.smallfiles.lambda.ProcessKinesisRecords", 51 | "Role": { 52 | "Ref": "Role" 53 | }, 54 | "Runtime": "java8", 55 | "MemorySize": 1920, 56 | "Timeout": 300, 57 | "Environment": { 58 | "Variables": { 59 | "kinesis_region": { 60 | "Ref": "Region" 61 | }, 62 | "kinesis_stream_name": { 63 | "Ref": "BaseKinesisStream" 64 | }, 65 | "s3region": { 66 | "Ref": "Region" 67 | }, 68 | "s3bucketName": { 69 | "Ref": "S3Bucket" 70 | }, 71 | "s3directorySub": "raw-from-firehose/", 72 | "kinesisfirehosestream" : "AWSBlogs-LambdaToFireHose" 73 | } 74 | } 75 | } 76 | }, 77 | "KinesisLambdaEventTrigger": { 78 | "Type" : "AWS::Lambda::EventSourceMapping", 79 | "DependsOn": [ 80 | "BaseKinesisStream", "LambdaProcessKinesisRecords" 81 | ], 82 | "Properties" : { 83 | "BatchSize" : 100, 84 | "Enabled" : true, 85 | "EventSourceArn" : { 86 | "Fn::GetAtt": ["BaseKinesisStream", "Arn" ] 87 | }, 88 | "FunctionName" : {"Ref":"LambdaProcessKinesisRecords"}, 89 | "StartingPosition" : "TRIM_HORIZON" 90 | } 91 | } 92 | } 93 | } -------------------------------------------------------------------------------- /cloudformation-templates/step5_emr.template: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "AWS BLOGS - Creating an EMR cluster", 4 | "Parameters": { 5 | "EMRClusterName": { 6 | "Type": "String", 7 | "Description": "ClusterName" 8 | }, 9 | "ClusterSecurityGroup": { 10 | "Description": "ID of an existing security-group for the for the Amazon EMR cluster", 11 | "Type": "AWS::EC2::SecurityGroup::Id" 12 | }, 13 | "ClusterSubnetID": { 14 | "Description": "ID of an existing subnet for the Amazon EMR cluster", 15 | "Type": "AWS::EC2::Subnet::Id" 16 | }, 17 | "KeyName": { 18 | "Description": "Name of an existing EC2 key pair to access the Amazon EMR cluster", 19 | "Type": "AWS::EC2::KeyPair::KeyName" 20 | } 21 | }, 22 | "Resources": { 23 | "EMRCluster": { 24 | "Properties": 25 | { 26 | "Name": { 27 | "Ref": "EMRClusterName" 28 | }, 29 | "Applications": [{ 30 | "Name": "Spark" 31 | }, 32 | { 33 | "Name": "Ganglia" 34 | }, 35 | { 36 | "Name": "hive" 37 | } 38 | ], 39 | "Configurations": [{ 40 | "Classification": "spark", 41 | "ConfigurationProperties": { 42 | "maximizeResourceAllocation": "true" 43 | } 44 | }, 45 | { 46 | "Classification": "spark-hive-site", 47 | "ConfigurationProperties": { 48 | "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" 49 | } 50 | } 51 | ], 52 | "Instances": { 53 | "CoreInstanceGroup": { 54 | "InstanceCount": 2, 55 | "InstanceType": "r4.xlarge", 56 | "Name": "Core Instance Group" 57 | }, 58 | "Ec2KeyName": { 59 | "Ref": "KeyName" 60 | }, 61 | "Ec2SubnetId": {"Ref": 
"ClusterSubnetID"}, 62 | "MasterInstanceGroup": { 63 | "InstanceCount": 1, 64 | "InstanceType": "r4.xlarge", 65 | "Name": "Master Instance Group" 66 | }, 67 | "AdditionalMasterSecurityGroups": [{ 68 | "Ref": "ClusterSecurityGroup" 69 | }], 70 | "AdditionalSlaveSecurityGroups": [{ 71 | "Ref": "ClusterSecurityGroup" 72 | }] 73 | }, 74 | "JobFlowRole": "EMR_EC2_DefaultRole", 75 | "ServiceRole": "EMR_DefaultRole", 76 | "ReleaseLabel": "emr-5.16.0", 77 | "VisibleToAllUsers": "true" 78 | }, 79 | "Type": "AWS::EMR::Cluster" 80 | } 81 | }, 82 | "Outputs": { 83 | "EMRClusterMaster": { 84 | "Description": "SSH Connection String to EMR Master Instance", 85 | "Value" : { 86 | "Fn::Join" : [ 87 | "", 88 | [ 89 | "ssh hadoop@", 90 | { 91 | "Fn::GetAtt" : [ 92 | "EMRCluster", 93 | "MasterPublicDNS" 94 | ] 95 | }, 96 | " -i ", 97 | { 98 | "Ref" : "KeyName" 99 | }, 100 | ".pem" 101 | ] 102 | ] 103 | } 104 | } 105 | } 106 | } -------------------------------------------------------------------------------- /cloudformation-templates/step6_ec2_instance.template: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion" : "2010-09-09", 3 | "Description": "AWS BLOGS - Creating Amazon EC2 Instance For test dataset generation and loading into Kinesis.", 4 | "Parameters": 5 | { 6 | "EC2SecurityGroupId": { 7 | "Description": "Existing security Group", 8 | "Type":"AWS::EC2::SecurityGroup::Id" 9 | }, 10 | "KeyName": { 11 | "Description": "Name of an existing EC2 KeyPair to enable SSH access to the instance", 12 | "Type": "AWS::EC2::KeyPair::KeyName", 13 | "ConstraintDescription": "must be the name of an existing EC2 KeyPair." 14 | }, 15 | "EC2Subnet": { 16 | "Description": "Subnet List - Choose the Subnet", 17 | "Type": "AWS::EC2::Subnet::Id" 18 | }, 19 | "InstanceType" : { 20 | "Description" : "EC2 instance specs configuration", 21 | "Type" : "String", 22 | "Default" : "r4.xlarge", 23 | "AllowedValues" : ["r4.xlarge","r4.2xlarge", "r4.4xlarge"] 24 | } 25 | }, 26 | "Mappings": { 27 | "AWSInstanceType2Arch": { 28 | "r4.xlarge": { 29 | "Arch": "HVM64" 30 | }, 31 | "r4.2xlarge": { 32 | "Arch": "HVM64" 33 | }, 34 | "r4.4xlarge": { 35 | "Arch": "HVM64" 36 | } 37 | }, 38 | "AWSRegionArch2AMI": { 39 | "us-west-2": { 40 | "HVM64": "ami-6cd6f714" 41 | }, 42 | "us-east-1": { 43 | "HVM64": "ami-1853ac65" 44 | } 45 | } 46 | }, 47 | "Resources" : { 48 | "EC2IAMRole": { 49 | "Type": "AWS::IAM::Role", 50 | "Properties": { 51 | "RoleName": "small-files-ec2role", 52 | "AssumeRolePolicyDocument": { 53 | "Version": "2012-10-17", 54 | "Statement": [ 55 | { 56 | "Effect": "Allow", 57 | "Principal": { 58 | "Service": "ec2.amazonaws.com" 59 | }, 60 | "Action": "sts:AssumeRole" 61 | } 62 | ] 63 | }, 64 | "Path": "/", 65 | "Policies": [ 66 | { 67 | "PolicyName": "root", 68 | "PolicyDocument": { 69 | "Version": "2012-10-17", 70 | "Statement": { 71 | "Effect": "Allow", 72 | "Action": [ 73 | "ec2:*", 74 | "logs:*", 75 | "kinesis:*", 76 | "firehose:*", 77 | "s3:*" 78 | ], 79 | "Resource": "*" 80 | } 81 | } 82 | } 83 | ] 84 | } 85 | }, 86 | "EC2InstanceProfile" : { 87 | "Type" : "AWS::IAM::InstanceProfile", 88 | "DependsOn": [ 89 | "EC2IAMRole" 90 | ], 91 | "Properties" : { 92 | "Path" : "/", 93 | "Roles" : [ 94 | { 95 | "Ref" : "EC2IAMRole" 96 | } 97 | ] 98 | } 99 | }, 100 | "EC2InstanceForDataLoadingIntoKinesis" : { 101 | "Type" : "AWS::EC2::Instance", 102 | "DependsOn": [ 103 | "EC2InstanceProfile" 104 | ], 105 | "Properties" : { 106 | "KeyName" : { "Ref" : "KeyName" }, 107 | 
"InstanceType" : { "Ref" : "InstanceType" }, 108 | "ImageId" : { "Fn::FindInMap" : [ "AWSRegionArch2AMI", { "Ref" : "AWS::Region" }, 109 | { "Fn::FindInMap" : [ "AWSInstanceType2Arch", { "Ref" : "InstanceType" }, "Arch" ] } ] }, 110 | "SecurityGroupIds" : [ 111 | { 112 | "Ref" : "EC2SecurityGroupId" 113 | } 114 | ], 115 | "SubnetId": {"Ref": "EC2Subnet"}, 116 | "IamInstanceProfile": {"Ref": "EC2InstanceProfile"}, 117 | "Tags" : [ 118 | { 119 | "Key" : "Name", 120 | "Value" : "AWS-BLOGs-Small-Files-EC2-For-DataLoading" 121 | } 122 | ], 123 | "UserData" : {"Fn::Base64" : { "Fn::Join" : ["",[ 124 | "#!/bin/bash -ex","\n", 125 | "\n","sudo yum install -y java-1.8.0-openjdk-devel.x86_64","\n", 126 | "\n","aws s3 cp s3://aws-bigdata-blog/artifacts/aws-blog-optimize-downstream-data-processing/appjars/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar .","\n", 127 | "\n","sudo cp /sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar /home/ec2-user/","\n", 128 | "\n","sudo chown -R ec2-user:ec2-user /home/ec2-user/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar","\n", 129 | "\n","sudo chmod -R 755 /home/ec2-user/sample-kinesis-producer-1.0-SNAPSHOT-jar-with-dependencies.jar","\n" 130 | ]]} 131 | } 132 | } 133 | } 134 | }, 135 | "Outputs" : { 136 | "EC2Instance" : { 137 | "Description" : "EC2 IP address", 138 | "Value" : { 139 | "Fn::Join" : [ 140 | "", 141 | [ 142 | "ssh ec2-user@", 143 | { 144 | "Fn::GetAtt" : [ 145 | "EC2InstanceForDataLoadingIntoKinesis", 146 | "PublicIp" 147 | ] 148 | }, 149 | " -i ", 150 | { 151 | "Ref" : "KeyName" 152 | }, 153 | ".pem" 154 | ] 155 | ] 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /kinesis-lambda/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 1.0-SNAPSHOT 7 | com.optimize.downstream 8 | data-processing 9 | 10 | 4.0.0 11 | 12 | kinesis-lambda 13 | jar 14 | 15 | 30 | 31 | 32 | false 33 | 34 | 35 | 36 | 37 | com.amazonaws 38 | aws-lambda-java-core 39 | 1.2.0 40 | 41 | 42 | 43 | com.amazonaws 44 | aws-lambda-java-events 45 | 2.2.2 46 | 47 | 48 | 49 | com.amazonaws 50 | aws-java-sdk-core 51 | 1.11.362 52 | 53 | 54 | 55 | com.amazonaws 56 | aws-java-sdk-kinesis 57 | 1.11.362 58 | 59 | 60 | 61 | com.google.code.gson 62 | gson 63 | 2.8.0 64 | 65 | 66 | 67 | 68 | org.json 69 | json 70 | 20180813 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /kinesis-lambda/src/main/java/com/optimize/downstream/additionaldata/AdditionalIOTData.java: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.additionaldata; 2 | 3 | import java.util.*; 4 | 5 | public class AdditionalIOTData 6 | { 7 | private String hashDeviceId; 8 | private String processedDate; 9 | private List testAdditonalDataList; 10 | private Random random = new Random(); 11 | 12 | public AdditionalIOTData(String hashDeviceId, 13 | String processedDate, 14 | List testAdditonalDataList) 15 | { 16 | this.hashDeviceId = hashDeviceId; 17 | this.processedDate = processedDate; 18 | this.testAdditonalDataList = testAdditonalDataList; 19 | } 20 | 21 | public String getHashDeviceId() { 22 | return hashDeviceId; 23 | } 24 | 25 | public String getProcessedDate() { 26 | return processedDate; 27 | } 28 | 29 | public List getTestAdditonalDataList() { 30 | return testAdditonalDataList; 31 | } 32 | } 33 | 
-------------------------------------------------------------------------------- /kinesis-lambda/src/main/java/com/optimize/downstream/additionaldata/TestAdditonalData.java: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.additionaldata; 2 | 3 | public class TestAdditonalData 4 | { 5 | private int dimension_X; 6 | private int dimension_Y; 7 | private int dimension_Z; 8 | 9 | public void setDimension_Y(int dimension_Y) { 10 | this.dimension_Y = dimension_Y; 11 | } 12 | 13 | public void setDimension_Z(int dimension_Z) { 14 | this.dimension_Z = dimension_Z; 15 | } 16 | 17 | public void setDimension_X(int dimension_X) 18 | { 19 | this.dimension_X = dimension_X; 20 | } 21 | 22 | public int getDimension_X() { 23 | return dimension_X; 24 | } 25 | 26 | public int getDimension_Y() { 27 | return dimension_Y; 28 | } 29 | 30 | public int getDimension_Z() { 31 | return dimension_Z; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /kinesis-lambda/src/main/java/com/optimize/downstream/lambda/ProcessKinesisRecords.java: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.lambda; 2 | 3 | import com.amazonaws.ClientConfiguration; 4 | import com.amazonaws.regions.Region; 5 | import com.amazonaws.regions.Regions; 6 | import com.amazonaws.services.kinesisfirehose.AmazonKinesisFirehoseClient; 7 | import com.amazonaws.services.kinesisfirehose.model.PutRecordRequest; 8 | import com.amazonaws.services.kinesisfirehose.model.PutRecordResult; 9 | import com.amazonaws.services.kinesisfirehose.model.Record; 10 | import com.amazonaws.services.lambda.runtime.Context; 11 | import com.amazonaws.services.lambda.runtime.RequestHandler; 12 | import com.amazonaws.services.lambda.runtime.events.KinesisEvent; 13 | import com.amazonaws.services.lambda.runtime.events.KinesisEvent.KinesisEventRecord; 14 | import com.optimize.downstream.additionaldata.AdditionalIOTData; 15 | import com.optimize.downstream.additionaldata.TestAdditonalData; 16 | import com.google.gson.Gson; 17 | import org.json.JSONObject; 18 | import java.io.ByteArrayOutputStream; 19 | import java.io.IOException; 20 | import java.nio.ByteBuffer; 21 | import java.nio.charset.Charset; 22 | import java.text.DateFormat; 23 | import java.text.SimpleDateFormat; 24 | import java.util.*; 25 | import java.util.zip.GZIPOutputStream; 26 | 27 | public class ProcessKinesisRecords implements RequestHandler{ 28 | private static Charset charset = Charset.forName("UTF-8"); 29 | private String hashDeviceId; 30 | private String processedDate; 31 | private Random random = new Random(); 32 | private Gson gson; 33 | private AmazonKinesisFirehoseClient kinesisFirehoseClient; 34 | private String firehoseStreamName = System.getenv("kinesisfirehosestream"); 35 | private String REGION=System.getenv("kinesis_region"); 36 | 37 | @Override 38 | public Void handleRequest(KinesisEvent event, Context context) 39 | { 40 | //System.out.print("In Handle Request"); 41 | gson = new Gson(); 42 | ClientConfiguration config = new ClientConfiguration(); 43 | config.setMaxErrorRetry(5); 44 | config.setSocketTimeout(100); 45 | kinesisFirehoseClient = new AmazonKinesisFirehoseClient(config); 46 | kinesisFirehoseClient.setRegion(Region.getRegion(Regions.fromName(REGION))); 47 | String mergedJsonString = ""; 48 | String recordId; 49 | try { 50 | for (KinesisEventRecord rec : event.getRecords()) 51 | { 52 | //System.out.println(new 
String(rec.getKinesis().getData().array())); 53 | String jsonMessage = new String(rec.getKinesis().getData().array()); 54 | //System.out.println("Kinesis JSON Message is ::: "); 55 | //System.out.println(jsonMessage); 56 | AdditionalIOTData additionalDeviceMessage = generateAdditionalIOTDeviceData(); 57 | String addJson = gson.toJson(additionalDeviceMessage); 58 | //System.out.println("Additional JSON Is :: " + addJson); 59 | 60 | try { 61 | mergedJsonString = mergeJsonStrings(jsonMessage, addJson); 62 | System.out.println(mergedJsonString); 63 | System.out.println("Sending record to Firehose"); 64 | recordId = sendToFireHose(mergedJsonString); 65 | System.out.println("Record sent to Firehose. Result Record Id is : " + recordId); 66 | }catch (Exception e) 67 | { 68 | e.printStackTrace(); 69 | } 70 | } 71 | } 72 | catch (Exception ie) 73 | { 74 | ie.getStackTrace(); 75 | } 76 | return null; 77 | } 78 | 79 | private String mergeJsonStrings(String kinJsonMessage, String addJson) 80 | { 81 | JSONObject kinesisJsonObject; 82 | JSONObject addJsonObject; 83 | kinesisJsonObject = new JSONObject(kinJsonMessage); 84 | addJsonObject = new JSONObject(addJson); 85 | JSONObject mergedJson = new JSONObject(); 86 | if (kinesisJsonObject.length()>0){ 87 | mergedJson = new JSONObject(kinesisJsonObject, JSONObject.getNames(kinesisJsonObject)); 88 | } 89 | if (addJsonObject.length()>0){ 90 | for(String key : JSONObject.getNames(addJsonObject)) 91 | { 92 | mergedJson.put(key, addJsonObject.get(key)); 93 | } 94 | } 95 | return mergedJson.toString(); 96 | } 97 | 98 | private static byte[] compressMessage(byte[] inputDataMessage) throws IOException 99 | { 100 | ByteArrayOutputStream array = new ByteArrayOutputStream(); 101 | GZIPOutputStream output = new GZIPOutputStream(array); 102 | try 103 | { 104 | output.write(inputDataMessage); 105 | output.finish(); 106 | output.close(); 107 | array.close(); 108 | } 109 | catch (Exception e) 110 | { 111 | e.printStackTrace(); 112 | } 113 | return array.toByteArray(); 114 | } 115 | 116 | private AdditionalIOTData generateAdditionalIOTDeviceData() 117 | { 118 | UUID uuid = UUID.randomUUID(); 119 | hashDeviceId = uuid.toString(); 120 | DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); 121 | Date date = new Date(); 122 | processedDate = dateFormat.format(date); 123 | //AdditionalIOTData additionalIOTData = new AdditionalIOTData(hashDeviceId, processedDate, getAdditionalData()); 124 | return (new AdditionalIOTData(hashDeviceId, processedDate, getAdditionalData())); 125 | } 126 | 127 | private ArrayList getAdditionalData() 128 | { 129 | ArrayList additionalIOTDataArrayList = new ArrayList(); 130 | 131 | // Adding extra content to make the message size more than 1 MB. 132 | // The below generated data will be appended/merged with the message coming from Kineisis Stream. 133 | // It is just to demonstrate that if the message size is more than 1MB, we can gzip the message and send it to KinesisFirehose. 
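        // Kinesis Data Firehose accepts records of at most 1,000 KiB (before base64 encoding),
        // which is why sendToFireHose() gzips the merged payload via compressMessage() instead of sending it uncompressed.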
134 | for(int i =0;i <5000; i++) 135 | { 136 | TestAdditonalData t = new TestAdditonalData(); 137 | t.setDimension_X(getRandomInt(10,1)); 138 | t.setDimension_Y(getRandomInt(10,1)); 139 | t.setDimension_Z(getRandomInt(10,1)); 140 | additionalIOTDataArrayList.add(t); 141 | } 142 | return additionalIOTDataArrayList; 143 | } 144 | 145 | private String sendToFireHose(String mergedJsonString) 146 | { 147 | PutRecordResult res = null; 148 | try { 149 | //To Firehose - 150 | System.out.println("MESSAGE SIZE BEFORE COMPRESSION IS : " + mergedJsonString.toString().getBytes(charset).length); 151 | System.out.println("MESSAGE SIZE AFTER GZIP COMPRESSION IS : " + compressMessage(mergedJsonString.toString().getBytes(charset)).length); 152 | PutRecordRequest req = new PutRecordRequest() 153 | .withDeliveryStreamName(firehoseStreamName); 154 | 155 | // Without compression - Send to Firehose 156 | //Record record = new Record().withData(ByteBuffer.wrap((mergedJsonString.toString() + "\r\n").getBytes())); 157 | 158 | // With compression - send to Firehose 159 | Record record = new Record().withData(ByteBuffer.wrap(compressMessage((mergedJsonString.toString() + "\r\n").getBytes()))); 160 | req.setRecord(record); 161 | res = kinesisFirehoseClient.putRecord(req); 162 | } 163 | catch (IOException ie) { 164 | ie.printStackTrace(); 165 | } 166 | return res.getRecordId(); 167 | } 168 | private int getRandomInt(int max, int min) 169 | { 170 | return random.nextInt(max - min + 1) + min; 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.optimize.downstream 8 | data-processing 9 | pom 10 | 1.0-SNAPSHOT 11 | 12 | 13 | kinesis-lambda 14 | spark-process 15 | sample-kinesis-producer 16 | 17 | 18 | 19 | true 20 | 21 | 22 | 23 | 24 | 25 | maven-assembly-plugin 26 | 2.6 27 | 28 | 29 | jar-with-dependencies 30 | 31 | ${skip.assembly} 32 | 33 | 34 | 35 | make-assembly 36 | package 37 | 38 | single 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 92 | 93 | -------------------------------------------------------------------------------- /sample-kinesis-producer/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.optimize.downstream 7 | data-processing 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | 13 | sample-kinesis-producer 14 | jar 15 | 16 | 17 | false 18 | 19 | 20 | 21 | 22 | com.amazonaws 23 | aws-lambda-java-core 24 | 1.2.0 25 | 26 | 27 | 28 | com.amazonaws 29 | aws-lambda-java-events 30 | 2.2.2 31 | 32 | 33 | 34 | com.amazonaws 35 | aws-java-sdk-core 36 | 1.11.362 37 | 38 | 39 | 40 | com.amazonaws 41 | aws-java-sdk-kinesis 42 | 1.11.362 43 | 44 | 45 | 46 | com.amazonaws 47 | amazon-kinesis-producer 48 | 0.12.9 49 | 50 | 51 | 52 | com.googlecode.json-simple 53 | json-simple 54 | 1.1.1 55 | 56 | 57 | 58 | 59 | org.fluttercode.datafactory 60 | datafactory 61 | 0.8 62 | 63 | 64 | 65 | org.apache.commons 66 | commons-lang3 67 | 3.5 68 | 69 | 70 | 71 | com.google.code.gson 72 | gson 73 | 2.8.0 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /sample-kinesis-producer/src/main/java/com/optimize/downstream/datagenerator/GenerateDataMain.java: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.datagenerator; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import 
java.util.UUID; 6 | 7 | 8 | public class GenerateDataMain 9 | { 10 | 11 | public ArrayList getDeviceIDS(int totalDeviceIds) 12 | { 13 | ArrayList listOfUUIDS = new ArrayList(); 14 | for(int i =0; i< totalDeviceIds; i++) 15 | { 16 | UUID uuid = UUID.randomUUID(); 17 | listOfUUIDS.add(uuid.toString()); 18 | } 19 | return listOfUUIDS; 20 | } 21 | 22 | public void kickAllWorkers() 23 | { 24 | ArrayList allDeviceIds = getDeviceIDS(5); 25 | List workers = new ArrayList(); 26 | 27 | for (int i=0; i inputQueue = new LinkedBlockingDeque(); 29 | 30 | Thread[] consumerThread = new Thread[numberOfThreads]; 31 | Thread producerThread = new Thread(new IOTDeviceProducerToBlockingQueue(inputQueue, numberOfMessages, numberOfSamplesInEachMessage)); 32 | System.out.println("Starting producer and consumer....."); 33 | producerThread.start(); 34 | 35 | for (int i = 0; i < numberOfThreads; i++) { 36 | consumerThread[i] = new Thread(new IOTDeviceConsumerFromBlockingQueueToKinesisStreams(inputQueue)); 37 | consumerThread[i].start(); 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /sample-kinesis-producer/src/main/java/com/optimize/downstream/samplekinesisproducer/IOTDevice.java: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.samplekinesisproducer; 2 | 3 | import com.optimize.downstream.sensors.Accelerometer; 4 | import com.optimize.downstream.sensors.GPS; 5 | import com.optimize.downstream.sensors.Illuminance; 6 | import com.optimize.downstream.sensors.TemperatureSensor; 7 | 8 | import java.io.Serializable; 9 | import java.util.List; 10 | 11 | public class IOTDevice implements Serializable 12 | { 13 | private String deviceId; 14 | private String currentDate; 15 | private List accelerometerSensorList; 16 | private List gpsSensorList; 17 | private List tempSensorList; 18 | private List illuminancesSensorList; 19 | 20 | IOTDevice(String deviceId, 21 | String currentDate, 22 | List accelerometerArrayList, 23 | List gpsArrayList, 24 | List temperatureSensorArrayList, 25 | List illuminanceArrayList 26 | ) 27 | { 28 | this.deviceId = deviceId; 29 | this.currentDate = currentDate; 30 | this.accelerometerSensorList = accelerometerArrayList; 31 | this.gpsSensorList = gpsArrayList; 32 | this.tempSensorList = temperatureSensorArrayList; 33 | this.illuminancesSensorList = illuminanceArrayList; 34 | 35 | } 36 | 37 | public String getDeviceId() 38 | { 39 | return deviceId; 40 | } 41 | 42 | public String getCurrentDate() 43 | { 44 | return currentDate; 45 | } 46 | 47 | 48 | } 49 | -------------------------------------------------------------------------------- /sample-kinesis-producer/src/main/java/com/optimize/downstream/samplekinesisproducer/IOTDeviceConsumerFromBlockingQueueToKinesisStreams.java: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.samplekinesisproducer; 2 | 3 | import java.nio.ByteBuffer; 4 | import java.nio.charset.Charset; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.concurrent.BlockingQueue; 8 | import com.amazonaws.regions.Regions; 9 | import com.amazonaws.services.kinesis.AmazonKinesis; 10 | import com.amazonaws.services.kinesis.AmazonKinesisClient; 11 | import com.amazonaws.services.kinesis.model.PutRecordsRequest; 12 | import com.amazonaws.services.kinesis.model.PutRecordsRequestEntry; 13 | import com.google.gson.Gson; 14 | 15 | public class 
IOTDeviceConsumerFromBlockingQueueToKinesisStreams implements Runnable 16 | { 17 | private AmazonKinesis kinesis; 18 | private List entries; 19 | private BlockingQueue inputQueue; 20 | private int dataSize; 21 | Gson gson; 22 | private final String STREAM_NAME = "AWS-Blog-BaseKinesisStream"; 23 | private final String REGION = "us-east-1"; 24 | private static Charset charset = Charset.forName("UTF-8"); 25 | 26 | public IOTDeviceConsumerFromBlockingQueueToKinesisStreams(BlockingQueue inputQueue) 27 | { 28 | gson = new Gson(); 29 | this.inputQueue = inputQueue; 30 | kinesis = new AmazonKinesisClient().withRegion(Regions.fromName(REGION)); 31 | entries = new ArrayList(); 32 | dataSize = 0; 33 | } 34 | 35 | @Override 36 | public void run() 37 | { 38 | long threadId = Thread.currentThread().getId(); 39 | System.out.println("Thread # " + threadId + " is doing this task"); 40 | while(!inputQueue.isEmpty()) 41 | { 42 | try { 43 | IOTDevice deviceMessage = inputQueue.take(); 44 | String partitionKey = deviceMessage.getDeviceId(); 45 | 46 | String json = gson.toJson(deviceMessage); 47 | 48 | //System.out.println("================= JSON String IS ================"); 49 | //System.out.println(json); 50 | //System.out.println("Partition Key / Device Id before inserting into Kinesis stream is : " + partitionKey); 51 | 52 | //System.out.println("SRIKANTH : SIZE IS : " + json.getBytes(charset).length); 53 | //ByteBuffer data = ByteBuffer.wrap(SerializationUtils.serialize(deviceMessage)); 54 | ByteBuffer data = ByteBuffer.wrap(json.getBytes()); 55 | pushToKinesis(new PutRecordsRequestEntry().withPartitionKey(partitionKey).withData(data)); 56 | } catch (Exception e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | System.exit(0); 61 | } 62 | private void flush() { 63 | System.out.println("Sending a record to Kinesis Stream with " + entries.size() + " messages grouped together."); 64 | kinesis.putRecords(new PutRecordsRequest() 65 | .withStreamName(STREAM_NAME) 66 | .withRecords(entries)); 67 | entries.clear(); 68 | 69 | } 70 | private void pushToKinesis(PutRecordsRequestEntry entry) 71 | { 72 | /*System.out.println("==================================================================="); 73 | System.out.println("Data Size is : " + dataSize); 74 | System.out.println("Remaining Data is : " + entry.getData().remaining()); 75 | System.out.println("Partition Key length is : " + entry.getPartitionKey().length());*/ 76 | 77 | int newDataSize = dataSize + entry.getData().remaining() + 78 | entry.getPartitionKey().length(); 79 | if (newDataSize <= 5 * 1024 * 1024 && entries.size() < 500) 80 | { 81 | dataSize = newDataSize; 82 | entries.add(entry); 83 | //System.out.println("Data size is : " + dataSize ); 84 | } 85 | else { 86 | //System.out.println("In Else : Entries size is : " + entries.size() + " --- New Data size is ::: " + newDataSize); 87 | //System.out.println("Sending records to Kinesis Stream... Size is ::: " + dataSize); 88 | /*kinesis.putRecords(new PutRecordsRequest() 89 | .withStreamName(STREAM_NAME) 90 | .withRecords(entry));*/ 91 | flush(); 92 | System.out.println("Record sent to Kinesis Stream. 
Record size is ::: " + dataSize + " KB"); 93 | dataSize = 0; 94 | pushToKinesis(entry); 95 | } 96 | } 97 | 98 | /*private String generateJSONObject(IOTDevice deviceMessage) 99 | { 100 | GPS gps; 101 | TemperatureSensor tempSensor; 102 | 103 | JSONObject mainObj = new JSONObject(); 104 | mainObj.put("deviceid", deviceMessage.getDeviceId()); 105 | mainObj.put("currentDate", deviceMessage.getCurrentDate()); 106 | mainObj.put("accelerometerreadings", getAccelerometerReadings(deviceMessage.accelerometerSensor)); 107 | mainObj.put("gpsreadings", getGPSReadings(deviceMessage.gpsSensor)); 108 | mainObj.put("temperaturereadings", getTemperatureReadings(deviceMessage.tempSensor)); 109 | mainObj.put("illuminancereadings", getIlluminanceReadings(deviceMessage.illuminancesSensor)); 110 | 111 | return mainObj.toJSONString(); 112 | } 113 | 114 | private JSONArray getAccelerometerReadings(ArrayList acc) 115 | { 116 | JSONArray accelerometerReadings = new JSONArray(); 117 | for (Accelerometer a : acc) 118 | { 119 | JSONObject accelerometerObj = new JSONObject(); 120 | 121 | accelerometerObj.put("accelerometer_X", a.getAccelerometer_X()); 122 | 123 | accelerometerObj.put("accelerometer_Y", a.getAccelerometer_Y()); 124 | accelerometerObj.put("accelerometer_Z", a.getAccelerometer_Z()); 125 | 126 | accelerometerObj.put("linearAccelerationSensor_X", a.getLinearAccelerationSensor_X()); 127 | accelerometerObj.put("linearAccelerationSensor_Y", a.getLinearAccelerationSensor_Y()); 128 | accelerometerObj.put("linearAccelerationSensor_Z", a.getLinearAccelerationSensor_Z()); 129 | 130 | accelerometerObj.put("gravitySensor_X", a.getGravitySensor_X()); 131 | accelerometerObj.put("gravitySensor_Y", a.getGravitySensor_Y()); 132 | accelerometerObj.put("gravitySensor_Z", a.getGravitySensor_Z()); 133 | 134 | accelerometerReadings.add(accelerometerObj); 135 | } 136 | return accelerometerReadings; 137 | } 138 | 139 | private JSONArray getTemperatureReadings(ArrayList temp) 140 | { 141 | JSONArray temperatureReadings = new JSONArray(); 142 | for (TemperatureSensor t : temp) 143 | { 144 | JSONObject accelerometerObj = new JSONObject(); 145 | accelerometerObj.put("celcius", t.getCelsius()); 146 | accelerometerObj.put("fahrenheit", t.getFahrenheit()); 147 | accelerometerObj.put("kelvin", t.getKelvin()); 148 | 149 | temperatureReadings.add(accelerometerObj); 150 | } 151 | return temperatureReadings; 152 | } 153 | 154 | private JSONArray getGPSReadings(ArrayList gps) 155 | { 156 | JSONArray gpsReadings = new JSONArray(); 157 | int gpsLength = gps.size(); 158 | 159 | for (GPS g : gps) 160 | { 161 | JSONObject gpsObj = new JSONObject(); 162 | gpsObj.put("altitude", g.getAltitude()); 163 | gpsObj.put("heading", g.getHeading()); 164 | gpsObj.put("lat", g.getLatitude()); 165 | gpsObj.put("long", g.getLongitude()); 166 | 167 | gpsReadings.add(gpsObj); 168 | } 169 | return gpsReadings; 170 | } 171 | 172 | private JSONArray getIlluminanceReadings(ArrayList illuminances) 173 | { 174 | JSONArray illuminancesReadings = new JSONArray(); 175 | int gpsLength = illuminances.size(); 176 | 177 | for (Illuminance i : illuminances) 178 | { 179 | JSONObject illuminancesObj = new JSONObject(); 180 | illuminancesObj.put("illuminance", i.getIlluminance()); 181 | illuminancesReadings.add(illuminancesObj); 182 | } 183 | return illuminancesReadings; 184 | }*/ 185 | } 186 | -------------------------------------------------------------------------------- 
/sample-kinesis-producer/src/main/java/com/optimize/downstream/samplekinesisproducer/IOTDeviceProducerToBlockingQueue.java: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.samplekinesisproducer; 2 | 3 | import com.optimize.downstream.sensors.Accelerometer; 4 | import com.optimize.downstream.sensors.GPS; 5 | import com.optimize.downstream.sensors.Illuminance; 6 | import com.optimize.downstream.sensors.TemperatureSensor; 7 | 8 | 9 | import java.text.DateFormat; 10 | import java.text.SimpleDateFormat; 11 | import java.util.ArrayList; 12 | import java.util.Date; 13 | import java.util.Random; 14 | import java.util.UUID; 15 | import java.util.concurrent.BlockingQueue; 16 | import java.util.concurrent.atomic.AtomicLong; 17 | 18 | public class IOTDeviceProducerToBlockingQueue implements Runnable 19 | { 20 | private final BlockingQueue inputQueue; 21 | private volatile boolean shutdown = false; 22 | private final AtomicLong recordsPut = new AtomicLong(0); 23 | private Random random = new Random(); 24 | private int numberOfSamplesInEachMessage; 25 | private int numberOfMessages; 26 | 27 | public IOTDeviceProducerToBlockingQueue(BlockingQueue inputQueue, int numberOfMessages, int numberOfSamplesInEachMessage) 28 | { 29 | this.inputQueue = inputQueue; 30 | this.numberOfSamplesInEachMessage = numberOfSamplesInEachMessage; 31 | this.numberOfMessages = numberOfMessages; 32 | } 33 | 34 | public void run() { 35 | long threadId = Thread.currentThread().getId(); 36 | //System.out.println("Thread # " + threadId + " is doing this task"); 37 | 38 | //while (!shutdown) { 39 | for(int i=0;i getAccelerometerList() 84 | { 85 | ArrayList accelerometerArrayList = new ArrayList(); 86 | 87 | for(int i =0;i getGpsArrayList() 109 | { 110 | ArrayList gpsArrayList = new ArrayList(); 111 | 112 | for(int i =0; i< numberOfSamplesInEachMessage; i++) 113 | { 114 | GPS g = new GPS(); 115 | g.setAltitude(getRandomDouble(1,10)); 116 | g.setHeading(getRandomDouble(1,10)); 117 | g.setLatitude(Math.random() * Math.PI * 2); 118 | g.setLongitude(Math.acos(Math.random() * 2 - 1)); 119 | 120 | gpsArrayList.add(g); 121 | } 122 | 123 | return gpsArrayList; 124 | } 125 | 126 | private ArrayList getIlluminanceArrayList() 127 | { 128 | ArrayList illuminanceArrayList = new ArrayList(); 129 | 130 | for(int i =0; i< numberOfSamplesInEachMessage; i++) 131 | { 132 | Illuminance il = new Illuminance(); 133 | il.setIlluminance(getRandomDouble(1,100)); 134 | illuminanceArrayList.add(il); 135 | } 136 | 137 | return illuminanceArrayList; 138 | } 139 | 140 | private ArrayList getTemperatureSensorArrayList() 141 | { 142 | ArrayList temperatureSensorArrayList = new ArrayList(); 143 | 144 | for (int i=0;i 2 | 5 | 6 | com.optimize.downstream 7 | data-processing 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | spark-process 13 | jar 14 | 15 | 59 | 60 | 61 | false 62 | 63 | 64 | 72 | 73 | 74 | 75 | 76 | 77 | org.apache.spark 78 | spark-core_2.11 79 | 2.3.1 80 | 81 | 82 | 83 | 84 | org.apache.spark 85 | spark-sql_2.11 86 | 2.3.1 87 | 88 | 89 | 90 | 91 | org.scala-lang 92 | scala-reflect 93 | 2.11.8 94 | 95 | 96 | 97 | 98 | com.amazonaws 99 | aws-java-sdk 100 | 1.11.390 101 | 102 | 103 | 104 | 105 | commons-io 106 | commons-io 107 | 2.7 108 | 109 | 110 | 111 | 112 | 113 | 114 | src/main/scala 115 | src/test/scala 116 | 117 | 118 | org.scala-tools 119 | maven-scala-plugin 120 | 121 | 122 | 123 | compile 124 | testCompile 125 | 126 | 127 | 128 | 129 | 2.11.8 130 | 131 | 132 | 133 | 134 | 
-------------------------------------------------------------------------------- /spark-process/src/main/scala/com/optimize/downstream/process/ProcessFilesFromS3AndConvertToParquet.scala: -------------------------------------------------------------------------------- 1 | package com.optimize.downstream.process 2 | 3 | import java.io.InputStream 4 | import java.util.zip.GZIPInputStream 5 | import com.amazonaws.auth.BasicAWSCredentials 6 | import com.amazonaws.services.s3.AmazonS3Client 7 | import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing} 8 | import org.apache.spark.sql._ 9 | 10 | import scala.collection.JavaConversions.{collectionAsScalaIterable => asScala} 11 | import scala.io.Source 12 | 13 | object ProcessFilesFromS3AndConvertToParquet { 14 | 15 | val pageLength = 1000 16 | def main(args: Array[String]) = 17 | { 18 | if(args.length != 3) 19 | { 20 | println("Requires 3 parameters") 21 | println("Usage: ") 22 | System.exit(-1) 23 | } 24 | val s3BucketName = args(0) 25 | val s3InputLocation = args(1) 26 | val s3OutputLocation = args(2) 27 | 28 | //def s3Client = new AmazonS3Client(new BasicAWSCredentials(accesskeyID, secretAccessKey)) 29 | def s3Client = new AmazonS3Client() 30 | 31 | val spark = SparkSession 32 | .builder() 33 | .appName("AWS-Small-Blogs-Job") 34 | .getOrCreate() 35 | 36 | val request = new ListObjectsRequest() 37 | request.setBucketName(s3BucketName) 38 | request.setPrefix(s3InputLocation) //Get the prefix part only 39 | request.setMaxKeys(pageLength) 40 | 41 | 42 | var objs= new ObjectListing() 43 | objs = s3Client.listObjects(request) 44 | val s3ObjectKeys = objs.getObjectSummaries.map(x => x.getKey).toList 45 | println("Printing the keys") 46 | s3ObjectKeys.foreach { println } 47 | 48 | val allLinesRDD = spark.sparkContext.parallelize(s3ObjectKeys).flatMap 49 | { key => Source.fromInputStream(new GZIPInputStream(s3Client.getObject(s3BucketName, key).getObjectContent: InputStream)).getLines } 50 | 51 | var finalDF = spark.read.json(allLinesRDD).toDF() 52 | 53 | while(objs.isTruncated()) 54 | { 55 | objs = s3Client.listNextBatchOfObjects(objs) 56 | val s3ObjectKeys = objs.getObjectSummaries.map(x => x.getKey).toList 57 | //println("Printing the keys") 58 | s3ObjectKeys.foreach { println } 59 | val allLinesRDD = spark.sparkContext.parallelize(s3ObjectKeys).flatMap 60 | { key => Source.fromInputStream(new GZIPInputStream(s3Client.getObject(s3BucketName, key).getObjectContent: InputStream)).getLines } 61 | 62 | val allLines = spark.read.json(allLinesRDD).toDF() 63 | finalDF = finalDF.union(allLines) 64 | } 65 | finalDF.write 66 | .mode("append") 67 | .parquet("s3://" + s3BucketName + "/" + s3OutputLocation) 68 | } 69 | } 70 | --------------------------------------------------------------------------------
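To run the Spark job above, SSH to the EMR master node (the EMRClusterMaster output of step5_emr.template prints the connection string) and submit the assembled jar built by build_and_push_to_s3.sh. A minimal sketch; the bucket name and output prefix are placeholders to replace, and the input prefix assumes the fromfirehose/ prefix configured in step3_firehose.template:

spark-submit --class com.optimize.downstream.process.ProcessFilesFromS3AndConvertToParquet \
    spark-process-1.0-SNAPSHOT-jar-with-dependencies.jar \
    <your-s3-bucket> fromfirehose/ parquet-output/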