├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE-2.0.txt ├── README.md ├── Vagrantfile ├── config ├── .gitignore └── config.hocon.sample ├── docs ├── dynamodb-table-image.png └── spark-ui-image.png ├── project ├── BuildSettings.scala ├── Dependencies.scala ├── SparkStreamingExampleProjectBuild.scala ├── build.properties └── plugins.sbt ├── src └── main │ └── scala │ └── com.snowplowanalytics.spark │ ├── package.scala │ └── streaming │ ├── SimpleEvent.scala │ ├── StreamingCounts.scala │ ├── StreamingCountsApp.scala │ ├── StreamingCountsConfig.scala │ ├── kinesis │ └── KinesisUtils.scala │ └── storage │ ├── BucketingStrategy.scala │ └── DynamoUtils.scala ├── tasks.py └── vagrant ├── .gitignore ├── ansible.hosts ├── peru.yaml ├── up.bash ├── up.guidance └── up.playbooks /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Python 16 | __pycache__/ 17 | *.py[cod] 18 | 19 | # Vagrant 20 | .vagrant 21 | 22 | *.class 23 | *.log 24 | 25 | # Scala-IDE specific 26 | .scala_dependencies 27 | .worksheet 28 | 29 | # Spark build 30 | master.zip 31 | spark-master/ 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | jdk: 5 | - oraclejdk7 6 | - openjdk6 7 | - openjdk7 8 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | Version 0.1.0 (2015-06-10) 2 | -------------------------- 3 | Initial release -------------------------------------------------------------------------------- /LICENSE-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Streaming Example Project 2 | 3 | [ ![Build Status] [travis-image] ] [travis] [ ![Release] [release-image] ] [releases] [ ![License] [license-image] ] [license] 4 | 5 | ## Introduction 6 | 7 | This is a simple time series analysis stream processing job ([introductory blog post] [blog-post]) written in Scala for the [Spark Streaming] [spark-streaming] cluster computing platform, processing JSON events from [Amazon Kinesis] [kinesis] and writing aggregates to [Amazon DynamoDB] [dynamodb]. 8 | 9 | This was built by the Data Science team at [Snowplow Analytics] [snowplow], who use Spark Streaming in their client projects. 10 | 11 | **Running this requires an Amazon AWS account, and it will incur charges.** 12 | 13 | _See also:_ [Spark Example Project] [spark-example-project] | [AWS Lambda Example Project] [aws-lambda-example-project] 14 | 15 | ## Overview 16 | 17 | We have implemented a super-simple analytics-on-write stream processing job using Spark Streaming. Our Spark Streaming job reads a Kinesis stream containing events in a JSON format: 18 | 19 | ```json 20 | { 21 | "timestamp": "2015-06-05T12:54:43.064528", 22 | "type": "Green", 23 | "id": "4ec80fb1-0963-4e35-8f54-ce760499d974" 24 | } 25 | ``` 26 | 27 | Our job counts the events by `type` and aggregates these counts into 1 minute buckets. The job then takes these aggregates and saves them into a table in DynamoDB: 28 | 29 | ![dynamodb-table-image][dynamodb-table-image] 30 | 31 | ## Developer Quickstart 32 | 33 | Assuming git, [Vagrant] [vagrant-install] and [VirtualBox] [virtualbox-install] installed: 34 | 35 | ```bash 36 | host$ git clone https://github.com/snowplow/spark-streaming-example-project.git 37 | host$ cd spark-streaming-example-project 38 | host$ vagrant up && vagrant ssh 39 | guest$ cd /vagrant 40 | guest$ sbt compile 41 | ``` 42 | 43 | ## Tutorial 44 | 45 | You can follow along in [the release blog post] [blog-post] to get the project up and running yourself. 46 | 47 | The below steps assume that you are running inside Vagrant, as per the Developer Quickstart above. 48 | 49 | ### 1. Setting up AWS 50 | 51 | First we need to configure a default AWS profile: 52 | 53 | ```bash 54 | $ aws configure 55 | AWS Access Key ID [None]: ... 56 | AWS Secret Access Key [None]: ... 57 | Default region name [None]: us-east-1 58 | Default output format [None]: json 59 | ``` 60 | 61 | Now we create our Kinesis event stream: 62 | 63 | ```bash 64 | $ inv create_kinesis_stream default my-stream 65 | ``` 66 | 67 | Wait a minute and then: 68 | 69 | ```bash 70 | $ inv describe_kinesis_stream default my-stream 71 | { 72 | "StreamDescription": { 73 | "StreamStatus": "ACTIVE", 74 | "StreamName": "my-stream", 75 | "StreamARN": "arn:aws:kinesis:us-east-1:719197435995:stream/my-stream", 76 | "Shards": [ 77 | { 78 | "ShardId": "shardId-000000000000", 79 | "HashKeyRange": { 80 | "EndingHashKey": "340282366920938463463374607431768211455", 81 | "StartingHashKey": "0" 82 | }, 83 | "SequenceNumberRange": { 84 | "StartingSequenceNumber": "49551350243544458458477304430170758137221526998466166786" 85 | } 86 | } 87 | ] 88 | } 89 | } 90 | ``` 91 | 92 | If the Kinesis response says that the stream is still being created, wait a minute and then try again. 
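If you would rather wait programmatically, the short sketch below polls the stream status using boto, the same library `tasks.py` uses. Note this helper is not part of the project's task file — it simply re-runs the same describe call (with the profile, region and stream names used above) until the stream reports `ACTIVE`:

```python
# Hypothetical helper, not included in tasks.py: poll the Kinesis stream
# until it becomes ACTIVE, mirroring the connection set up in generate_events.
import time
from boto import kinesis

conn = kinesis.connect_to_region("us-east-1", profile_name="default")
while True:
    description = conn.describe_stream("my-stream")["StreamDescription"]
    print "Stream status: {}".format(description["StreamStatus"])
    if description["StreamStatus"] == "ACTIVE":
        break
    time.sleep(10)
```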
93 | 94 | Now create our DynamoDB table: 95 | 96 | ```bash 97 | $ inv create_dynamodb_table default us-east-1 my-table 98 | ``` 99 | 100 | ### 2. Sending events to Kinesis 101 | 102 | We need to start sending events to our new Kinesis stream. We have created a helper method to do this - run the below and leave it running: 103 | 104 | ```bash 105 | $ inv generate_events default us-east-1 my-stream 106 | Event sent to Kinesis: {"timestamp": "2015-06-05T12:54:43.064528", "type": "Green", "id": "4ec80fb1-0963-4e35-8f54-ce760499d974"} 107 | Event sent to Kinesis: {"timestamp": "2015-06-05T12:54:43.757797", "type": "Red", "id": "eb84b0d1-f793-4213-8a65-2fb09eab8c5c"} 108 | Event sent to Kinesis: {"timestamp": "2015-06-05T12:54:44.295972", "type": "Yellow", "id": "4654bdc8-86d4-44a3-9920-fee7939e2582"} 109 | ... 110 | ``` 111 | 112 | Now open up a separate terminal for the rest of the setup. 113 | 114 | ### 3. Running our job on Spark Streaming 115 | 116 | First we need to build Spark Streaming with Kinesis support. This can take up to 90 minutes: 117 | 118 | ```bash 119 | $ inv build_spark 120 | ... 121 | [INFO] Spark Kinesis Integration ......................... SUCCESS [1:11.115s] 122 | ... 123 | [INFO] ------------------------------------------------------------------------ 124 | [INFO] BUILD SUCCESS 125 | [INFO] ------------------------------------------------------------------------ 126 | [INFO] Total time: 1:29:00.686s 127 | [INFO] Finished at: Sun Jun 07 00:32:09 UTC 2015 128 | [INFO] Final Memory: 94M/665M 129 | [INFO] ------------------------------------------------------------------------ 130 | ``` 131 | 132 | Now we build our application. This should take closer to 10 minutes: 133 | 134 | ```bash 135 | $ inv build_project 136 | ... 137 | ``` 138 | 139 | Finally we can submit our job to Spark with this command: 140 | 141 | ```bash 142 | $ inv run_project config/config.hocon.sample 143 | ... 144 | ``` 145 | 146 | If you have updated any of the configuration options above (e.g. stream name or region), then you will have to update the `config.hocon.sample` file accordingly. 147 | 148 | ### 4. Monitoring your job 149 | 150 | First review the spooling output of the `run_project` command above - it's very verbose, but if you don't see any Java stack traces in there, then Spark Streaming should be running okay. 151 | 152 | Now head over to your host machine's [localhost:4040] [localhost-4040] and you should see something like this: 153 | 154 | ![spark-ui-image][spark-ui-image] 155 | 156 | You can see how our Spark Streaming job _discretizes_ the Kinesis event stream into 2-second-duration "micro-batches", which are each then processed as a discrete Spark job. 157 | 158 | Finally, let's check the data in our DynamoDB table. Make sure you are in the correct AWS region, then click on `my-table` and hit the `Explore Table` button: 159 | 160 | ![dynamodb-table-image][dynamodb-table-image] 161 | 162 | For each **BucketStart** and **EventType** pair, we see a **Count**, plus some **CreatedAt** and **UpdatedAt** metadata for debugging purposes. Our bucket size is 1 minute, and we have 5 discrete event types, hence the matrix of rows that we see. 163 | 164 | ## Roadmap 165 | 166 | * Porting this job to [AWS Lambda] [aws-lambda-example-project] 167 | * Various improvements for the [0.2.0 release] [020-milestone] 168 | * Expanding our analytics-on-write thinking into our new [Icebucket] [icebucket] project 169 | 170 | ## Copyright and license 171 | 172 | Copyright 2015 Snowplow Analytics Ltd. 
173 | 174 | Licensed under the [Apache License, Version 2.0] [license] (the "License"); 175 | you may not use this software except in compliance with the License. 176 | 177 | Unless required by applicable law or agreed to in writing, software 178 | distributed under the License is distributed on an "AS IS" BASIS, 179 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 180 | See the License for the specific language governing permissions and 181 | limitations under the License. 182 | 183 | [travis]: https://travis-ci.org/snowplow/spark-streaming-example-project 184 | [travis-image]: https://travis-ci.org/snowplow/spark-streaming-example-project.png?branch=master 185 | [license-image]: http://img.shields.io/badge/license-Apache--2-blue.svg?style=flat 186 | [license]: http://www.apache.org/licenses/LICENSE-2.0 187 | [release-image]: http://img.shields.io/badge/release-0.1.0-blue.svg?style=flat 188 | [releases]: https://github.com/snowplow/spark-streaming-example-project/releases 189 | 190 | [blog-post]: http://snowplowanalytics.com/blog/2015/06/10/spark-streaming-example-project-0.1.0-released/ 191 | 192 | [dynamodb-table-image]: /docs/dynamodb-table-image.png?raw=true 193 | [spark-ui-image]: /docs/spark-ui-image.png?raw=true 194 | 195 | [spark-streaming]: https://spark.apache.org/streaming/ 196 | [kinesis]: http://aws.amazon.com/kinesis 197 | [dynamodb]: http://aws.amazon.com/dynamodb 198 | [snowplow]: http://snowplowanalytics.com 199 | [icebucket]: https://github.com/snowplow/icebucket 200 | 201 | [vagrant-install]: http://docs.vagrantup.com/v2/installation/index.html 202 | [virtualbox-install]: https://www.virtualbox.org/wiki/Downloads 203 | 204 | [spark-example-project]: https://github.com/snowplow/spark-example-project 205 | [aws-lambda-example-project]: https://github.com/snowplow/aws-lambda-example-project 206 | 207 | [localhost-4040]: http://localhost:4040/ 208 | 209 | [020-milestone]: https://github.com/snowplow/spark-streaming-example-project/milestones/Version%200.2.0 210 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | Vagrant.configure("2") do |config| 2 | 3 | config.vm.box = "ubuntu/trusty64" 4 | config.vm.hostname = "spark-streaming-example-project" 5 | config.ssh.forward_agent = true 6 | 7 | # Forward guest port 4040 to host port 4040 (for Spark web UI) 8 | config.vm.network "forwarded_port", guest: 4040, host: 4040 9 | 10 | config.vm.provider :virtualbox do |vb| 11 | vb.name = Dir.pwd().split("/")[-1] + "-" + Time.now.to_f.to_i.to_s 12 | vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] 13 | vb.customize [ "guestproperty", "set", :id, "--timesync-threshold", 10000 ] 14 | # Scala is memory-hungry 15 | vb.memory = 8000 16 | end 17 | 18 | config.vm.provision :shell do |sh| 19 | sh.path = "vagrant/up.bash" 20 | end 21 | 22 | end 23 | -------------------------------------------------------------------------------- /config/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !config.hocon.sample 3 | -------------------------------------------------------------------------------- /config/config.hocon.sample: -------------------------------------------------------------------------------- 1 | ################################### 2 | # Sample configuration for # 3 | # spark-streaming-example-project # 4 | ################################### 5 | 6 | 7 | kinesis { 8 | 9 | 
streamName: "my-stream" 10 | 11 | region: "us-east-1" 12 | 13 | } 14 | 15 | 16 | spark { 17 | 18 | appName: "StreamingCountsApp" 19 | 20 | checkpointInterval: 10 # Secs 21 | 22 | master: "local[2]" # At least 2 threads 23 | 24 | batchInterval: 2000 # Ms 25 | 26 | } 27 | 28 | 29 | dynamodb { 30 | 31 | tableName: "my-table" 32 | 33 | } 34 | 35 | 36 | aws { 37 | 38 | awsProfile: "default" 39 | 40 | } -------------------------------------------------------------------------------- /docs/dynamodb-table-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snowplow-archive/spark-streaming-example-project/836eefb768c83663cb0c4ead27f08cfaeec0e352/docs/dynamodb-table-image.png -------------------------------------------------------------------------------- /docs/spark-ui-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snowplow-archive/spark-streaming-example-project/836eefb768c83663cb0c4ead27f08cfaeec0e352/docs/spark-ui-image.png -------------------------------------------------------------------------------- /project/BuildSettings.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object BuildSettings { 17 | 18 | // Basic settings for our app 19 | lazy val basicSettings = Seq[Setting[_]]( 20 | organization := "com.snowplowanalytics", 21 | version := "0.1.0", 22 | description := "A Spark Streaming job reading events from Amazon Kinesis and writing event counts to DynamoDB", 23 | scalaVersion := "2.10.4", 24 | scalacOptions := Seq("-deprecation", "-encoding", "utf8", 25 | "-feature", "-target:jvm-1.7"), 26 | scalacOptions in Test := Seq("-Yrangepos"), 27 | resolvers ++= Dependencies.resolutionRepos 28 | ) 29 | 30 | // Makes our SBT app settings available from within the app 31 | lazy val scalifySettings = Seq(sourceGenerators in Compile <+= (sourceManaged in Compile, version, name, organization) map { (d, v, n, o) => 32 | val file = d / "settings.scala" 33 | IO.write(file, """package com.snowplowanalytics.spark.streaming.generated 34 | |object Settings { 35 | | val organization = "%s" 36 | | val version = "%s" 37 | | val name = "%s" 38 | |} 39 | |""".stripMargin.format(o, v, n)) 40 | Seq(file) 41 | }) 42 | 43 | // sbt-assembly settings for building a fat jar 44 | import sbtassembly.Plugin._ 45 | import AssemblyKeys._ 46 | lazy val sbtAssemblySettings = assemblySettings ++ Seq( 47 | 48 | // Simpler jar name 49 | jarName in assembly := { 50 | name.value + "-" + version.value + ".jar" 51 | }, 52 | 53 | // Drop these jars 54 | excludedJars in assembly <<= (fullClasspath in assembly) map { cp => 55 | val excludes = Set( 56 | "junit-4.5.jar", // We shouldn't need JUnit 57 | "jsp-api-2.1-6.1.14.jar", 58 | "jsp-2.1-6.1.14.jar", 59 | "jasper-compiler-5.5.12.jar", 60 | "minlog-1.2.jar", // Otherwise causes conflicts with Kyro (which bundles it) 61 | "janino-2.5.16.jar", // Janino includes a broken signature, and is not needed anyway 62 | "commons-beanutils-core-1.8.0.jar", // Clash with each other and with commons-collections 63 | "commons-beanutils-1.7.0.jar", // " 64 | "hadoop-core-0.20.2.jar", // Provided by Amazon EMR. Delete this line if you're not on EMR 65 | "hadoop-tools-0.20.2.jar", 66 | "guava-14.0.1.jar", // conflict spark-network-common_2.10-1.3.0.jar 67 | "jcl-over-slf4j-1.7.10.jar", //conflict commons-logging-1.1.3.jar 68 | "hadoop-yarn-api-2.2.0.jar" 69 | ) 70 | cp filter { jar => excludes(jar.data.getName) } 71 | }, 72 | 73 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { 74 | (old) => { 75 | case x if x.contains("UnusedStubClass.class") => MergeStrategy.first 76 | case x if x.endsWith("project.clj") => MergeStrategy.discard // Leiningen build files 77 | case x if x.startsWith("META-INF") => MergeStrategy.discard // More bumf 78 | case x if x.endsWith(".html") => MergeStrategy.discard 79 | case x => old(x) 80 | } 81 | } 82 | ) 83 | 84 | lazy val buildSettings = basicSettings ++ scalifySettings ++ sbtAssemblySettings 85 | } -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 
7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | 14 | import sbt._ 15 | 16 | object Dependencies { 17 | val resolutionRepos = Seq( 18 | "Akka Repository" at "http://repo.akka.io/releases/", 19 | "Spray Repository" at "http://repo.spray.cc/" 20 | ) 21 | 22 | object V { 23 | // Java 24 | val awsSdk = "1.9.34" 25 | val awsKinesisConnectors = "1.1.1" 26 | // Scala 27 | val spark = "1.3.0" 28 | val argot = "1.0.3" 29 | // Add versions for your additional libraries here... 30 | // Scala (test) 31 | val specs2 = "1.13" 32 | val guava = "11.0.1" 33 | val json4s = "3.2.10" 34 | 35 | } 36 | 37 | object Libraries { 38 | // Java 39 | val awsSdk = "com.amazonaws" % "aws-java-sdk" % V.awsSdk 40 | val awsSdkCore = "com.amazonaws" % "aws-java-sdk-core" % V.awsSdk 41 | val awsKinesisConnectors = "com.amazonaws" % "amazon-kinesis-connectors" % V.awsKinesisConnectors 42 | 43 | // Scala 44 | val argot = "org.clapper" %% "argot" % V.argot 45 | val sparkCore = "org.apache.spark" %% "spark-core" % V.spark 46 | val sparkStreaming = "org.apache.spark" %% "spark-streaming" % V.spark 47 | val sparkStreamingKinesis = "org.apache.spark" %% "spark-streaming-kinesis-asl" % V.spark 48 | val json4s = "org.json4s" %% "json4s-jackson" % V.json4s 49 | 50 | // Scala (test only) 51 | val specs2 = "org.specs2" % "specs2_2.10" % V.specs2 % "test" 52 | val guava = "com.google.guava" % "guava" % V.guava % "test" 53 | 54 | // Add additional libraries from mvnrepository.com (SBT syntax) here... 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /project/SparkStreamingExampleProjectBuild.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object SparkStreamingExampleProjectBuild extends Build { 17 | 18 | import Dependencies._ 19 | import BuildSettings._ 20 | 21 | // Configure prompt to show current project 22 | override lazy val settings = super.settings :+ { 23 | shellPrompt := { s => Project.extract(s).currentProject.id + " > " } 24 | } 25 | 26 | // Define our project, with basic project information and library dependencies 27 | lazy val project = Project("spark-streaming-example-project", file(".")) 28 | .settings(buildSettings: _*) 29 | .settings( 30 | libraryDependencies ++= Seq( 31 | Libraries.awsSdk, 32 | Libraries.awsSdkCore, 33 | Libraries.awsKinesisConnectors, 34 | Libraries.argot, 35 | Libraries.sparkCore, 36 | Libraries.sparkStreaming, 37 | Libraries.sparkStreamingKinesis, 38 | Libraries.specs2 39 | ) 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.6 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark 14 | 15 | /** 16 | * Scala package object to hold types, 17 | * helper methods etc. 18 | * 19 | * See: 20 | * http://www.artima.com/scalazine/articles/package_objects.html 21 | */ 22 | package object streaming { 23 | 24 | // TODO: add any packages we need 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/SimpleEvent.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // Java 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | // json4s 20 | import org.json4s._ 21 | import org.json4s.jackson.JsonMethods._ 22 | 23 | // This project 24 | import storage.BucketingStrategy 25 | 26 | /** 27 | * Companion object for creating a SimpleEvent 28 | * from incoming JSON 29 | */ 30 | object SimpleEvent { 31 | 32 | private val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss") 33 | 34 | /** 35 | * Converts date string into Date object 36 | */ 37 | def convertStringToDate(dateString: String): Date = format.parse(dateString) 38 | 39 | /** 40 | * Converts Kinesis ByteArray of JSON data into SimpleEvent objects 41 | */ 42 | def fromJson(byteArray: Array[Byte]): SimpleEvent = { 43 | implicit val formats = DefaultFormats 44 | val newString = new String(byteArray, "UTF-8") 45 | val parsed = parse(newString) 46 | parsed.extract[SimpleEvent] 47 | } 48 | 49 | } 50 | 51 | /** 52 | * Simple Class demonstrating an EventType log consisting of: 53 | * 1. ISO 8601 DateTime Object that will be downsampled 54 | * (see BucketingStrategy.scala file for more details) 55 | * 2. A simple model of colors for this EventType: 56 | * 'Red','Orange','Yellow','Green', or 'Blue' 57 | * example log: {"timestamp": "2015-06-05T13:00:22.540374", "type": "Orange", "id": "018dd633-f4c3-4599-9b44-ebf71a1c519f"} 58 | */ 59 | case class SimpleEvent(id: String, timestamp: String, `type`: String) { 60 | 61 | // Convert timestamp into Time Bucket using Bucketing Strategy 62 | val bucket = BucketingStrategy.bucket(SimpleEvent.convertStringToDate(timestamp)) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/StreamingCounts.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // Spark 16 | 17 | import com.amazonaws.services.dynamodbv2.document.DynamoDB 18 | import org.apache.spark.SparkConf 19 | import org.apache.spark.streaming._ 20 | import org.apache.spark.streaming.kinesis.KinesisUtils 21 | 22 | // This project 23 | import storage.DynamoUtils 24 | import kinesis.{KinesisUtils => KU} 25 | 26 | /** 27 | * Core of the Spark Streaming Application 28 | * 1. Configuration information is brought in from StreamingCountsApp.scala 29 | * 2. Object sets up Kinesis, DynamoDB, CloudTrail connections 30 | * 3. 
Once connections are up, Spark StreamingCounts stream processing starts 31 | * AWS Kinesis -> Apache Spark Streaming -> AWS DynamoDB 32 | * Raw Data -> Stream Processing Data -> Stored in Database 33 | * 34 | * (More on Spark Streaming: https://spark.apache.org/docs/1.3.0/streaming-kinesis-integration.html) 35 | */ 36 | object StreamingCounts { 37 | 38 | /** 39 | * Private function to set up Spark Streaming 40 | * 41 | * @param config The configuration for our job using StreamingCountsConfig.scala 42 | */ 43 | private def setupSparkContext(config: StreamingCountsConfig): StreamingContext = { 44 | val streamingSparkContext = { 45 | val sparkConf = new SparkConf().setAppName(config.appName).setMaster(config.master) 46 | new StreamingContext(sparkConf, config.batchInterval) 47 | } 48 | streamingSparkContext 49 | } 50 | 51 | /** 52 | * Starts our processing of a single Kinesis stream. 53 | * Never ends. 54 | * 55 | * @param config The configuration for our job using StreamingCountsConfig.scala 56 | */ 57 | def execute(config: StreamingCountsConfig) { 58 | 59 | // setting up Spark Streaming connection to Kinesis 60 | val kinesisClient = KU.setupKinesisClientConnection(config.endpointUrl, config.awsProfile) 61 | require(kinesisClient != null, 62 | "No AWS credentials found. Please specify credentials using one of the methods specified " + 63 | "in http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html") 64 | 65 | // setting up Spark Streaming connection to DynamoDB 66 | lazy val dynamoConnection = DynamoUtils.setupDynamoClientConnection(config.awsProfile) 67 | 68 | val streamingSparkContext = setupSparkContext(config) 69 | val numShards = KU.getShardCount(kinesisClient, config.streamName) 70 | val sparkDStreams = (0 until numShards).map { i => 71 | KinesisUtils.createStream( 72 | ssc = streamingSparkContext, 73 | streamName = config.streamName, 74 | endpointUrl = config.endpointUrl, 75 | initialPositionInStream = config.initialPosition, 76 | checkpointInterval = config.batchInterval, 77 | storageLevel = config.storageLevel 78 | ) 79 | } 80 | 81 | // Map phase: union DStreams, derive events, determine bucket 82 | val bucketedEvents = streamingSparkContext 83 | .union(sparkDStreams) 84 | .map { bytes => 85 | val e = SimpleEvent.fromJson(bytes) 86 | (e.bucket, e.`type`) 87 | } 88 | 89 | // Reduce phase: group by key then by count 90 | val bucketedEventCounts = bucketedEvents 91 | .groupByKey 92 | .map { case (eventType, events) => 93 | val count = events.groupBy(identity).mapValues(_.size) 94 | (eventType, count) 95 | } 96 | 97 | // Iterate over each aggregate record and save the record into DynamoDB 98 | bucketedEventCounts.foreachRDD { rdd => 99 | rdd.foreach { case (bucket, aggregates) => 100 | aggregates.foreach { case (eventType, count) => 101 | DynamoUtils.setOrUpdateCount( 102 | dynamoConnection, 103 | config.tableName, 104 | bucket.toString, 105 | eventType, 106 | DynamoUtils.timeNow(), 107 | DynamoUtils.timeNow(), 108 | count.toInt 109 | ) 110 | } 111 | } 112 | } 113 | 114 | // Start Spark Streaming process 115 | streamingSparkContext.start() 116 | streamingSparkContext.awaitTermination() 117 | } 118 | } -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/StreamingCountsApp.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 
3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // Java 16 | import java.io.File 17 | import java.io.FileReader 18 | import java.util.Properties 19 | 20 | // AWS libs 21 | import com.amazonaws.auth.AWSCredentialsProvider 22 | import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream 23 | import com.amazonaws.services.kinesis.connectors.KinesisConnectorConfiguration 24 | 25 | // Config 26 | import com.typesafe.config.{Config, ConfigFactory} 27 | 28 | // Argot 29 | import org.clapper.argot._ 30 | 31 | // Spark 32 | import org.apache.spark.storage.StorageLevel 33 | import org.apache.spark.streaming.{Minutes, Seconds, Milliseconds} 34 | 35 | /** 36 | * The entry point class for the Spark Streaming Application. 37 | * 38 | * Usage: 39 | * 40 | * spark/bin/spark-submit --class com.snowplowanalytics.spark.streaming.StreamingCountsApp \ 41 | * --master local[2] \ 42 | * spark-streaming-example-project/target/scala-2.10/spark-streaming-example-project-0.1.0.jar \ 43 | * --config spark-streaming-example-project/src/main/resources/config.hocon.sample 44 | */ 45 | object StreamingCountsApp { 46 | 47 | def main(args: Array[String]) { 48 | 49 | // General bumf for our app 50 | val parser = new ArgotParser( 51 | programName = "generated", 52 | compactUsage = true, 53 | preUsage = Some("%s: Version %s. 
Copyright (c) 2015, %s.".format( 54 | generated.Settings.name, 55 | generated.Settings.version, 56 | generated.Settings.organization) 57 | ) 58 | ) 59 | 60 | // Optional config argument 61 | val config = parser.option[Config](List("config"), 62 | "filename", 63 | "Configuration file.") { 64 | (c, opt) => 65 | 66 | val file = new File(c) 67 | if (file.exists) { 68 | ConfigFactory.parseFile(file) 69 | } else { 70 | parser.usage("Configuration file \"%s\" does not exist".format(c)) 71 | ConfigFactory.empty() 72 | } 73 | } 74 | parser.parse(args) 75 | 76 | // read the config file if --config parameter is provided else fail 77 | val conf = config.value.getOrElse(throw new RuntimeException("--config argument must be provided")) 78 | 79 | // create Spark Streaming Config from hocon file in resource directory 80 | val scc = StreamingCountsConfig( 81 | region = conf.getConfig("kinesis").getString("region"), 82 | streamName = conf.getConfig("kinesis").getString("streamName"), 83 | checkpointInterval = Minutes(conf.getConfig("spark").getInt("checkpointInterval")), 84 | initialPosition = InitialPositionInStream.LATEST, 85 | storageLevel = StorageLevel.MEMORY_AND_DISK_2, 86 | appName = conf.getConfig("spark").getString("appName"), 87 | master = conf.getConfig("spark").getString("master"), 88 | batchInterval = Milliseconds(conf.getConfig("spark").getInt("batchInterval")), 89 | tableName = conf.getConfig("dynamodb").getString("tableName"), 90 | awsProfile = conf.getConfig("aws").getString("awsProfile") 91 | ) 92 | 93 | // start StreamingCounts application with config object 94 | StreamingCounts.execute(scc) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/StreamingCountsConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // AWS SDK 16 | import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream 17 | 18 | // Spark 19 | import org.apache.spark.storage.StorageLevel 20 | import org.apache.spark.streaming.Duration 21 | 22 | /** 23 | * Configuration object for our StreamingCounts job 24 | */ 25 | case class StreamingCountsConfig( 26 | region: String, 27 | streamName: String, 28 | checkpointInterval: Duration, 29 | initialPosition: InitialPositionInStream, 30 | storageLevel: StorageLevel, 31 | appName: String, 32 | master: String, 33 | batchInterval: Duration, 34 | tableName: String, 35 | awsProfile: String 36 | ) { 37 | 38 | /** 39 | * The Kinesis endpoint from the region. 
40 | */ 41 | val endpointUrl = s"https://kinesis.${region}.amazonaws.com" 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/kinesis/KinesisUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | package kinesis 15 | 16 | // AWS KCL 17 | import com.amazonaws.auth.profile.ProfileCredentialsProvider 18 | import com.amazonaws.services.kinesis.AmazonKinesisClient 19 | 20 | 21 | object KinesisUtils { 22 | 23 | /** 24 | * Queries the given Kinesis stream and determines how many shards it 25 | * contains, so that one DStream can be created per shard and the sharded 26 | * data can be unioned for processing by the Spark Streaming application 27 | * (see StreamingCounts.execute). 28 | * 29 | * @param kinesisClient AWS Kinesis client 30 | * @param stream Kinesis stream name 31 | * @return the number of shards in the stream 32 | */ 33 | def getShardCount(kinesisClient: AmazonKinesisClient, stream: String): Int = 34 | kinesisClient 35 | .describeStream(stream) 36 | .getStreamDescription 37 | .getShards 38 | .size 39 | 40 | /** 41 | * Finds AWS credentials for the provided awsProfile and creates a Kinesis client 42 | */ 43 | def setupKinesisClientConnection(endpointUrl: String, awsProfile: String): AmazonKinesisClient = { 44 | val credentials = new ProfileCredentialsProvider(awsProfile) 45 | val akc = new AmazonKinesisClient(credentials) 46 | akc.setEndpoint(endpointUrl) 47 | akc 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/storage/BucketingStrategy.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | package storage 15 | 16 | // Java 17 | import java.util.Date 18 | import java.text.SimpleDateFormat 19 | 20 | /** 21 | * Object uses downsampling method to create metadata from each 22 | * EventType log record. Parsing the ISO 8601 23 | * datetime stamp to the minute means downsampling aka reducing 24 | * precision. 25 | * 26 | * Bucketing 27 | * A family of aggregations that build buckets, where each bucket 28 | * is associated with a key and an EventType criterion. When the 29 | * aggregation is executed, all the buckets criteria are evaluated 30 | * on every EventType in the context and when a criterion matches, 31 | * the EventType is considered to "fall in" the relevant bucket. 32 | * By the end of the aggregation process, we’ll end up with a 33 | * list of buckets - each one with a set of EventTypes that 34 | * "belong" to it. 35 | * 36 | */ 37 | object BucketingStrategy { 38 | 39 | private val BucketToMinuteFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:00.000") 40 | 41 | /** 42 | * Function to bucket a date based on 43 | * our bucketing strategy. Bucketing 44 | * means downsampling aka reducing 45 | * precision. 46 | * 47 | * @param date The Java Date to bucket 48 | * @return the downsampled date in String 49 | * format 50 | */ 51 | def bucket(date: Date): String = 52 | BucketToMinuteFormatter.format(date) 53 | } -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/storage/DynamoUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | package storage 15 | 16 | // Java 17 | import java.util.Date 18 | import java.util.TimeZone 19 | import java.text.SimpleDateFormat 20 | 21 | 22 | // AWS Authentication 23 | // http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html 24 | import com.amazonaws.auth.profile.ProfileCredentialsProvider 25 | 26 | // AWS DynamoDB 27 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient 28 | import com.amazonaws.services.dynamodbv2.document.{AttributeUpdate, DynamoDB, Item} 29 | 30 | /** 31 | * Object sets up singleton that finds AWS credentials for DynamoDB to access the 32 | * aggregation records table. The utility function below puts items into the 33 | * "AggregateRecords" table. 
34 | */ 35 | object DynamoUtils { 36 | 37 | val dateFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") 38 | val timezone = TimeZone.getTimeZone("UTC") 39 | 40 | /** 41 | * Function timezone helper 42 | */ 43 | def timeNow(): String = { 44 | dateFormatter.setTimeZone(timezone) 45 | dateFormatter.format(new Date()) 46 | } 47 | 48 | 49 | /** 50 | * Function wraps DynamoDB cred setup 51 | */ 52 | def setupDynamoClientConnection(awsProfile: String): DynamoDB = { 53 | val credentials = new ProfileCredentialsProvider(awsProfile) 54 | val dynamoDB = new DynamoDB(new AmazonDynamoDBClient(credentials)) 55 | dynamoDB 56 | } 57 | 58 | 59 | /** 60 | * Function wraps get or create item in DynamoDB table 61 | */ 62 | def setOrUpdateCount(dynamoDB: DynamoDB, tableName: String, bucketStart: String, eventType: String, createdAt: String, updatedAt: String, count: Int){ 63 | 64 | val recordInTable = getItem(dynamoDB: DynamoDB, tableName, bucketStart, eventType) 65 | println(recordInTable) 66 | if (recordInTable == null) { 67 | DynamoUtils.putItem(dynamoDB: DynamoDB, tableName, bucketStart, eventType, createdAt, updatedAt, count) 68 | } else { 69 | val oldCreatedAt = recordInTable.getJSON("CreatedAt").replace("\"", "").replace("\\", "") 70 | val oldCount = recordInTable.getJSON("Count").toInt 71 | val newCount = oldCount + count.toInt 72 | DynamoUtils.putItem(dynamoDB: DynamoDB, tableName, bucketStart, eventType, oldCreatedAt, updatedAt, newCount) 73 | } 74 | } 75 | 76 | 77 | /** 78 | * Function wraps AWS Java getItemOutcome operation to DynamoDB table 79 | */ 80 | def getItem(dynamoDB: DynamoDB, tableName: String, bucketStart: String, eventType: String): Item = { 81 | 82 | val table = dynamoDB.getTable(tableName) 83 | val items = table.getItemOutcome("BucketStart", bucketStart, "EventType", eventType) 84 | items.getItem 85 | } 86 | 87 | 88 | /** 89 | * Function wraps AWS Java putItem operation to DynamoDB table 90 | */ 91 | def putItem(dynamoDB: DynamoDB, tableName: String, bucketStart: String, eventType: String, createdAt: String, updatedAt: String, count: Int) { 92 | 93 | // AggregateRecords column names 94 | val tablePrimaryKeyName = "BucketStart" 95 | val tableEventTypeSecondaryKeyName = "EventType" 96 | val tableCreatedAtColumnName = "CreatedAt" 97 | val tableUpdatedAtColumnName = "UpdatedAt" 98 | val tableCountColumnName = "Count" 99 | 100 | try { 101 | val time = new Date().getTime - (1 * 24 * 60 * 60 * 1000) 102 | val date = new Date() 103 | date.setTime(time) 104 | dateFormatter.setTimeZone(TimeZone.getTimeZone("UTC")) 105 | val table = dynamoDB.getTable(tableName) 106 | println("Adding data to " + tableName) 107 | 108 | val item = new Item().withPrimaryKey(tablePrimaryKeyName, bucketStart) 109 | .withString(tableEventTypeSecondaryKeyName, eventType) 110 | .withString(tableCreatedAtColumnName, createdAt) 111 | .withString(tableUpdatedAtColumnName, updatedAt) 112 | .withInt(tableCountColumnName, count) 113 | 114 | // saving the data to DynamoDB AggregrateRecords table 115 | // println(item) 116 | table.putItem(item) 117 | } catch { 118 | case e: Exception => { 119 | System.err.println("Failed to create item in " + tableName) 120 | System.err.println(e.getMessage) 121 | } 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 
2 | #
3 | # This program is licensed to you under the Apache License Version 2.0,
4 | # and you may not use this file except in compliance with the Apache License Version 2.0.
5 | # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6 | #
7 | # Unless required by applicable law or agreed to in writing,
8 | # software distributed under the Apache License Version 2.0 is distributed on an
9 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11 | 
12 | import datetime, json, uuid, time
13 | from functools import partial
14 | from random import choice
15 | 
16 | from invoke import run, task
17 | 
18 | import boto
19 | from boto import kinesis
20 | import boto.dynamodb2
21 | from boto.dynamodb2.fields import HashKey, RangeKey, KeysOnlyIndex, GlobalAllIndex
22 | from boto.dynamodb2.table import Table
23 | from boto.dynamodb2.types import NUMBER
24 | 
25 | 
26 | JAR_FILE = "spark-streaming-example-project-0.1.0.jar"
27 | 
28 | # Selection of EventType values
29 | COLORS = ['Red','Orange','Yellow','Green','Blue']
30 | 
31 | 
32 | # DynamoDB settings
33 | THROUGHPUT_READ = 20
34 | THROUGHPUT_WRITE = 20
35 | 
36 | 
37 | # AWS Kinesis Data Generator
38 | def picker(seq):
39 |     """
40 |     Returns a new function that can be called without arguments
41 |     to select and return a random element of the given sequence
42 |     """
43 |     return partial(choice, seq)
44 | 
45 | def create_event():
46 |     """
47 |     Picks a random color and builds an event payload around it
48 |     """
49 |     event_id = str(uuid.uuid4())
50 |     color_choice = picker(COLORS)
51 | 
52 |     return (event_id, {
53 |         "id": event_id,
54 |         "timestamp": datetime.datetime.now().isoformat(),
55 |         "type": color_choice()
56 |     })
57 | 
58 | def write_event(conn, stream_name):
59 |     """
60 |     Writes a single event to the Kinesis stream and returns its JSON payload
61 |     """
62 |     event_id, event_payload = create_event()
63 |     event_json = json.dumps(event_payload)
64 |     conn.put_record(stream_name, event_json, event_id)
65 |     return event_json
66 | 
67 | 
68 | @task
69 | def generate_events(profile, region, stream):
70 |     """
71 |     Load demo SimpleEvent data into Kinesis with this Python generator script
72 |     """
73 |     conn = kinesis.connect_to_region(region, profile_name=profile)
74 |     while True:
75 |         event_json = write_event(conn, stream)
76 |         print "Event sent to Kinesis: {}".format(event_json)
77 |         #time.sleep(5)
78 | 
79 | @task
80 | def build_spark():
81 |     """
82 |     Compile and assemble Apache Spark with Kinesis support
83 |     """
84 |     run("rm -rf master.zip spark-master", pty=True)
85 |     run("wget https://github.com/apache/spark/archive/master.zip")
86 |     run("unzip master.zip", pty=True)
87 |     run('export MAVEN_OPTS="-Xmx1g -XX:MaxPermSize=256M -XX:ReservedCodeCacheSize=256m" && cd spark-master && mvn -Pkinesis-asl -DskipTests clean package', pty=True)
88 | 
89 | 
90 | @task
91 | def build_project():
92 |     """
93 |     Build spark-streaming-example-project
94 |     and package it into a "fat jar" ready for spark-submit
95 |     """
96 |     run("sbt assembly", pty=True)
97 | 
98 | 
99 | @task
100 | def create_profile(profile):
101 |     """
102 |     Create an AWS credentials profile via aws configure
103 |     """
104 |     run("aws configure --profile {}".format(profile), pty=True)
105 | 
106 | 
107 | @task
108 | def create_dynamodb_table(profile, region, table):
109 |     """
110 |     Create the DynamoDB table with the Boto library
111 |     """
112 | 
113 |     connection = boto.dynamodb2.connect_to_region(region, profile_name=profile)
114 |     aggregate = Table.create(table,
115 |         schema=[
116 |             HashKey("BucketStart"),
117 |             RangeKey("EventType"),
118 |         ],
119 |         throughput={
120 |             'read': THROUGHPUT_READ,
121 |             'write': THROUGHPUT_WRITE
122 |         },
123 |         connection=connection
124 |     )
125 | 
126 | 
127 | @task
128 | def create_kinesis_stream(profile, stream):
129 |     """
130 |     Create the Kinesis stream
131 |     """
132 | 
133 |     # TODO: switch to use boto
134 |     run("aws kinesis create-stream --stream-name {} --shard-count 1 --profile {}".format(stream, profile), pty=True)
135 | 
136 | 
137 | @task
138 | def describe_kinesis_stream(profile, stream):
139 |     """
140 |     Show the status of the named Kinesis stream
141 |     """
142 | 
143 |     # TODO: switch to use boto
144 |     run("aws kinesis describe-stream --stream-name {} --profile {}".format(stream, profile), pty=True)
145 | 
146 | 
147 | @task
148 | def run_project(config_path):
149 |     """
150 |     Submits the compiled "fat jar" to Apache Spark and
151 |     starts Spark Streaming based on the project settings
152 |     """
153 |     run("./spark-master/bin/spark-submit \
154 |         --class com.snowplowanalytics.spark.streaming.StreamingCountsApp \
155 |         --master local[4] \
156 |         ./target/scala-2.10/{} \
157 |         --config {}".format(JAR_FILE, config_path),
158 |         pty=True)
159 | 
--------------------------------------------------------------------------------
/vagrant/.gitignore:
--------------------------------------------------------------------------------
1 | .peru
2 | oss-playbooks
3 | ansible
--------------------------------------------------------------------------------
/vagrant/ansible.hosts:
--------------------------------------------------------------------------------
1 | [vagrant]
2 | 127.0.0.1:2222
--------------------------------------------------------------------------------
/vagrant/peru.yaml:
--------------------------------------------------------------------------------
1 | imports:
2 |   ansible: ansible
3 |   ansible_playbooks: oss-playbooks
4 | 
5 | curl module ansible:
6 |   # Equivalent of git cloning tags/v1.6.6 but much, much faster
7 |   url: https://codeload.github.com/ansible/ansible/zip/69d85c22c7475ccf8169b6ec9dee3ee28c92a314
8 |   unpack: zip
9 |   export: ansible-69d85c22c7475ccf8169b6ec9dee3ee28c92a314
10 | 
11 | git module ansible_playbooks:
12 |   url: https://github.com/snowplow/ansible-playbooks.git
13 |   # Uncomment to fetch a specific rev instead of master:
14 |   # rev: xxx
--------------------------------------------------------------------------------
/vagrant/up.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | 
4 | vagrant_dir=/vagrant/vagrant
5 | bashrc=/home/vagrant/.bashrc
6 | 
7 | echo "========================================"
8 | echo "INSTALLING PERU AND ANSIBLE DEPENDENCIES"
9 | echo "----------------------------------------"
10 | apt-get update
11 | apt-get install -y language-pack-en git unzip libyaml-dev python3-pip python-yaml python-paramiko python-jinja2
12 | 
13 | echo "==============="
14 | echo "INSTALLING PERU"
15 | echo "---------------"
16 | sudo pip3 install peru
17 | 
18 | echo "======================================="
19 | echo "CLONING ANSIBLE AND PLAYBOOKS WITH PERU"
20 | echo "---------------------------------------"
21 | cd ${vagrant_dir} && peru sync -v
22 | echo "... done"
23 | 
24 | env_setup=${vagrant_dir}/ansible/hacking/env-setup
25 | hosts=${vagrant_dir}/ansible.hosts
26 | 
27 | echo "==================="
28 | echo "CONFIGURING ANSIBLE"
29 | echo "-------------------"
30 | touch ${bashrc}
31 | echo "source ${env_setup}" >> ${bashrc}
32 | echo "export ANSIBLE_HOSTS=${hosts}" >> ${bashrc}
33 | echo "... done"
34 | 
35 | echo "=========================================="
36 | echo "RUNNING PLAYBOOKS WITH ANSIBLE*"
37 | echo "* no output while each playbook is running"
38 | echo "------------------------------------------"
39 | while read pb; do
40 |   su - -c "source ${env_setup} && ${vagrant_dir}/ansible/bin/ansible-playbook ${vagrant_dir}/${pb} --connection=local --inventory-file=${hosts}" vagrant
41 | done <${vagrant_dir}/up.playbooks
42 | 
43 | guidance=${vagrant_dir}/up.guidance
44 | 
45 | if [ -f ${guidance} ]; then
46 |   echo "==========="
47 |   echo "PLEASE READ"
48 |   echo "-----------"
49 |   cat $guidance
50 | fi
51 | 
--------------------------------------------------------------------------------
/vagrant/up.guidance:
--------------------------------------------------------------------------------
1 | To get started:
2 | vagrant ssh
3 | cd /vagrant
4 | sbt test
--------------------------------------------------------------------------------
/vagrant/up.playbooks:
--------------------------------------------------------------------------------
1 | oss-playbooks/aws-cli-and-psql.yml
2 | oss-playbooks/java7.yml
3 | oss-playbooks/scala.yml
4 | oss-playbooks/sbt.yml
5 | oss-playbooks/invoke.yml
6 | 
--------------------------------------------------------------------------------
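
Usage sketch: the short Scala snippet below is not a file in this repository; it is a
minimal, illustrative example of how the storage helpers above are assumed to fit
together. An event's timestamp is downsampled to a minute bucket with
BucketingStrategy.bucket, and the per-EventType count for that bucket is upserted with
DynamoUtils.setOrUpdateCount. The AWS profile name, table name and event type used here
are placeholders; in the real streaming job these values would come from the project's
configuration (see config.hocon.sample).

import java.util.Date

import com.snowplowanalytics.spark.streaming.storage.{BucketingStrategy, DynamoUtils}

object StorageUsageSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder AWS profile name, for illustration only
    val dynamoDB    = DynamoUtils.setupDynamoClientConnection("default")
    val bucketStart = BucketingStrategy.bucket(new Date()) // e.g. "2015-06-10T14:23:00.000"
    val now         = DynamoUtils.timeNow()

    // Creates the ("BucketStart", "EventType") item if absent,
    // otherwise adds 1 to its existing Count
    DynamoUtils.setOrUpdateCount(dynamoDB, "AggregateRecords", bucketStart, "Green", now, now, 1)
  }
}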