├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE-2.0.txt ├── README.md ├── Vagrantfile ├── config ├── .gitignore └── config.hocon.sample ├── docs ├── dynamodb-table-image.png └── spark-ui-image.png ├── project ├── BuildSettings.scala ├── Dependencies.scala ├── SparkStreamingExampleProjectBuild.scala ├── build.properties └── plugins.sbt ├── src └── main │ └── scala │ └── com.snowplowanalytics.spark │ ├── package.scala │ └── streaming │ ├── SimpleEvent.scala │ ├── StreamingCounts.scala │ ├── StreamingCountsApp.scala │ ├── StreamingCountsConfig.scala │ ├── kinesis │ └── KinesisUtils.scala │ └── storage │ ├── BucketingStrategy.scala │ └── DynamoUtils.scala ├── tasks.py └── vagrant ├── .gitignore ├── ansible.hosts ├── peru.yaml ├── up.bash ├── up.guidance └── up.playbooks /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Python 16 | __pycache__/ 17 | *.py[cod] 18 | 19 | # Vagrant 20 | .vagrant 21 | 22 | *.class 23 | *.log 24 | 25 | # Scala-IDE specific 26 | .scala_dependencies 27 | .worksheet 28 | 29 | # Spark build 30 | master.zip 31 | spark-master/ 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | jdk: 5 | - oraclejdk7 6 | - openjdk6 7 | - openjdk7 8 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | Version 0.1.0 (2015-06-10) 2 | -------------------------- 3 | Initial release -------------------------------------------------------------------------------- /LICENSE-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Streaming Example Project 2 | 3 | [ ![Build Status] [travis-image] ] [travis] [ ![Release] [release-image] ] [releases] [ ![License] [license-image] ] [license] 4 | 5 | ## Introduction 6 | 7 | This is a simple time series analysis stream processing job ([introductory blog post] [blog-post]) written in Scala for the [Spark Streaming] [spark-streaming] cluster computing platform, processing JSON events from [Amazon Kinesis] [kinesis] and writing aggregates to [Amazon DynamoDB] [dynamodb]. 8 | 9 | This was built by the Data Science team at [Snowplow Analytics] [snowplow], who use Spark Streaming in their client projects. 10 | 11 | **Running this requires an Amazon AWS account, and it will incur charges.** 12 | 13 | _See also:_ [Spark Example Project] [spark-example-project] | [AWS Lambda Example Project] [aws-lambda-example-project] 14 | 15 | ## Overview 16 | 17 | We have implemented a super-simple analytics-on-write stream processing job using Spark Streaming. Our Spark Streaming job reads a Kinesis stream containing events in a JSON format: 18 | 19 | ```json 20 | { 21 | "timestamp": "2015-06-05T12:54:43.064528", 22 | "type": "Green", 23 | "id": "4ec80fb1-0963-4e35-8f54-ce760499d974" 24 | } 25 | ``` 26 | 27 | Our job counts the events by `type` and aggregates these counts into 1 minute buckets. The job then takes these aggregates and saves them into a table in DynamoDB: 28 | 29 | ![dynamodb-table-image][dynamodb-table-image] 30 | 31 | ## Developer Quickstart 32 | 33 | Assuming git, [Vagrant] [vagrant-install] and [VirtualBox] [virtualbox-install] installed: 34 | 35 | ```bash 36 | host$ git clone https://github.com/snowplow/spark-streaming-example-project.git 37 | host$ cd spark-streaming-example-project 38 | host$ vagrant up && vagrant ssh 39 | guest$ cd /vagrant 40 | guest$ sbt compile 41 | ``` 42 | 43 | ## Tutorial 44 | 45 | You can follow along in [the release blog post] [blog-post] to get the project up and running yourself. 46 | 47 | The below steps assume that you are running inside Vagrant, as per the Developer Quickstart above. 48 | 49 | ### 1. Setting up AWS 50 | 51 | First we need to configure a default AWS profile: 52 | 53 | ```bash 54 | $ aws configure 55 | AWS Access Key ID [None]: ... 56 | AWS Secret Access Key [None]: ... 57 | Default region name [None]: us-east-1 58 | Default output format [None]: json 59 | ``` 60 | 61 | Now we create our Kinesis event stream: 62 | 63 | ```bash 64 | $ inv create_kinesis_stream default my-stream 65 | ``` 66 | 67 | Wait a minute and then: 68 | 69 | ```bash 70 | $ inv describe_kinesis_stream default my-stream 71 | { 72 | "StreamDescription": { 73 | "StreamStatus": "ACTIVE", 74 | "StreamName": "my-stream", 75 | "StreamARN": "arn:aws:kinesis:us-east-1:719197435995:stream/my-stream", 76 | "Shards": [ 77 | { 78 | "ShardId": "shardId-000000000000", 79 | "HashKeyRange": { 80 | "EndingHashKey": "340282366920938463463374607431768211455", 81 | "StartingHashKey": "0" 82 | }, 83 | "SequenceNumberRange": { 84 | "StartingSequenceNumber": "49551350243544458458477304430170758137221526998466166786" 85 | } 86 | } 87 | ] 88 | } 89 | } 90 | ``` 91 | 92 | If the Kinesis response says that the stream is still being created, wait a minute and then try again. 
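If you would rather wait programmatically, the short sketch below polls the stream status using boto, the same library `tasks.py` uses. Note this helper is not part of the project's task file — it simply re-runs the same describe call (with the profile, region and stream names used above) until the stream reports `ACTIVE`:

```python
# Hypothetical helper, not included in tasks.py: poll the Kinesis stream
# until it becomes ACTIVE, mirroring the connection set up in generate_events.
import time
from boto import kinesis

conn = kinesis.connect_to_region("us-east-1", profile_name="default")
while True:
    description = conn.describe_stream("my-stream")["StreamDescription"]
    print "Stream status: {}".format(description["StreamStatus"])
    if description["StreamStatus"] == "ACTIVE":
        break
    time.sleep(10)
```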
93 | 94 | Now create our DynamoDB table: 95 | 96 | ```bash 97 | $ inv create_dynamodb_table default us-east-1 my-table 98 | ``` 99 | 100 | ### 2. Sending events to Kinesis 101 | 102 | We need to start sending events to our new Kinesis stream. We have created a helper method to do this - run the below and leave it running: 103 | 104 | ```bash 105 | $ inv generate_events default us-east-1 my-stream 106 | Event sent to Kinesis: {"timestamp": "2015-06-05T12:54:43.064528", "type": "Green", "id": "4ec80fb1-0963-4e35-8f54-ce760499d974"} 107 | Event sent to Kinesis: {"timestamp": "2015-06-05T12:54:43.757797", "type": "Red", "id": "eb84b0d1-f793-4213-8a65-2fb09eab8c5c"} 108 | Event sent to Kinesis: {"timestamp": "2015-06-05T12:54:44.295972", "type": "Yellow", "id": "4654bdc8-86d4-44a3-9920-fee7939e2582"} 109 | ... 110 | ``` 111 | 112 | Now open up a separate terminal for the rest of the setup. 113 | 114 | ### 3. Running our job on Spark Streaming 115 | 116 | First we need to build Spark Streaming with Kinesis support. This can take up to 90 minutes: 117 | 118 | ```bash 119 | $ inv build_spark 120 | ... 121 | [INFO] Spark Kinesis Integration ......................... SUCCESS [1:11.115s] 122 | ... 123 | [INFO] ------------------------------------------------------------------------ 124 | [INFO] BUILD SUCCESS 125 | [INFO] ------------------------------------------------------------------------ 126 | [INFO] Total time: 1:29:00.686s 127 | [INFO] Finished at: Sun Jun 07 00:32:09 UTC 2015 128 | [INFO] Final Memory: 94M/665M 129 | [INFO] ------------------------------------------------------------------------ 130 | ``` 131 | 132 | Now we build our application. This should take closer to 10 minutes: 133 | 134 | ```bash 135 | $ inv build_project 136 | ... 137 | ``` 138 | 139 | Finally we can submit our job to Spark with this command: 140 | 141 | ```bash 142 | $ inv run_project config/config.hocon.sample 143 | ... 144 | ``` 145 | 146 | If you have updated any of the configuration options above (e.g. stream name or region), then you will have to update the `config.hocon.sample` file accordingly. 147 | 148 | ### 4. Monitoring your job 149 | 150 | First review the spooling output of the `run_project` command above - it's very verbose, but if you don't see any Java stack traces in there, then Spark Streaming should be running okay. 151 | 152 | Now head over to your host machine's [localhost:4040] [localhost-4040] and you should see something like this: 153 | 154 | ![spark-ui-image][spark-ui-image] 155 | 156 | You can see how our Spark Streaming job _discretizes_ the Kinesis event stream into 2-second-duration "micro-batches", which are each then processed as a discrete Spark job. 157 | 158 | Finally, let's check the data in our DynamoDB table. Make sure you are in the correct AWS region, then click on `my-table` and hit the `Explore Table` button: 159 | 160 | ![dynamodb-table-image][dynamodb-table-image] 161 | 162 | For each **BucketStart** and **EventType** pair, we see a **Count**, plus some **CreatedAt** and **UpdatedAt** metadata for debugging purposes. Our bucket size is 1 minute, and we have 5 discrete event types, hence the matrix of rows that we see. 163 | 164 | ## Roadmap 165 | 166 | * Porting this job to [AWS Lambda] [aws-lambda-example-project] 167 | * Various improvements for the [0.2.0 release] [020-milestone] 168 | * Expanding our analytics-on-write thinking into our new [Icebucket] [icebucket] project 169 | 170 | ## Copyright and license 171 | 172 | Copyright 2015 Snowplow Analytics Ltd. 
173 | 174 | Licensed under the [Apache License, Version 2.0] [license] (the "License"); 175 | you may not use this software except in compliance with the License. 176 | 177 | Unless required by applicable law or agreed to in writing, software 178 | distributed under the License is distributed on an "AS IS" BASIS, 179 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 180 | See the License for the specific language governing permissions and 181 | limitations under the License. 182 | 183 | [travis]: https://travis-ci.org/snowplow/spark-streaming-example-project 184 | [travis-image]: https://travis-ci.org/snowplow/spark-streaming-example-project.png?branch=master 185 | [license-image]: http://img.shields.io/badge/license-Apache--2-blue.svg?style=flat 186 | [license]: http://www.apache.org/licenses/LICENSE-2.0 187 | [release-image]: http://img.shields.io/badge/release-0.1.0-blue.svg?style=flat 188 | [releases]: https://github.com/snowplow/spark-streaming-example-project/releases 189 | 190 | [blog-post]: http://snowplowanalytics.com/blog/2015/06/10/spark-streaming-example-project-0.1.0-released/ 191 | 192 | [dynamodb-table-image]: /docs/dynamodb-table-image.png?raw=true 193 | [spark-ui-image]: /docs/spark-ui-image.png?raw=true 194 | 195 | [spark-streaming]: https://spark.apache.org/streaming/ 196 | [kinesis]: http://aws.amazon.com/kinesis 197 | [dynamodb]: http://aws.amazon.com/dynamodb 198 | [snowplow]: http://snowplowanalytics.com 199 | [icebucket]: https://github.com/snowplow/icebucket 200 | 201 | [vagrant-install]: http://docs.vagrantup.com/v2/installation/index.html 202 | [virtualbox-install]: https://www.virtualbox.org/wiki/Downloads 203 | 204 | [spark-example-project]: https://github.com/snowplow/spark-example-project 205 | [aws-lambda-example-project]: https://github.com/snowplow/aws-lambda-example-project 206 | 207 | [localhost-4040]: http://localhost:4040/ 208 | 209 | [020-milestone]: https://github.com/snowplow/spark-streaming-example-project/milestones/Version%200.2.0 210 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | Vagrant.configure("2") do |config| 2 | 3 | config.vm.box = "ubuntu/trusty64" 4 | config.vm.hostname = "spark-streaming-example-project" 5 | config.ssh.forward_agent = true 6 | 7 | # Forward guest port 4040 to host port 4040 (for Spark web UI) 8 | config.vm.network "forwarded_port", guest: 4040, host: 4040 9 | 10 | config.vm.provider :virtualbox do |vb| 11 | vb.name = Dir.pwd().split("/")[-1] + "-" + Time.now.to_f.to_i.to_s 12 | vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] 13 | vb.customize [ "guestproperty", "set", :id, "--timesync-threshold", 10000 ] 14 | # Scala is memory-hungry 15 | vb.memory = 8000 16 | end 17 | 18 | config.vm.provision :shell do |sh| 19 | sh.path = "vagrant/up.bash" 20 | end 21 | 22 | end 23 | -------------------------------------------------------------------------------- /config/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !config.hocon.sample 3 | -------------------------------------------------------------------------------- /config/config.hocon.sample: -------------------------------------------------------------------------------- 1 | ################################### 2 | # Sample configuration for # 3 | # spark-streaming-example-project # 4 | ################################### 5 | 6 | 7 | kinesis { 8 | 9 | 
streamName: "my-stream" 10 | 11 | region: "us-east-1" 12 | 13 | } 14 | 15 | 16 | spark { 17 | 18 | appName: "StreamingCountsApp" 19 | 20 | checkpointInterval: 10 # Secs 21 | 22 | master: "local[2]" # At least 2 threads 23 | 24 | batchInterval: 2000 # Ms 25 | 26 | } 27 | 28 | 29 | dynamodb { 30 | 31 | tableName: "my-table" 32 | 33 | } 34 | 35 | 36 | aws { 37 | 38 | awsProfile: "default" 39 | 40 | } -------------------------------------------------------------------------------- /docs/dynamodb-table-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snowplow-archive/spark-streaming-example-project/836eefb768c83663cb0c4ead27f08cfaeec0e352/docs/dynamodb-table-image.png -------------------------------------------------------------------------------- /docs/spark-ui-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snowplow-archive/spark-streaming-example-project/836eefb768c83663cb0c4ead27f08cfaeec0e352/docs/spark-ui-image.png -------------------------------------------------------------------------------- /project/BuildSettings.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object BuildSettings { 17 | 18 | // Basic settings for our app 19 | lazy val basicSettings = Seq[Setting[_]]( 20 | organization := "com.snowplowanalytics", 21 | version := "0.1.0", 22 | description := "A Spark Streaming job reading events from Amazon Kinesis and writing event counts to DynamoDB", 23 | scalaVersion := "2.10.4", 24 | scalacOptions := Seq("-deprecation", "-encoding", "utf8", 25 | "-feature", "-target:jvm-1.7"), 26 | scalacOptions in Test := Seq("-Yrangepos"), 27 | resolvers ++= Dependencies.resolutionRepos 28 | ) 29 | 30 | // Makes our SBT app settings available from within the app 31 | lazy val scalifySettings = Seq(sourceGenerators in Compile <+= (sourceManaged in Compile, version, name, organization) map { (d, v, n, o) => 32 | val file = d / "settings.scala" 33 | IO.write(file, """package com.snowplowanalytics.spark.streaming.generated 34 | |object Settings { 35 | | val organization = "%s" 36 | | val version = "%s" 37 | | val name = "%s" 38 | |} 39 | |""".stripMargin.format(o, v, n)) 40 | Seq(file) 41 | }) 42 | 43 | // sbt-assembly settings for building a fat jar 44 | import sbtassembly.Plugin._ 45 | import AssemblyKeys._ 46 | lazy val sbtAssemblySettings = assemblySettings ++ Seq( 47 | 48 | // Simpler jar name 49 | jarName in assembly := { 50 | name.value + "-" + version.value + ".jar" 51 | }, 52 | 53 | // Drop these jars 54 | excludedJars in assembly <<= (fullClasspath in assembly) map { cp => 55 | val excludes = Set( 56 | "junit-4.5.jar", // We shouldn't need JUnit 57 | "jsp-api-2.1-6.1.14.jar", 58 | "jsp-2.1-6.1.14.jar", 59 | "jasper-compiler-5.5.12.jar", 60 | "minlog-1.2.jar", // Otherwise causes conflicts with Kyro (which bundles it) 61 | "janino-2.5.16.jar", // Janino includes a broken signature, and is not needed anyway 62 | "commons-beanutils-core-1.8.0.jar", // Clash with each other and with commons-collections 63 | "commons-beanutils-1.7.0.jar", // " 64 | "hadoop-core-0.20.2.jar", // Provided by Amazon EMR. Delete this line if you're not on EMR 65 | "hadoop-tools-0.20.2.jar", 66 | "guava-14.0.1.jar", // conflict spark-network-common_2.10-1.3.0.jar 67 | "jcl-over-slf4j-1.7.10.jar", //conflict commons-logging-1.1.3.jar 68 | "hadoop-yarn-api-2.2.0.jar" 69 | ) 70 | cp filter { jar => excludes(jar.data.getName) } 71 | }, 72 | 73 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { 74 | (old) => { 75 | case x if x.contains("UnusedStubClass.class") => MergeStrategy.first 76 | case x if x.endsWith("project.clj") => MergeStrategy.discard // Leiningen build files 77 | case x if x.startsWith("META-INF") => MergeStrategy.discard // More bumf 78 | case x if x.endsWith(".html") => MergeStrategy.discard 79 | case x => old(x) 80 | } 81 | } 82 | ) 83 | 84 | lazy val buildSettings = basicSettings ++ scalifySettings ++ sbtAssemblySettings 85 | } -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 
7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | 14 | import sbt._ 15 | 16 | object Dependencies { 17 | val resolutionRepos = Seq( 18 | "Akka Repository" at "http://repo.akka.io/releases/", 19 | "Spray Repository" at "http://repo.spray.cc/" 20 | ) 21 | 22 | object V { 23 | // Java 24 | val awsSdk = "1.9.34" 25 | val awsKinesisConnectors = "1.1.1" 26 | // Scala 27 | val spark = "1.3.0" 28 | val argot = "1.0.3" 29 | // Add versions for your additional libraries here... 30 | // Scala (test) 31 | val specs2 = "1.13" 32 | val guava = "11.0.1" 33 | val json4s = "3.2.10" 34 | 35 | } 36 | 37 | object Libraries { 38 | // Java 39 | val awsSdk = "com.amazonaws" % "aws-java-sdk" % V.awsSdk 40 | val awsSdkCore = "com.amazonaws" % "aws-java-sdk-core" % V.awsSdk 41 | val awsKinesisConnectors = "com.amazonaws" % "amazon-kinesis-connectors" % V.awsKinesisConnectors 42 | 43 | // Scala 44 | val argot = "org.clapper" %% "argot" % V.argot 45 | val sparkCore = "org.apache.spark" %% "spark-core" % V.spark 46 | val sparkStreaming = "org.apache.spark" %% "spark-streaming" % V.spark 47 | val sparkStreamingKinesis = "org.apache.spark" %% "spark-streaming-kinesis-asl" % V.spark 48 | val json4s = "org.json4s" %% "json4s-jackson" % V.json4s 49 | 50 | // Scala (test only) 51 | val specs2 = "org.specs2" % "specs2_2.10" % V.specs2 % "test" 52 | val guava = "com.google.guava" % "guava" % V.guava % "test" 53 | 54 | // Add additional libraries from mvnrepository.com (SBT syntax) here... 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /project/SparkStreamingExampleProjectBuild.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object SparkStreamingExampleProjectBuild extends Build { 17 | 18 | import Dependencies._ 19 | import BuildSettings._ 20 | 21 | // Configure prompt to show current project 22 | override lazy val settings = super.settings :+ { 23 | shellPrompt := { s => Project.extract(s).currentProject.id + " > " } 24 | } 25 | 26 | // Define our project, with basic project information and library dependencies 27 | lazy val project = Project("spark-streaming-example-project", file(".")) 28 | .settings(buildSettings: _*) 29 | .settings( 30 | libraryDependencies ++= Seq( 31 | Libraries.awsSdk, 32 | Libraries.awsSdkCore, 33 | Libraries.awsKinesisConnectors, 34 | Libraries.argot, 35 | Libraries.sparkCore, 36 | Libraries.sparkStreaming, 37 | Libraries.sparkStreamingKinesis, 38 | Libraries.specs2 39 | ) 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.6 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark 14 | 15 | /** 16 | * Scala package object to hold types, 17 | * helper methods etc. 18 | * 19 | * See: 20 | * http://www.artima.com/scalazine/articles/package_objects.html 21 | */ 22 | package object streaming { 23 | 24 | // TODO: add any packages we need 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/SimpleEvent.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // Java 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | // json4s 20 | import org.json4s._ 21 | import org.json4s.jackson.JsonMethods._ 22 | 23 | // This project 24 | import storage.BucketingStrategy 25 | 26 | /** 27 | * Companion object for creating a SimpleEvent 28 | * from incoming JSON 29 | */ 30 | object SimpleEvent { 31 | 32 | private val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss") 33 | 34 | /** 35 | * Converts date string into Date object 36 | */ 37 | def convertStringToDate(dateString: String): Date = format.parse(dateString) 38 | 39 | /** 40 | * Converts Kinesis ByteArray of JSON data into SimpleEvent objects 41 | */ 42 | def fromJson(byteArray: Array[Byte]): SimpleEvent = { 43 | implicit val formats = DefaultFormats 44 | val newString = new String(byteArray, "UTF-8") 45 | val parsed = parse(newString) 46 | parsed.extract[SimpleEvent] 47 | } 48 | 49 | } 50 | 51 | /** 52 | * Simple Class demonstrating an EventType log consisting of: 53 | * 1. ISO 8601 DateTime Object that will be downsampled 54 | * (see BucketingStrategy.scala file for more details) 55 | * 2. A simple model of colors for this EventType: 56 | * 'Red','Orange','Yellow','Green', or 'Blue' 57 | * example log: {"timestamp": "2015-06-05T13:00:22.540374", "type": "Orange", "id": "018dd633-f4c3-4599-9b44-ebf71a1c519f"} 58 | */ 59 | case class SimpleEvent(id: String, timestamp: String, `type`: String) { 60 | 61 | // Convert timestamp into Time Bucket using Bucketing Strategy 62 | val bucket = BucketingStrategy.bucket(SimpleEvent.convertStringToDate(timestamp)) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/StreamingCounts.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // Spark 16 | 17 | import com.amazonaws.services.dynamodbv2.document.DynamoDB 18 | import org.apache.spark.SparkConf 19 | import org.apache.spark.streaming._ 20 | import org.apache.spark.streaming.kinesis.KinesisUtils 21 | 22 | // This project 23 | import storage.DynamoUtils 24 | import kinesis.{KinesisUtils => KU} 25 | 26 | /** 27 | * Core of the Spark Streaming Application 28 | * 1. Configuration information is brought in from StreamingCountsApp.scala 29 | * 2. Object sets up Kinesis, DynamoDB, CloudTrail connections 30 | * 3. 
Once connections are up, Spark StreamingCounts stream processing starts 31 | * AWS Kinesis -> Apache Spark Streaming -> AWS DynamoDB 32 | * Raw Data -> Stream Processing Data -> Stored in Database 33 | * 34 | * (More on Spark Streaming: https://spark.apache.org/docs/1.3.0/streaming-kinesis-integration.html) 35 | */ 36 | object StreamingCounts { 37 | 38 | /** 39 | * Private function to set up Spark Streaming 40 | * 41 | * @param config The configuration for our job using StreamingCountsConfig.scala 42 | */ 43 | private def setupSparkContext(config: StreamingCountsConfig): StreamingContext = { 44 | val streamingSparkContext = { 45 | val sparkConf = new SparkConf().setAppName(config.appName).setMaster(config.master) 46 | new StreamingContext(sparkConf, config.batchInterval) 47 | } 48 | streamingSparkContext 49 | } 50 | 51 | /** 52 | * Starts our processing of a single Kinesis stream. 53 | * Never ends. 54 | * 55 | * @param config The configuration for our job using StreamingCountsConfig.scala 56 | */ 57 | def execute(config: StreamingCountsConfig) { 58 | 59 | // setting up Spark Streaming connection to Kinesis 60 | val kinesisClient = KU.setupKinesisClientConnection(config.endpointUrl, config.awsProfile) 61 | require(kinesisClient != null, 62 | "No AWS credentials found. Please specify credentials using one of the methods specified " + 63 | "in http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html") 64 | 65 | // setting up Spark Streaming connection to DynamoDB 66 | lazy val dynamoConnection = DynamoUtils.setupDynamoClientConnection(config.awsProfile) 67 | 68 | val streamingSparkContext = setupSparkContext(config) 69 | val numShards = KU.getShardCount(kinesisClient, config.streamName) 70 | val sparkDStreams = (0 until numShards).map { i => 71 | KinesisUtils.createStream( 72 | ssc = streamingSparkContext, 73 | streamName = config.streamName, 74 | endpointUrl = config.endpointUrl, 75 | initialPositionInStream = config.initialPosition, 76 | checkpointInterval = config.batchInterval, 77 | storageLevel = config.storageLevel 78 | ) 79 | } 80 | 81 | // Map phase: union DStreams, derive events, determine bucket 82 | val bucketedEvents = streamingSparkContext 83 | .union(sparkDStreams) 84 | .map { bytes => 85 | val e = SimpleEvent.fromJson(bytes) 86 | (e.bucket, e.`type`) 87 | } 88 | 89 | // Reduce phase: group by key then by count 90 | val bucketedEventCounts = bucketedEvents 91 | .groupByKey 92 | .map { case (eventType, events) => 93 | val count = events.groupBy(identity).mapValues(_.size) 94 | (eventType, count) 95 | } 96 | 97 | // Iterate over each aggregate record and save the record into DynamoDB 98 | bucketedEventCounts.foreachRDD { rdd => 99 | rdd.foreach { case (bucket, aggregates) => 100 | aggregates.foreach { case (eventType, count) => 101 | DynamoUtils.setOrUpdateCount( 102 | dynamoConnection, 103 | config.tableName, 104 | bucket.toString, 105 | eventType, 106 | DynamoUtils.timeNow(), 107 | DynamoUtils.timeNow(), 108 | count.toInt 109 | ) 110 | } 111 | } 112 | } 113 | 114 | // Start Spark Streaming process 115 | streamingSparkContext.start() 116 | streamingSparkContext.awaitTermination() 117 | } 118 | } -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/StreamingCountsApp.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 
3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // Java 16 | import java.io.File 17 | import java.io.FileReader 18 | import java.util.Properties 19 | 20 | // AWS libs 21 | import com.amazonaws.auth.AWSCredentialsProvider 22 | import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream 23 | import com.amazonaws.services.kinesis.connectors.KinesisConnectorConfiguration 24 | 25 | // Config 26 | import com.typesafe.config.{Config, ConfigFactory} 27 | 28 | // Argot 29 | import org.clapper.argot._ 30 | 31 | // Spark 32 | import org.apache.spark.storage.StorageLevel 33 | import org.apache.spark.streaming.{Minutes, Seconds, Milliseconds} 34 | 35 | /** 36 | * The entry point class for the Spark Streaming Application. 37 | * 38 | * Usage: 39 | * 40 | * spark/bin/spark-submit --class com.snowplowanalytics.spark.streaming.StreamingCountsApp \ 41 | * --master local[2] \ 42 | * spark-streaming-example-project/target/scala-2.10/spark-streaming-example-project-0.1.0.jar \ 43 | * --config spark-streaming-example-project/src/main/resources/config.hocon.sample 44 | */ 45 | object StreamingCountsApp { 46 | 47 | def main(args: Array[String]) { 48 | 49 | // General bumf for our app 50 | val parser = new ArgotParser( 51 | programName = "generated", 52 | compactUsage = true, 53 | preUsage = Some("%s: Version %s. 
Copyright (c) 2015, %s.".format( 54 | generated.Settings.name, 55 | generated.Settings.version, 56 | generated.Settings.organization) 57 | ) 58 | ) 59 | 60 | // Optional config argument 61 | val config = parser.option[Config](List("config"), 62 | "filename", 63 | "Configuration file.") { 64 | (c, opt) => 65 | 66 | val file = new File(c) 67 | if (file.exists) { 68 | ConfigFactory.parseFile(file) 69 | } else { 70 | parser.usage("Configuration file \"%s\" does not exist".format(c)) 71 | ConfigFactory.empty() 72 | } 73 | } 74 | parser.parse(args) 75 | 76 | // read the config file if --config parameter is provided else fail 77 | val conf = config.value.getOrElse(throw new RuntimeException("--config argument must be provided")) 78 | 79 | // create Spark Streaming Config from hocon file in resource directory 80 | val scc = StreamingCountsConfig( 81 | region = conf.getConfig("kinesis").getString("region"), 82 | streamName = conf.getConfig("kinesis").getString("streamName"), 83 | checkpointInterval = Minutes(conf.getConfig("spark").getInt("checkpointInterval")), 84 | initialPosition = InitialPositionInStream.LATEST, 85 | storageLevel = StorageLevel.MEMORY_AND_DISK_2, 86 | appName = conf.getConfig("spark").getString("appName"), 87 | master = conf.getConfig("spark").getString("master"), 88 | batchInterval = Milliseconds(conf.getConfig("spark").getInt("batchInterval")), 89 | tableName = conf.getConfig("dynamodb").getString("tableName"), 90 | awsProfile = conf.getConfig("aws").getString("awsProfile") 91 | ) 92 | 93 | // start StreamingCounts application with config object 94 | StreamingCounts.execute(scc) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/StreamingCountsConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | 15 | // AWS SDK 16 | import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream 17 | 18 | // Spark 19 | import org.apache.spark.storage.StorageLevel 20 | import org.apache.spark.streaming.Duration 21 | 22 | /** 23 | * Configuration object for our StreamingCounts job 24 | */ 25 | case class StreamingCountsConfig( 26 | region: String, 27 | streamName: String, 28 | checkpointInterval: Duration, 29 | initialPosition: InitialPositionInStream, 30 | storageLevel: StorageLevel, 31 | appName: String, 32 | master: String, 33 | batchInterval: Duration, 34 | tableName: String, 35 | awsProfile: String 36 | ) { 37 | 38 | /** 39 | * The Kinesis endpoint from the region. 
40 | */ 41 | val endpointUrl = s"https://kinesis.${region}.amazonaws.com" 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/kinesis/KinesisUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | package kinesis 15 | 16 | // AWS KCL 17 | import com.amazonaws.auth.profile.ProfileCredentialsProvider 18 | import com.amazonaws.services.kinesis.AmazonKinesisClient 19 | 20 | 21 | object KinesisUtils { 22 | 23 | /** 24 | * Queries the given Kinesis stream and determines how many shards it 25 | * contains, so that one DStream can be created per shard and the sharded 26 | * data can be unioned for processing by the Spark Streaming application 27 | * (see StreamingCounts.execute). 28 | * 29 | * @param kinesisClient AWS Kinesis client 30 | * @param stream Kinesis stream name 31 | * @return the number of shards in the stream 32 | */ 33 | def getShardCount(kinesisClient: AmazonKinesisClient, stream: String): Int = 34 | kinesisClient 35 | .describeStream(stream) 36 | .getStreamDescription 37 | .getShards 38 | .size 39 | 40 | /** 41 | * Finds AWS credentials for the provided awsProfile and creates a Kinesis client 42 | */ 43 | def setupKinesisClientConnection(endpointUrl: String, awsProfile: String): AmazonKinesisClient = { 44 | val credentials = new ProfileCredentialsProvider(awsProfile) 45 | val akc = new AmazonKinesisClient(credentials) 46 | akc.setEndpoint(endpointUrl) 47 | akc 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/storage/BucketingStrategy.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | package storage 15 | 16 | // Java 17 | import java.util.Date 18 | import java.text.SimpleDateFormat 19 | 20 | /** 21 | * Object uses downsampling method to create metadata from each 22 | * EventType log record. Parsing the ISO 8601 23 | * datetime stamp to the minute means downsampling aka reducing 24 | * precision. 25 | * 26 | * Bucketing 27 | * A family of aggregations that build buckets, where each bucket 28 | * is associated with a key and an EventType criterion. When the 29 | * aggregation is executed, all the buckets criteria are evaluated 30 | * on every EventType in the context and when a criterion matches, 31 | * the EventType is considered to "fall in" the relevant bucket. 32 | * By the end of the aggregation process, we’ll end up with a 33 | * list of buckets - each one with a set of EventTypes that 34 | * "belong" to it. 35 | * 36 | */ 37 | object BucketingStrategy { 38 | 39 | private val BucketToMinuteFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:00.000") 40 | 41 | /** 42 | * Function to bucket a date based on 43 | * our bucketing strategy. Bucketing 44 | * means downsampling aka reducing 45 | * precision. 46 | * 47 | * @param date The Java Date to bucket 48 | * @return the downsampled date in String 49 | * format 50 | */ 51 | def bucket(date: Date): String = 52 | BucketToMinuteFormatter.format(date) 53 | } -------------------------------------------------------------------------------- /src/main/scala/com.snowplowanalytics.spark/streaming/storage/DynamoUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | package com.snowplowanalytics.spark.streaming 14 | package storage 15 | 16 | // Java 17 | import java.util.Date 18 | import java.util.TimeZone 19 | import java.text.SimpleDateFormat 20 | 21 | 22 | // AWS Authentication 23 | // http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html 24 | import com.amazonaws.auth.profile.ProfileCredentialsProvider 25 | 26 | // AWS DynamoDB 27 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient 28 | import com.amazonaws.services.dynamodbv2.document.{AttributeUpdate, DynamoDB, Item} 29 | 30 | /** 31 | * Object sets up singleton that finds AWS credentials for DynamoDB to access the 32 | * aggregation records table. The utility function below puts items into the 33 | * "AggregateRecords" table. 
34 | */ 35 | object DynamoUtils { 36 | 37 | val dateFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") 38 | val timezone = TimeZone.getTimeZone("UTC") 39 | 40 | /** 41 | * Function timezone helper 42 | */ 43 | def timeNow(): String = { 44 | dateFormatter.setTimeZone(timezone) 45 | dateFormatter.format(new Date()) 46 | } 47 | 48 | 49 | /** 50 | * Function wraps DynamoDB cred setup 51 | */ 52 | def setupDynamoClientConnection(awsProfile: String): DynamoDB = { 53 | val credentials = new ProfileCredentialsProvider(awsProfile) 54 | val dynamoDB = new DynamoDB(new AmazonDynamoDBClient(credentials)) 55 | dynamoDB 56 | } 57 | 58 | 59 | /** 60 | * Function wraps get or create item in DynamoDB table 61 | */ 62 | def setOrUpdateCount(dynamoDB: DynamoDB, tableName: String, bucketStart: String, eventType: String, createdAt: String, updatedAt: String, count: Int){ 63 | 64 | val recordInTable = getItem(dynamoDB: DynamoDB, tableName, bucketStart, eventType) 65 | println(recordInTable) 66 | if (recordInTable == null) { 67 | DynamoUtils.putItem(dynamoDB: DynamoDB, tableName, bucketStart, eventType, createdAt, updatedAt, count) 68 | } else { 69 | val oldCreatedAt = recordInTable.getJSON("CreatedAt").replace("\"", "").replace("\\", "") 70 | val oldCount = recordInTable.getJSON("Count").toInt 71 | val newCount = oldCount + count.toInt 72 | DynamoUtils.putItem(dynamoDB: DynamoDB, tableName, bucketStart, eventType, oldCreatedAt, updatedAt, newCount) 73 | } 74 | } 75 | 76 | 77 | /** 78 | * Function wraps AWS Java getItemOutcome operation to DynamoDB table 79 | */ 80 | def getItem(dynamoDB: DynamoDB, tableName: String, bucketStart: String, eventType: String): Item = { 81 | 82 | val table = dynamoDB.getTable(tableName) 83 | val items = table.getItemOutcome("BucketStart", bucketStart, "EventType", eventType) 84 | items.getItem 85 | } 86 | 87 | 88 | /** 89 | * Function wraps AWS Java putItem operation to DynamoDB table 90 | */ 91 | def putItem(dynamoDB: DynamoDB, tableName: String, bucketStart: String, eventType: String, createdAt: String, updatedAt: String, count: Int) { 92 | 93 | // AggregateRecords column names 94 | val tablePrimaryKeyName = "BucketStart" 95 | val tableEventTypeSecondaryKeyName = "EventType" 96 | val tableCreatedAtColumnName = "CreatedAt" 97 | val tableUpdatedAtColumnName = "UpdatedAt" 98 | val tableCountColumnName = "Count" 99 | 100 | try { 101 | val time = new Date().getTime - (1 * 24 * 60 * 60 * 1000) 102 | val date = new Date() 103 | date.setTime(time) 104 | dateFormatter.setTimeZone(TimeZone.getTimeZone("UTC")) 105 | val table = dynamoDB.getTable(tableName) 106 | println("Adding data to " + tableName) 107 | 108 | val item = new Item().withPrimaryKey(tablePrimaryKeyName, bucketStart) 109 | .withString(tableEventTypeSecondaryKeyName, eventType) 110 | .withString(tableCreatedAtColumnName, createdAt) 111 | .withString(tableUpdatedAtColumnName, updatedAt) 112 | .withInt(tableCountColumnName, count) 113 | 114 | // saving the data to DynamoDB AggregrateRecords table 115 | // println(item) 116 | table.putItem(item) 117 | } catch { 118 | case e: Exception => { 119 | System.err.println("Failed to create item in " + tableName) 120 | System.err.println(e.getMessage) 121 | } 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. 
2 | #
3 | # This program is licensed to you under the Apache License Version 2.0,
4 | # and you may not use this file except in compliance with the Apache License Version 2.0.
5 | # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6 | #
7 | # Unless required by applicable law or agreed to in writing,
8 | # software distributed under the Apache License Version 2.0 is distributed on an
9 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11 | 
12 | import datetime, json, uuid, time
13 | from functools import partial
14 | from random import choice
15 | 
16 | from invoke import run, task
17 | 
18 | import boto
19 | from boto import kinesis
20 | import boto.dynamodb2
21 | from boto.dynamodb2.fields import HashKey, RangeKey, KeysOnlyIndex, GlobalAllIndex
22 | from boto.dynamodb2.table import Table
23 | from boto.dynamodb2.types import NUMBER
24 | 
25 | 
26 | JAR_FILE = "spark-streaming-example-project-0.1.0.jar"
27 | 
28 | # Selection of EventType values
29 | COLORS = ['Red','Orange','Yellow','Green','Blue']
30 | 
31 | 
32 | # DynamoDB settings
33 | THROUGHPUT_READ = 20
34 | THROUGHPUT_WRITE = 20
35 | 
36 | 
37 | # AWS Kinesis Data Generator
38 | def picker(seq):
39 |     """
40 |     Returns a new function that can be called without arguments
41 |     to select and return a random element of the given sequence
42 |     """
43 |     return partial(choice, seq)
44 | 
45 | def create_event():
46 |     """
47 |     Picks a random color and builds an event payload around it
48 |     """
49 |     event_id = str(uuid.uuid4())
50 |     color_choice = picker(COLORS)
51 | 
52 |     return (event_id, {
53 |         "id": event_id,
54 |         "timestamp": datetime.datetime.now().isoformat(),
55 |         "type": color_choice()
56 |     })
57 | 
58 | def write_event(conn, stream_name):
59 |     """
60 |     Writes a single event to the Kinesis stream and returns its JSON payload
61 |     """
62 |     event_id, event_payload = create_event()
63 |     event_json = json.dumps(event_payload)
64 |     conn.put_record(stream_name, event_json, event_id)
65 |     return event_json
66 | 
67 | 
68 | @task
69 | def generate_events(profile, region, stream):
70 |     """
71 |     Load demo SimpleEvent data into Kinesis with this Python generator script
72 |     """
73 |     conn = kinesis.connect_to_region(region, profile_name=profile)
74 |     while True:
75 |         event_json = write_event(conn, stream)
76 |         print "Event sent to Kinesis: {}".format(event_json)
77 |         #time.sleep(5)
78 | 
79 | @task
80 | def build_spark():
81 |     """
82 |     Compile and assemble Apache Spark with Kinesis support
83 |     """
84 |     run("rm -rf master.zip spark-master", pty=True)
85 |     run("wget https://github.com/apache/spark/archive/master.zip")
86 |     run("unzip master.zip", pty=True)
87 |     run('export MAVEN_OPTS="-Xmx1g -XX:MaxPermSize=256M -XX:ReservedCodeCacheSize=256m" && cd spark-master && mvn -Pkinesis-asl -DskipTests clean package', pty=True)
88 | 
89 | 
90 | @task
91 | def build_project():
92 |     """
93 |     Build spark-streaming-example-project
94 |     and package it into a "fat jar" ready for spark-submit
95 |     """
96 |     run("sbt assembly", pty=True)
97 | 
98 | 
99 | @task
100 | def create_profile(profile):
101 |     """
102 |     Create an AWS credentials profile via aws configure
103 |     """
104 |     run("aws configure --profile {}".format(profile), pty=True)
105 | 
106 | 
107 | @task
108 | def create_dynamodb_table(profile, region, table):
109 |     """
110 |     Create the DynamoDB table with the Boto library
111 |     """
112 | 
113 |     connection = boto.dynamodb2.connect_to_region(region, profile_name=profile)
114 |     aggregate = Table.create(table,
115 |         schema=[
116 |             HashKey("BucketStart"),
117 |             RangeKey("EventType"),
118 |         ],
119 |         throughput={
120 |             'read': THROUGHPUT_READ,
121 |             'write': THROUGHPUT_WRITE
122 |         },
123 |         connection=connection
124 |     )
125 | 
126 | 
127 | @task
128 | def create_kinesis_stream(profile, stream):
129 |     """
130 |     Create the Kinesis stream
131 |     """
132 | 
133 |     # TODO: switch to use boto
134 |     run("aws kinesis create-stream --stream-name {} --shard-count 1 --profile {}".format(stream, profile), pty=True)
135 | 
136 | 
137 | @task
138 | def describe_kinesis_stream(profile, stream):
139 |     """
140 |     Show the status of the named Kinesis stream
141 |     """
142 | 
143 |     # TODO: switch to use boto
144 |     run("aws kinesis describe-stream --stream-name {} --profile {}".format(stream, profile), pty=True)
145 | 
146 | 
147 | @task
148 | def run_project(config_path):
149 |     """
150 |     Submits the compiled "fat jar" to Apache Spark and
151 |     starts Spark Streaming based on the project settings
152 |     """
153 |     run("./spark-master/bin/spark-submit \
154 |         --class com.snowplowanalytics.spark.streaming.StreamingCountsApp \
155 |         --master local[4] \
156 |         ./target/scala-2.10/{} \
157 |         --config {}".format(JAR_FILE, config_path),
158 |         pty=True)
159 | 
--------------------------------------------------------------------------------
/vagrant/.gitignore:
--------------------------------------------------------------------------------
1 | .peru
2 | oss-playbooks
3 | ansible
--------------------------------------------------------------------------------
/vagrant/ansible.hosts:
--------------------------------------------------------------------------------
1 | [vagrant]
2 | 127.0.0.1:2222
--------------------------------------------------------------------------------
/vagrant/peru.yaml:
--------------------------------------------------------------------------------
1 | imports:
2 |   ansible: ansible
3 |   ansible_playbooks: oss-playbooks
4 | 
5 | curl module ansible:
6 |   # Equivalent of git cloning tags/v1.6.6 but much, much faster
7 |   url: https://codeload.github.com/ansible/ansible/zip/69d85c22c7475ccf8169b6ec9dee3ee28c92a314
8 |   unpack: zip
9 |   export: ansible-69d85c22c7475ccf8169b6ec9dee3ee28c92a314
10 | 
11 | git module ansible_playbooks:
12 |   url: https://github.com/snowplow/ansible-playbooks.git
13 |   # Uncomment to fetch a specific rev instead of master:
14 |   # rev: xxx
--------------------------------------------------------------------------------
/vagrant/up.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | 
4 | vagrant_dir=/vagrant/vagrant
5 | bashrc=/home/vagrant/.bashrc
6 | 
7 | echo "========================================"
8 | echo "INSTALLING PERU AND ANSIBLE DEPENDENCIES"
9 | echo "----------------------------------------"
10 | apt-get update
11 | apt-get install -y language-pack-en git unzip libyaml-dev python3-pip python-yaml python-paramiko python-jinja2
12 | 
13 | echo "==============="
14 | echo "INSTALLING PERU"
15 | echo "---------------"
16 | sudo pip3 install peru
17 | 
18 | echo "======================================="
19 | echo "CLONING ANSIBLE AND PLAYBOOKS WITH PERU"
20 | echo "---------------------------------------"
21 | cd ${vagrant_dir} && peru sync -v
22 | echo "... done"
23 | 
24 | env_setup=${vagrant_dir}/ansible/hacking/env-setup
25 | hosts=${vagrant_dir}/ansible.hosts
26 | 
27 | echo "==================="
28 | echo "CONFIGURING ANSIBLE"
29 | echo "-------------------"
30 | touch ${bashrc}
31 | echo "source ${env_setup}" >> ${bashrc}
32 | echo "export ANSIBLE_HOSTS=${hosts}" >> ${bashrc}
33 | echo "... done"
34 | 
35 | echo "=========================================="
36 | echo "RUNNING PLAYBOOKS WITH ANSIBLE*"
37 | echo "* no output while each playbook is running"
38 | echo "------------------------------------------"
39 | while read pb; do
40 |   su - -c "source ${env_setup} && ${vagrant_dir}/ansible/bin/ansible-playbook ${vagrant_dir}/${pb} --connection=local --inventory-file=${hosts}" vagrant
41 | done <${vagrant_dir}/up.playbooks
42 | 
43 | guidance=${vagrant_dir}/up.guidance
44 | 
45 | if [ -f ${guidance} ]; then
46 |   echo "==========="
47 |   echo "PLEASE READ"
48 |   echo "-----------"
49 |   cat $guidance
50 | fi
51 | 
--------------------------------------------------------------------------------
/vagrant/up.guidance:
--------------------------------------------------------------------------------
1 | To get started:
2 | vagrant ssh
3 | cd /vagrant
4 | sbt test
--------------------------------------------------------------------------------
/vagrant/up.playbooks:
--------------------------------------------------------------------------------
1 | oss-playbooks/aws-cli-and-psql.yml
2 | oss-playbooks/java7.yml
3 | oss-playbooks/scala.yml
4 | oss-playbooks/sbt.yml
5 | oss-playbooks/invoke.yml
6 | 
--------------------------------------------------------------------------------
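
Usage sketch: the short Scala snippet below is not a file in this repository; it is a
minimal, illustrative example of how the storage helpers above are assumed to fit
together. An event's timestamp is downsampled to a minute bucket with
BucketingStrategy.bucket, and the per-EventType count for that bucket is upserted with
DynamoUtils.setOrUpdateCount. The AWS profile name, table name and event type used here
are placeholders; in the real streaming job these values would come from the project's
configuration (see config.hocon.sample).

import java.util.Date

import com.snowplowanalytics.spark.streaming.storage.{BucketingStrategy, DynamoUtils}

object StorageUsageSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder AWS profile name, for illustration only
    val dynamoDB    = DynamoUtils.setupDynamoClientConnection("default")
    val bucketStart = BucketingStrategy.bucket(new Date()) // e.g. "2015-06-10T14:23:00.000"
    val now         = DynamoUtils.timeNow()

    // Creates the ("BucketStart", "EventType") item if absent,
    // otherwise adds 1 to its existing Count
    DynamoUtils.setOrUpdateCount(dynamoDB, "AggregateRecords", bucketStart, "Green", now, now, 1)
  }
}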