├── .gitignore ├── .travis.yml ├── README.md ├── activator ├── activator-launch-1.3.5.jar ├── activator.bat ├── modules ├── api │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ ├── log4j.properties │ │ └── logback.xml │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── api │ │ └── http │ │ ├── ApiHttpService.scala │ │ ├── Boot.scala │ │ └── Protocols.scala ├── common │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── common │ │ ├── StaticValues.scala │ │ └── config │ │ └── ConfigRegistry.scala ├── persistence │ └── src │ │ └── main │ │ ├── resources │ │ └── data │ │ │ └── spark_on_spark.cql │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── persistence │ │ └── schema │ │ ├── CassandraServices.scala │ │ └── domain │ │ ├── PersistenceException.scala │ │ └── TweetsModels.scala ├── services │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── services │ │ └── twitter │ │ ├── TwitterReceiverActorStream.scala │ │ ├── TwitterServices.scala │ │ ├── TwitterStreamingServices.scala │ │ └── domain │ │ ├── Conversions.scala │ │ └── TwitterServiceException.scala └── test │ └── src │ └── test │ ├── resources │ ├── log4j.properties │ ├── logback.xml │ └── reference.conf │ └── scala │ └── com │ └── fortysevendeg │ └── sparkon │ ├── common │ └── BaseServiceTest.scala │ ├── persistence │ └── CassandraServicesSpec.scala │ └── services │ └── twitter │ ├── TwitterReceiverActorStreamSpec.scala │ ├── TwitterServicesSpec.scala │ └── TwitterStreamingServicesSpec.scala ├── project ├── Build.scala ├── Dependencies.scala ├── Excludes.scala ├── Settings.scala ├── SettingsDocker.scala ├── V.scala ├── build.properties └── plugins.sbt └── scripts ├── deploy.sh ├── docker-compose.yml ├── initOpscenter.sh └── sparkOn.env /.gitignore: -------------------------------------------------------------------------------- 1 | checkpoint 2 | logs 3 | target 4 | tmp 5 | .history 6 | dist 7 | /out 8 | /RUNNING_PID 9 | /.ivy* 10 | 11 | # sbt specific 12 | /.sbt 13 | .cache/ 14 | .history/ 15 | .lib/ 16 | dist/* 17 | target/ 18 | lib_managed/ 19 | src_managed/ 20 | project/boot/ 21 | project/plugins/project/ 22 | project/project/target 23 | project/project/project* 24 | project/target 25 | /.activator 26 | 27 | # Scala-IDE specific 28 | .scala_dependencies 29 | .worksheet 30 | 31 | #Eclipse specific 32 | .classpath 33 | .project 34 | .cache 35 | .settings/ 36 | 37 | #IntelliJ IDEA specific 38 | .idea/ 39 | /.idea_modules 40 | /.idea 41 | /*.iml 42 | 43 | #LevelDB specific 44 | journal/ 45 | snapshots/ 46 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.7 4 | script: 5 | - sbt ++$TRAVIS_SCALA_VERSION coverage test 6 | after_script: 7 | - sbt ++$TRAVIS_SCALA_VERSION coverageReport 8 | - sbt ++$TRAVIS_SCALA_VERSION coverageAggregate 9 | - sbt ++$TRAVIS_SCALA_VERSION codacyCoverage -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/47deg/spark-on-lets-code.svg?branch=master)](https://travis-ci.org/47deg/spark-on-lets-code) 2 | [![Codacy Badge](https://api.codacy.com/project/badge/a7ac855c47cc46ea80b6c69907415f5c)](https://www.codacy.com/app/47deg/spark-on-lets-code) 
3 | 
4 | # Spark On
5 | 
6 | This small Spark project provides the sample code discussed in the `Spark On` blog post series on the [47 Degrees Blog](http://www.47deg.com/blog/tags/sparkonletscode).
7 | 
8 | ## App Requirements
9 | 
10 | * Twitter credentials to connect to the Twitter API. Read more about them [here](https://dev.twitter.com/overview/documentation).
11 | * In this README.md file you will see the IP address `192.168.99.100`. If you are using [docker-machine](https://docs.docker.com/machine/), the `docker-machine ip` command returns your Docker host's IP address. Replace `192.168.99.100` with the IP address in your environment.
12 | * The whole infrastructure has been tested on an Apple MacBook Pro (2.7 GHz Intel Core i5, 16 GB 1867 MHz DDR3).
13 | 
14 | To start off, we need to define a few environment variables in this [config file](https://github.com/47deg/spark-on-lets-code/blob/master/scripts/sparkOn.env#L5).
15 | 
16 | ## Deploy Docker Infrastructure
17 | 
18 | ### Start Cluster
19 | 
20 | We've defined a bash script that deploys all of the cluster dependencies, including the Spark Streaming application, so we can bring everything up with:
21 | 
22 |     scripts/deploy.sh
23 | 
24 | By default, the infrastructure deployed will be:
25 | 
26 | - Spark Cluster:
27 |     - 1 Spark Master
28 |     - 2 Spark Worker nodes
29 | - Cassandra Cluster:
30 |     - 2 Cassandra Docker containers
31 |     - 1 Docker container with [DataStax OpsCenter](http://www.datastax.com/products/datastax-enterprise-visual-admin)
32 | - Kafka Cluster:
33 |     - 1 Docker container running Zookeeper
34 |     - 3 Docker containers running as Kafka brokers
35 | - Hadoop HDFS Cluster:
36 |     - 1 Docker container running as namenode
37 |     - 1 Docker container running as datanode
38 | - 1 Docker container for our Streaming App
39 | 
40 | ### Scaling Out Services
41 | 
42 | For instance, to increase the number of Spark Workers available:
43 | 
44 |     docker-compose scale spark_worker=5
45 | 
46 | ### Start the Streaming
47 | 
48 | If everything is running correctly, we can start the Twitter streaming as follows:
49 | 
50 |     curl -X "POST" "http://192.168.99.100:9090/twitter-streaming" \
51 |         -H "Content-Type: application/json" \
52 |         -d $'{
53 |       "recreateDatabaseSchema": true,
54 |       "filters": [
55 |         "lambda",
56 |         "scala",
57 |         "akka",
58 |         "spray",
59 |         "play2",
60 |         "playframework",
61 |         "spark",
62 |         "java",
63 |         "python",
64 |         "cassandra",
65 |         "bigdata",
66 |         "47 Degrees",
67 |         "47Degrees",
68 |         "47Deg",
69 |         "programming",
70 |         "chicharrones",
71 |         "cat",
72 |         "dog"
73 |       ]
74 |     }'
75 | 
76 | ### Connect to the Web Socket
77 | 
78 | For instance, you could use the [Simple WebSocket Client](https://goo.gl/8Jw6K) extension for Google Chrome, opening a connection to `ws://192.168.99.100:9090/trending-topics`.
79 | 
80 | ### Stop Cluster
81 | 
82 | We can stop the streaming gracefully before stopping the cluster:
83 | 
84 |     curl -X "DELETE" "http://192.168.99.100:9090/twitter-streaming"
85 | 
86 | And then, from the shell:
87 | 
88 |     cd scripts
89 |     docker-compose stop
90 |     docker-compose rm
91 | 
92 | # HTTP Application API - FORMAT: 1A
93 | 
94 | ## Spark Streaming Status Endpoint [/twitter-streaming]
95 | 
96 | Start, stop, and fetch the status of the Spark Streaming Context in the application. Note: once you have stopped the context, you cannot start it again.
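For example, assuming the same docker-machine IP used throughout this README, the current status can be checked with a plain GET request (an illustrative request; the possible responses are documented below):

    curl -X "GET" "http://192.168.99.100:9090/twitter-streaming"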
97 | 
98 | ### Get Streaming Status [GET]
99 | 
100 | + Response 200 (application/json)
101 | 
102 |         {
103 |             "message": "The streaming has been created, but not been started yet"
104 |         }
105 | 
106 | ### Start Streaming [POST]
107 | 
108 | This action allows you to start the Spark Streaming Context.
109 | 
110 | + Response 200 (application/json)
111 | 
112 |         {
113 |             "message": "Started"
114 |         }
115 | 
116 | + Response 400
117 | 
118 | ### Stop Streaming [DELETE]
119 | 
120 | This action allows you to stop the Spark Streaming Context.
121 | 
122 | + Response 200 (application/json)
123 | 
124 |         {
125 |             "message": "The streaming has been stopped"
126 |         }
127 | 
128 | + Response 400
129 | 
130 | ## WS Filtered Twitter Word Tracks [WS /trending-topics]
131 | 
132 | Open a WebSocket connection to receive each new filtered track word as it is found.
133 | 
134 | # License
135 | 
136 | Copyright (C) 2015 47 Degrees, LLC [http://47deg.com](http://47deg.com) [hello@47deg.com](mailto:hello@47deg.com)
137 | 
138 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
139 | 
140 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
141 | 
--------------------------------------------------------------------------------
/activator:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | ### ------------------------------- ###
4 | ### Helper methods for BASH scripts ###
5 | ### ------------------------------- ###
6 | 
7 | realpath () {
8 | (
9 |   TARGET_FILE="$1"
10 | 
11 |   cd "$(dirname "$TARGET_FILE")"
12 |   TARGET_FILE=$(basename "$TARGET_FILE")
13 | 
14 |   COUNT=0
15 |   while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
16 |   do
17 |     TARGET_FILE=$(readlink "$TARGET_FILE")
18 |     cd "$(dirname "$TARGET_FILE")"
19 |     TARGET_FILE=$(basename "$TARGET_FILE")
20 |     COUNT=$(($COUNT + 1))
21 |   done
22 | 
23 |   if [ "$TARGET_FILE" == "." -o "$TARGET_FILE" == ".." ]; then
24 |     cd "$TARGET_FILE"
25 |     TARGET_FILEPATH=
26 |   else
27 |     TARGET_FILEPATH=/$TARGET_FILE
28 |   fi
29 | 
30 |   # make sure we grab the actual windows path, instead of cygwin's path.
31 |   if ! is_cygwin; then
32 |     echo "$(pwd -P)/$TARGET_FILE"
33 |   else
34 |     echo $(cygwinpath "$(pwd -P)/$TARGET_FILE")
35 |   fi
36 | )
37 | }
38 | 
39 | # TODO - Do we need to detect msys?
40 | 
41 | # Uses uname to detect if we're in the odd cygwin environment.
42 | is_cygwin() {
43 |   local os=$(uname -s)
44 |   case "$os" in
45 |     CYGWIN*) return 0 ;;
46 |     *) return 1 ;;
47 |   esac
48 | }
49 | 
50 | # This can fix cygwin style /cygdrive paths so we get the
51 | # windows style paths.
52 | cygwinpath() {
53 |   local file="$1"
54 |   if is_cygwin; then
55 |     echo $(cygpath -w $file)
56 |   else
57 |     echo $file
58 |   fi
59 | }
60 | 
61 | # Make something URI friendly
62 | make_url() {
63 |   url="$1"
64 |   local nospaces=${url// /%20}
65 |   if is_cygwin; then
66 |     echo "/${nospaces//\\//}"
67 |   else
68 |     echo "$nospaces"
69 |   fi
70 | }
71 | 
72 | # Detect if we should use JAVA_HOME or just try PATH.
73 | get_java_cmd() { 74 | if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then 75 | echo "$JAVA_HOME/bin/java" 76 | else 77 | echo "java" 78 | fi 79 | } 80 | 81 | echoerr () { 82 | echo 1>&2 "$@" 83 | } 84 | vlog () { 85 | [[ $verbose || $debug ]] && echoerr "$@" 86 | } 87 | dlog () { 88 | [[ $debug ]] && echoerr "$@" 89 | } 90 | execRunner () { 91 | # print the arguments one to a line, quoting any containing spaces 92 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 93 | for arg; do 94 | if printf "%s\n" "$arg" | grep -q ' '; then 95 | printf "\"%s\"\n" "$arg" 96 | else 97 | printf "%s\n" "$arg" 98 | fi 99 | done 100 | echo "" 101 | } 102 | 103 | exec "$@" 104 | } 105 | addJava () { 106 | dlog "[addJava] arg = '$1'" 107 | java_args=( "${java_args[@]}" "$1" ) 108 | } 109 | addApp () { 110 | dlog "[addApp] arg = '$1'" 111 | sbt_commands=( "${app_commands[@]}" "$1" ) 112 | } 113 | addResidual () { 114 | dlog "[residual] arg = '$1'" 115 | residual_args=( "${residual_args[@]}" "$1" ) 116 | } 117 | addDebugger () { 118 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 119 | } 120 | addConfigOpts () { 121 | dlog "[addConfigOpts] arg = '$*'" 122 | for item in $* 123 | do 124 | addJava "$item" 125 | done 126 | } 127 | # a ham-fisted attempt to move some memory settings in concert 128 | # so they need not be messed around with individually. 129 | get_mem_opts () { 130 | local mem=${1:-1024} 131 | local meta=$(( $mem / 4 )) 132 | (( $meta > 256 )) || meta=256 133 | (( $meta < 1024 )) || meta=1024 134 | 135 | # default is to set memory options but this can be overridden by code section below 136 | memopts="-Xms${mem}m -Xmx${mem}m" 137 | if [[ "${java_version}" > "1.8" ]]; then 138 | extmemopts="-XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=${meta}m" 139 | else 140 | extmemopts="-XX:PermSize=64m -XX:MaxPermSize=${meta}m" 141 | fi 142 | 143 | if [[ "${java_opts}" == *-Xmx* ]] || [[ "${java_opts}" == *-Xms* ]] || [[ "${java_opts}" == *-XX:MaxPermSize* ]] || [[ "${java_opts}" == *-XX:ReservedCodeCacheSize* ]] || [[ "${java_opts}" == *-XX:MaxMetaspaceSize* ]]; then 144 | # if we detect any of these settings in ${java_opts} we need to NOT output our settings. 145 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 146 | memopts="" 147 | extmemopts="" 148 | fi 149 | 150 | echo "${memopts} ${extmemopts}" 151 | } 152 | require_arg () { 153 | local type="$1" 154 | local opt="$2" 155 | local arg="$3" 156 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 157 | die "$opt requires <$type> argument" 158 | fi 159 | } 160 | is_function_defined() { 161 | declare -f "$1" > /dev/null 162 | } 163 | 164 | # If we're *not* running in a terminal, and we don't have any arguments, then we need to add the 'ui' parameter 165 | detect_terminal_for_ui() { 166 | [[ ! -t 0 ]] && [[ "${#residual_args}" == "0" ]] && { 167 | addResidual "ui" 168 | } 169 | # SPECIAL TEST FOR MAC 170 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args}" == "0" ]] && { 171 | echo "Detected MAC OSX launched script...." 172 | echo "Swapping to UI" 173 | addResidual "ui" 174 | } 175 | } 176 | 177 | # Processes incoming arguments and places them in appropriate global variables. called by the run method. 
178 | process_args () { 179 | while [[ $# -gt 0 ]]; do 180 | case "$1" in 181 | -h|-help) usage; exit 1 ;; 182 | -v|-verbose) verbose=1 && shift ;; 183 | -d|-debug) debug=1 && shift ;; 184 | -mem) require_arg integer "$1" "$2" && app_mem="$2" && shift 2 ;; 185 | -jvm-debug) 186 | if echo "$2" | grep -E ^[0-9]+$ > /dev/null; then 187 | addDebugger "$2" && shift 188 | else 189 | addDebugger 9999 190 | fi 191 | shift ;; 192 | -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;; 193 | -D*) addJava "$1" && shift ;; 194 | -J*) addJava "${1:2}" && shift ;; 195 | *) addResidual "$1" && shift ;; 196 | esac 197 | done 198 | 199 | is_function_defined process_my_args && { 200 | myargs=("${residual_args[@]}") 201 | residual_args=() 202 | process_my_args "${myargs[@]}" 203 | } 204 | } 205 | 206 | # Actually runs the script. 207 | run() { 208 | # TODO - check for sane environment 209 | 210 | # process the combined args, then reset "$@" to the residuals 211 | process_args "$@" 212 | detect_terminal_for_ui 213 | set -- "${residual_args[@]}" 214 | argumentCount=$# 215 | 216 | #check for jline terminal fixes on cygwin 217 | if is_cygwin; then 218 | stty -icanon min 1 -echo > /dev/null 2>&1 219 | addJava "-Djline.terminal=jline.UnixTerminal" 220 | addJava "-Dsbt.cygwin=true" 221 | fi 222 | 223 | # run sbt 224 | execRunner "$java_cmd" \ 225 | "-Dactivator.home=$(make_url "$activator_home")" \ 226 | $(get_mem_opts $app_mem) \ 227 | ${java_opts[@]} \ 228 | ${java_args[@]} \ 229 | -jar "$app_launcher" \ 230 | "${app_commands[@]}" \ 231 | "${residual_args[@]}" 232 | 233 | local exit_code=$? 234 | if is_cygwin; then 235 | stty icanon echo > /dev/null 2>&1 236 | fi 237 | exit $exit_code 238 | } 239 | 240 | # Loads a configuration file full of default command line options for this script. 241 | loadConfigFile() { 242 | cat "$1" | sed '/^\#/d' 243 | } 244 | 245 | ### ------------------------------- ### 246 | ### Start of customized settings ### 247 | ### ------------------------------- ### 248 | usage() { 249 | cat < [options] 251 | 252 | Command: 253 | ui Start the Activator UI 254 | new [name] [template-id] Create a new project with [name] using template [template-id] 255 | list-templates Print all available template names 256 | -h | -help Print this message 257 | 258 | Options: 259 | -v | -verbose Make this runner chattier 260 | -d | -debug Set sbt log level to debug 261 | -mem Set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 262 | -jvm-debug Turn on JVM debugging, open at the given port. 263 | 264 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 265 | -java-home Alternate JAVA_HOME 266 | 267 | # jvm options and output control 268 | -Dkey=val Pass -Dkey=val directly to the java runtime 269 | -J-X Pass option -X directly to the java runtime 270 | (-J is stripped) 271 | 272 | # environment variables (read from context) 273 | JAVA_OPTS Environment variable, if unset uses "" 274 | SBT_OPTS Environment variable, if unset uses "" 275 | ACTIVATOR_OPTS Environment variable, if unset uses "" 276 | 277 | In the case of duplicated or conflicting options, the order above 278 | shows precedence: environment variables lowest, command line options highest. 
279 | EOM 280 | } 281 | 282 | ### ------------------------------- ### 283 | ### Main script ### 284 | ### ------------------------------- ### 285 | 286 | declare -a residual_args 287 | declare -a java_args 288 | declare -a app_commands 289 | declare -r real_script_path="$(realpath "$0")" 290 | declare -r activator_home="$(realpath "$(dirname "$real_script_path")")" 291 | declare -r app_version="1.3.5" 292 | 293 | declare -r app_launcher="${activator_home}/activator-launch-${app_version}.jar" 294 | declare -r script_name=activator 295 | java_cmd=$(get_java_cmd) 296 | declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" ) 297 | userhome="$HOME" 298 | if is_cygwin; then 299 | # cygwin sets home to something f-d up, set to real windows homedir 300 | userhome="$USERPROFILE" 301 | fi 302 | declare -r activator_user_home_dir="${userhome}/.activator" 303 | declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt" 304 | declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt" 305 | 306 | # Now check to see if it's a good enough version 307 | declare -r java_version=$("$java_cmd" -version 2>&1 | awk -F '"' '/version/ {print $2}') 308 | if [[ "$java_version" == "" ]]; then 309 | echo 310 | echo No java installations was detected. 311 | echo Please go to http://www.java.com/getjava/ and download 312 | echo 313 | exit 1 314 | elif [[ ! "$java_version" > "1.6" ]]; then 315 | echo 316 | echo The java installation you have is not up to date 317 | echo Activator requires at least version 1.6+, you have 318 | echo version $java_version 319 | echo 320 | echo Please go to http://www.java.com/getjava/ and download 321 | echo a valid Java Runtime and install before running Activator. 322 | echo 323 | exit 1 324 | fi 325 | 326 | # if configuration files exist, prepend their contents to the java args so it can be processed by this runner 327 | # a "versioned" config trumps one on the top level 328 | if [[ -f "$java_opts_config_version" ]]; then 329 | addConfigOpts $(loadConfigFile "$java_opts_config_version") 330 | elif [[ -f "$java_opts_config_home" ]]; then 331 | addConfigOpts $(loadConfigFile "$java_opts_config_home") 332 | fi 333 | 334 | run "$@" 335 | -------------------------------------------------------------------------------- /activator-launch-1.3.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xebia-functional/spark-on-lets-code/3df49fb88e4b7477d14a4aded89d163a24b30632/activator-launch-1.3.5.jar -------------------------------------------------------------------------------- /activator.bat: -------------------------------------------------------------------------------- 1 | @REM activator launcher script 2 | @REM 3 | @REM Environment: 4 | @REM In order for Activator to work you must have Java available on the classpath 5 | @REM JAVA_HOME - location of a JDK home dir (optional if java on path) 6 | @REM CFG_OPTS - JVM options (optional) 7 | @REM Configuration: 8 | @REM activatorconfig.txt found in the ACTIVATOR_HOME or ACTIVATOR_HOME/ACTIVATOR_VERSION 9 | @setlocal enabledelayedexpansion 10 | 11 | @echo off 12 | 13 | set "var1=%~1" 14 | if defined var1 ( 15 | if "%var1%"=="help" ( 16 | echo. 17 | echo Usage activator [options] [command] 18 | echo. 
19 | echo Commands: 20 | echo ui Start the Activator UI 21 | echo new [name] [template-id] Create a new project with [name] using template [template-id] 22 | echo list-templates Print all available template names 23 | echo help Print this message 24 | echo. 25 | echo Options: 26 | echo -jvm-debug [port] Turn on JVM debugging, open at the given port. Defaults to 9999 if no port given. 27 | echo. 28 | echo Environment variables ^(read from context^): 29 | echo JAVA_OPTS Environment variable, if unset uses "" 30 | echo SBT_OPTS Environment variable, if unset uses "" 31 | echo ACTIVATOR_OPTS Environment variable, if unset uses "" 32 | echo. 33 | echo Please note that in order for Activator to work you must have Java available on the classpath 34 | echo. 35 | goto :end 36 | ) 37 | ) 38 | 39 | if "%ACTIVATOR_HOME%"=="" ( 40 | set "ACTIVATOR_HOME=%~dp0" 41 | @REM remove trailing "\" from path 42 | set ACTIVATOR_HOME=!ACTIVATOR_HOME:~0,-1! 43 | ) 44 | 45 | set ERROR_CODE=0 46 | set APP_VERSION=1.3.5 47 | set ACTIVATOR_LAUNCH_JAR=activator-launch-%APP_VERSION%.jar 48 | 49 | rem Detect if we were double clicked, although theoretically A user could 50 | rem manually run cmd /c 51 | for %%x in (%cmdcmdline%) do if %%~x==/c set DOUBLECLICKED=1 52 | 53 | rem FIRST we load a config file of extra options (if there is one) 54 | set "CFG_FILE_HOME=%UserProfile%\.activator\activatorconfig.txt" 55 | set "CFG_FILE_VERSION=%UserProfile%\.activator\%APP_VERSION%\activatorconfig.txt" 56 | set CFG_OPTS= 57 | if exist %CFG_FILE_VERSION% ( 58 | FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_VERSION%") DO ( 59 | set DO_NOT_REUSE_ME=%%i 60 | rem ZOMG (Part #2) WE use !! here to delay the expansion of 61 | rem CFG_OPTS, otherwise it remains "" for this loop. 62 | set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! 63 | ) 64 | ) 65 | if "%CFG_OPTS%"=="" ( 66 | if exist %CFG_FILE_HOME% ( 67 | FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_HOME%") DO ( 68 | set DO_NOT_REUSE_ME=%%i 69 | rem ZOMG (Part #2) WE use !! here to delay the expansion of 70 | rem CFG_OPTS, otherwise it remains "" for this loop. 71 | set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! 72 | ) 73 | ) 74 | ) 75 | 76 | rem We use the value of the JAVACMD environment variable if defined 77 | set _JAVACMD=%JAVACMD% 78 | 79 | if "%_JAVACMD%"=="" ( 80 | if not "%JAVA_HOME%"=="" ( 81 | if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" 82 | 83 | rem if there is a java home set we make sure it is the first picked up when invoking 'java' 84 | SET "PATH=%JAVA_HOME%\bin;%PATH%" 85 | ) 86 | ) 87 | 88 | if "%_JAVACMD%"=="" set _JAVACMD=java 89 | 90 | rem Detect if this java is ok to use. 91 | for /F %%j in ('"%_JAVACMD%" -version 2^>^&1') do ( 92 | if %%~j==java set JAVAINSTALLED=1 93 | if %%~j==openjdk set JAVAINSTALLED=1 94 | ) 95 | 96 | rem Detect the same thing about javac 97 | if "%_JAVACCMD%"=="" ( 98 | if not "%JAVA_HOME%"=="" ( 99 | if exist "%JAVA_HOME%\bin\javac.exe" set "_JAVACCMD=%JAVA_HOME%\bin\javac.exe" 100 | ) 101 | ) 102 | if "%_JAVACCMD%"=="" set _JAVACCMD=javac 103 | for /F %%j in ('"%_JAVACCMD%" -version 2^>^&1') do ( 104 | if %%~j==javac set JAVACINSTALLED=1 105 | ) 106 | 107 | rem BAT has no logical or, so we do it OLD SCHOOL! Oppan Redmond Style 108 | set JAVAOK=true 109 | if not defined JAVAINSTALLED set JAVAOK=false 110 | if not defined JAVACINSTALLED set JAVAOK=false 111 | 112 | if "%JAVAOK%"=="false" ( 113 | echo. 114 | echo A Java JDK is not installed or can't be found. 
115 | if not "%JAVA_HOME%"=="" ( 116 | echo JAVA_HOME = "%JAVA_HOME%" 117 | ) 118 | echo. 119 | echo Please go to 120 | echo http://www.oracle.com/technetwork/java/javase/downloads/index.html 121 | echo and download a valid Java JDK and install before running Activator. 122 | echo. 123 | echo If you think this message is in error, please check 124 | echo your environment variables to see if "java.exe" and "javac.exe" are 125 | echo available via JAVA_HOME or PATH. 126 | echo. 127 | if defined DOUBLECLICKED pause 128 | exit /B 1 129 | ) 130 | 131 | rem Check what Java version is being used to determine what memory options to use 132 | for /f "tokens=3" %%g in ('java -version 2^>^&1 ^| findstr /i "version"') do ( 133 | set JAVA_VERSION=%%g 134 | ) 135 | 136 | rem Strips away the " characters 137 | set JAVA_VERSION=%JAVA_VERSION:"=% 138 | 139 | rem TODO Check if there are existing mem settings in JAVA_OPTS/CFG_OPTS and use those instead of the below 140 | for /f "delims=. tokens=1-3" %%v in ("%JAVA_VERSION%") do ( 141 | set MAJOR=%%v 142 | set MINOR=%%w 143 | set BUILD=%%x 144 | 145 | set META_SIZE=-XX:MetaspaceSize=64M -XX:MaxMetaspaceSize=256M 146 | if "!MINOR!" LSS "8" ( 147 | set META_SIZE=-XX:PermSize=64M -XX:MaxPermSize=256M 148 | ) 149 | 150 | set MEM_OPTS=!META_SIZE! 151 | ) 152 | 153 | rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. 154 | set _JAVA_OPTS=%JAVA_OPTS% 155 | if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% 156 | 157 | set DEBUG_OPTS= 158 | 159 | rem Loop through the arguments, building remaining args in args variable 160 | set args= 161 | :argsloop 162 | if not "%~1"=="" ( 163 | rem Checks if the argument contains "-D" and if true, adds argument 1 with 2 and puts an equal sign between them. 164 | rem This is done since batch considers "=" to be a delimiter so we need to circumvent this behavior with a small hack. 165 | set arg1=%~1 166 | if "!arg1:~0,2!"=="-D" ( 167 | set "args=%args% "%~1"="%~2"" 168 | shift 169 | shift 170 | goto argsloop 171 | ) 172 | 173 | if "%~1"=="-jvm-debug" ( 174 | if not "%~2"=="" ( 175 | rem This piece of magic somehow checks that an argument is a number 176 | for /F "delims=0123456789" %%i in ("%~2") do ( 177 | set var="%%i" 178 | ) 179 | if defined var ( 180 | rem Not a number, assume no argument given and default to 9999 181 | set JPDA_PORT=9999 182 | ) else ( 183 | rem Port was given, shift arguments 184 | set JPDA_PORT=%~2 185 | shift 186 | ) 187 | ) else ( 188 | set JPDA_PORT=9999 189 | ) 190 | shift 191 | 192 | set DEBUG_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=!JPDA_PORT! 193 | goto argsloop 194 | ) 195 | rem else 196 | set "args=%args% "%~1"" 197 | shift 198 | goto argsloop 199 | ) 200 | 201 | :run 202 | 203 | if "!args!"=="" ( 204 | if defined DOUBLECLICKED ( 205 | set CMDS="ui" 206 | ) else set CMDS=!args! 207 | ) else set CMDS=!args! 208 | 209 | rem We add a / in front, so we get file:///C: instead of file://C: 210 | rem Java considers the later a UNC path. 211 | rem We also attempt a solid effort at making it URI friendly. 212 | rem We don't even bother with UNC paths. 213 | set JAVA_FRIENDLY_HOME_1=/!ACTIVATOR_HOME:\=/! 214 | set JAVA_FRIENDLY_HOME=/!JAVA_FRIENDLY_HOME_1: =%%20! 
215 | 216 | rem Checks if the command contains spaces to know if it should be wrapped in quotes or not 217 | set NON_SPACED_CMD=%_JAVACMD: =% 218 | if "%_JAVACMD%"=="%NON_SPACED_CMD%" %_JAVACMD% %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\%ACTIVATOR_LAUNCH_JAR%" %CMDS% 219 | if NOT "%_JAVACMD%"=="%NON_SPACED_CMD%" "%_JAVACMD%" %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\%ACTIVATOR_LAUNCH_JAR%" %CMDS% 220 | 221 | if ERRORLEVEL 1 goto error 222 | goto end 223 | 224 | :error 225 | set ERROR_CODE=1 226 | 227 | :end 228 | 229 | @endlocal 230 | 231 | exit /B %ERROR_CODE% 232 | -------------------------------------------------------------------------------- /modules/api/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | spark-on { 2 | cassandraCQLPath = "/data/spark_on_spark.cql" 3 | windowSizeSeconds = 30 4 | slideDuration = 10 5 | filters = [ 6 | "scala", 7 | "akka", 8 | "spray", 9 | "play2", 10 | "playframework", 11 | "spark", 12 | "java", 13 | "python", 14 | "cassandra", 15 | "bigdata", 16 | "47 Degrees", 17 | "47Degrees", 18 | "47Deg", 19 | "programming", 20 | "lambda", 21 | "chicharrones", 22 | "cat", 23 | "dog"] 24 | spark.jars = ["./modules/api/target/scala-2.11/sparkOn-1.0.0.jar"] 25 | spark.jars = [${?SPARK_APP_JARS}] 26 | dateFormat: "yyyy_MM_dd_HH_mm" 27 | dateFormatSplitter: "_" 28 | } 29 | 30 | twitter { 31 | credentials { 32 | consumerKey = "" 33 | consumerKey = ${?CONSUMER_KEY} 34 | consumerSecret = "" 35 | consumerSecret = ${?CONSUMER_SECRET} 36 | accessToken = "" 37 | accessToken = ${?ACCESS_TOKEN} 38 | accessTokenSecret = "" 39 | accessTokenSecret = ${?ACCESS_TOKEN_SECRET} 40 | } 41 | } 42 | 43 | spark { 44 | master = "local[*]" 45 | master = ${?SPARK_MASTER_PORT_7077_TCP_ADDR} 46 | port = 7077 47 | port = ${?SPARK_MASTER_ENV_SPARK_MASTER_PORT} 48 | home = "/usr/local/spark" 49 | home = ${?SPARK_HOME} 50 | appName = "Spark On" 51 | checkpoint = "./checkpoint" 52 | checkpoint = ${?SPARK_CHECKPOINT} 53 | 54 | streaming.batch.interval = 10 55 | 56 | executor.memory = 2g 57 | cores.max = 2 58 | akka.heartbeat.interval = 100 59 | serializer = "org.apache.spark.serializer.KryoSerializer" 60 | 61 | cassandra { 62 | connection.host = [192.168.99.100] 63 | # Development: 64 | # connection.host = ${?CASSANDRA_HOSTS} 65 | # Production: 66 | connection.host = [${?CASSANDRA_HOSTS}] 67 | keyspace = "spark_on_topics" 68 | } 69 | } 70 | 71 | http { 72 | interface = "0.0.0.0" 73 | interface = ${?HTTP_INTERFACE} 74 | port = 8080 75 | port = ${?HTTP_PORT} 76 | } 77 | 78 | kafka { 79 | hosts = ["192.168.99.100:9092"] 80 | # Development: 81 | # hosts = ${?KAFKA_HOSTS} 82 | # Production: 83 | hosts = [${?KAFKA_HOSTS}] 84 | topics = "sparkOn.raw" 85 | topics = ${?KAFKA_TOPIC} 86 | 87 | zookeeper { 88 | host = "192.168.99.100" 89 | host = ${?ZOOKEEPER_PORT_2181_TCP_ADDR} 90 | port = 2181 91 | port = ${?ZOOKEEPER_ENV_ZOOKEEPER_PORT} 92 | } 93 | 94 | group.id = "sparkOn.group" 95 | topic.raw = "sparkOn.raw" 96 | 97 | producer { 98 | value.serializer = "org.apache.kafka.common.serialization.StringSerializer" 99 | key.serializer = "org.apache.kafka.common.serialization.StringSerializer" 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /modules/api/src/main/resources/log4j.properties: 
-------------------------------------------------------------------------------- 1 | # output messages into a rolling log file as well as stdout 2 | log4j.rootLogger=INFO,stdout 3 | 4 | # stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Avoid "no host ID found" when starting a fresh node 10 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 11 | 12 | # If running spark local, ignore block input exists warnings, which are expected. 13 | log4j.logger.org.apache.spark.storage.BlockManager=ERROR 14 | log4j.logger.com.datastax.spark.connector=INFO 15 | log4j.logger.org.apache.spark=WARN 16 | log4j.logger.com.datastax.driver.core=WARN -------------------------------------------------------------------------------- /modules/api/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 6 | 7 | 8 | 9 | 10 | ${log-file:-logs/api.log} 11 | 12 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /modules/api/src/main/scala/com/fortysevendeg/sparkon/api/http/ApiHttpService.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.api.http 2 | 3 | import akka.actor.ActorSystem 4 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._ 5 | import akka.http.scaladsl.model.StatusCodes._ 6 | import akka.http.scaladsl.model.ws.{Message, TextMessage} 7 | import akka.http.scaladsl.server.Directives._ 8 | import akka.stream.Materializer 9 | import akka.stream.scaladsl.{Flow, Keep, Sink, Source} 10 | import com.datastax.spark.connector.cql.CassandraConnector 11 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 12 | import com.fortysevendeg.sparkon.services.twitter._ 13 | import com.softwaremill.react.kafka.KafkaMessages._ 14 | import com.softwaremill.react.kafka.{ConsumerProperties, ReactiveKafka} 15 | import kafka.serializer.StringDecoder 16 | import org.apache.spark.SparkContext 17 | import org.apache.spark.streaming.{Seconds, StreamingContext, StreamingContextState} 18 | import org.reactivestreams.Publisher 19 | import org.slf4j.LoggerFactory 20 | 21 | import scala.concurrent.ExecutionContextExecutor 22 | 23 | case class Info(message: String) 24 | 25 | case class ApiStreamingRequest(recreateDatabaseSchema: Boolean, filters: List[String]) 26 | 27 | trait ApiHttpService extends Protocols { 28 | 29 | val logger = LoggerFactory.getLogger(this.getClass) 30 | 31 | implicit val system: ActorSystem 32 | implicit def executor: ExecutionContextExecutor 33 | implicit val materializer: Materializer 34 | implicit val sparkContext: SparkContext 35 | implicit val ssc: StreamingContext 36 | implicit val cassandraConnector: CassandraConnector 37 | implicit val twitterStreamingServices: TwitterStreamingServices 38 | 39 | val routes = { 40 | logRequestResult("web-socket-services") { 41 | pathPrefix("trending-topics") { 42 | get { 43 | handleWebsocketMessages(handler = kafkaServiceFlow) 44 | } 45 | } 46 | } ~ { 47 | logRequestResult("twitter-streaming-services") { 48 | 
pathPrefix("twitter-streaming") { 49 | get { 50 | complete { 51 | Info(message = ssc.getState() match { 52 | case StreamingContextState.INITIALIZED => "The streaming has been created, but not been started yet" 53 | case StreamingContextState.ACTIVE => "The streaming has been started and running" 54 | case StreamingContextState.STOPPED => "The streaming has been stopped" 55 | }) 56 | } 57 | } ~ 58 | post { 59 | implicit val apiStreamingRequestFormat = jsonFormat2(ApiStreamingRequest) 60 | entity(as[ApiStreamingRequest]) { request => 61 | complete { 62 | ssc.getState() match { 63 | case StreamingContextState.INITIALIZED => 64 | if (request.recreateDatabaseSchema) { 65 | twitterStreamingServices.createCassandraSchema 66 | } 67 | val filters = TwitterServices.getTrendingTopics ++ request.filters 68 | 69 | logger.info(s"Streaming Filters [${filters.mkString(",\n")}]") 70 | 71 | implicit val dsStream = twitterStreamingServices.createTwitterStream() 72 | twitterStreamingServices.ingestTweets(topics = filters, 73 | windowSize = Seconds(windowSizeSeconds), 74 | slideDuration = Seconds(slideDuration)) 75 | Info(message = "Started") 76 | case StreamingContextState.ACTIVE => 77 | BadRequest -> "The streaming has already started" 78 | case StreamingContextState.STOPPED => 79 | BadRequest -> "The streaming has already stopped" 80 | } 81 | } 82 | } 83 | } ~ 84 | delete { 85 | complete { 86 | ssc.getState() match { 87 | case StreamingContextState.INITIALIZED => 88 | Info(message = "The streaming has been created, but not been started yet") 89 | case StreamingContextState.ACTIVE => 90 | ssc.stop(stopSparkContext = false, stopGracefully = true) 91 | ssc.awaitTermination() 92 | Info(message = "The streaming has been stopped") 93 | case StreamingContextState.STOPPED => 94 | BadRequest -> "The streaming has already stopped" 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | def kafkaServiceFlow: Flow[Message, Message, _] = { 104 | 105 | val kafka = new ReactiveKafka() 106 | val publisher: Publisher[StringKafkaMessage] = 107 | kafka.consume( 108 | ConsumerProperties( 109 | brokerList = bootstrapServers, 110 | zooKeeperHost = s"$zookeeperHost:$zookeeperPort", 111 | topic = kafkaTopicRaw, 112 | groupId = kafkaGroupId, 113 | decoder = new StringDecoder() 114 | ) 115 | ) 116 | 117 | Flow.wrap(Sink.ignore, Source(publisher) map toMessage)(Keep.none) 118 | } 119 | 120 | def toMessage(t: KafkaMessage[String]) = TextMessage("Received: " + t.message) 121 | } 122 | -------------------------------------------------------------------------------- /modules/api/src/main/scala/com/fortysevendeg/sparkon/api/http/Boot.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.api.http 2 | 3 | import akka.actor.ActorSystem 4 | import akka.http.scaladsl.Http 5 | import akka.stream.ActorMaterializer 6 | import com.datastax.spark.connector.cql.CassandraConnector 7 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 8 | import com.fortysevendeg.sparkon.services.twitter.TwitterStreamingServices 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | import org.apache.spark.{SparkConf, SparkContext} 11 | 12 | object Boot extends App with ApiHttpService { 13 | 14 | val sparkConf = new SparkConf() 15 | .setMaster(sparkMaster) 16 | .setAppName(sparkAppName) 17 | .setSparkHome(sparkHome) 18 | .setJars(sparkOnJars) 19 | .set("spark.executor.memory", sparkExecutorMemory.toString) 20 | .set("spark.cores.max", sparkCoresMax.toString) 21 
| .set("spark.cassandra.connection.host", cassandraHosts) 22 | .set("spark.akka.heartbeat.interval", sparkAkkaHeartbeatInterval.toString) 23 | .set("spark.serializer", sparkSerializer) 24 | .set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory") 25 | .set("spark.executorEnv.kafkaBootstrapServers", bootstrapServers) 26 | .set("spark.executorEnv.kafkaProducerKeySerializer", kafkaProducerKeySerializer) 27 | .set("spark.executorEnv.kafkaProducerValueSerializer", kafkaProducerValueSerializer) 28 | .set("spark.streaming.backpressure.enabled", "true") 29 | 30 | override implicit val system = ActorSystem("ReactiveSparkOn") 31 | override implicit val executor = system.dispatcher 32 | override implicit val materializer = ActorMaterializer() 33 | override implicit val sparkContext = createSparkContext 34 | override implicit val ssc: StreamingContext = createStreamingContext(sparkContext) 35 | override implicit val cassandraConnector: CassandraConnector = CassandraConnector(sparkConf) 36 | override implicit val twitterStreamingServices = new TwitterStreamingServices {} 37 | 38 | Http().bindAndHandle(routes, interface, port) 39 | logger.info(s"Server started at http://$interface:$port") 40 | 41 | def createSparkContext: SparkContext = new SparkContext(sparkConf) 42 | 43 | def createStreamingContext(sparkContext: SparkContext): StreamingContext = 44 | new StreamingContext(sparkContext = sparkContext, batchDuration = Seconds(streamingBatchInterval)) 45 | } 46 | -------------------------------------------------------------------------------- /modules/api/src/main/scala/com/fortysevendeg/sparkon/api/http/Protocols.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.api.http 2 | 3 | import spray.json.DefaultJsonProtocol 4 | 5 | trait Protocols extends DefaultJsonProtocol { 6 | implicit val infoFormat = jsonFormat1(Info.apply) 7 | } -------------------------------------------------------------------------------- /modules/common/src/main/scala/com/fortysevendeg/sparkon/common/StaticValues.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.common 2 | 3 | object StaticValues { 4 | val javaNull = None.orNull 5 | } 6 | -------------------------------------------------------------------------------- /modules/common/src/main/scala/com/fortysevendeg/sparkon/common/config/ConfigRegistry.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.common.config 2 | 3 | import com.typesafe.config.{Config, ConfigFactory} 4 | 5 | import scala.collection.JavaConverters._ 6 | import scala.language.postfixOps 7 | 8 | case class TwitterAuth(consumerKey: String, 9 | consumerSecret: String, 10 | accessToken: String, 11 | accessTokenSecret: String) 12 | 13 | object ConfigRegistry { 14 | 15 | val config = ConfigFactory.load() 16 | 17 | // APP Configuration keys: 18 | 19 | lazy val sparkOnConfig = config.getConfig("spark-on") 20 | 21 | lazy val sparkOnFilters = sparkOnConfig.getStringList("filters").asScala.toSet 22 | lazy val windowSizeSeconds = sparkOnConfig.getLong("windowSizeSeconds") 23 | lazy val slideDuration = sparkOnConfig.getLong("slideDuration") 24 | lazy val cassandraCQLPath = sparkOnConfig.getString("cassandraCQLPath") 25 | lazy val sparkOnJars = sparkOnConfig.getStringList("spark.jars").asScala.toList 26 | lazy val dateFormat = sparkOnConfig.getString("dateFormat") 27 | lazy val 
dateFormatSplitter = sparkOnConfig.getString("dateFormatSplitter")
28 | 
29 |   // Twitter Configuration keys:
30 | 
31 |   lazy val twitterConfig = config.getConfig("twitter")
32 |   lazy val twitterCredentials = twitterConfig.getConfig("credentials")
33 | 
34 |   lazy val consumerKey = twitterCredentials.getString("consumerKey")
35 |   lazy val consumerSecret = twitterCredentials.getString("consumerSecret")
36 |   lazy val accessToken = twitterCredentials.getString("accessToken")
37 |   lazy val accessTokenSecret = twitterCredentials.getString("accessTokenSecret")
38 | 
39 |   lazy val twitterAuth = TwitterAuth(
40 |     consumerKey,
41 |     consumerSecret,
42 |     accessToken,
43 |     accessTokenSecret)
44 | 
45 |   // Spark Configuration keys:
46 | 
47 |   lazy val sparkMasterHost = getStringFromEnvOrConfig("spark.master")
48 |   lazy val sparkMasterPort = getStringFromEnvOrConfig("spark.port")
49 |   lazy val sparkMaster = sparkMasterHost.contains("local") match {
50 |     case true => sparkMasterHost
51 |     case _ => s"spark://$sparkMasterHost:$sparkMasterPort"
52 |   }
53 | 
54 |   lazy val sparkAppName = config.getString("spark.appName")
55 |   lazy val sparkHome = config.getString("spark.home")
56 |   lazy val sparkCheckpoint = config.getString("spark.checkpoint")
57 |   lazy val streamingBatchInterval = config.getLong("spark.streaming.batch.interval")
58 |   lazy val sparkExecutorMemory = config.getBytes("spark.executor.memory")
59 |   lazy val sparkCoresMax = getIntFromEnvOrConfig("spark.cores.max")
60 |   lazy val sparkSerializer = getStringFromEnvOrConfig("spark.serializer")
61 | 
62 |   lazy val sparkAkkaHeartbeatInterval = getIntFromEnvOrConfig("spark.akka.heartbeat.interval")
63 | 
64 |   // Cassandra Configuration keys:
65 | 
66 |   lazy val cassandraNodesValues: List[String] = List(sys.env.get(s"CASSANDRA_SEED_PORT_9160_TCP_ADDR")) ++ {
67 |     1 to 10 map { index =>
68 |       sys.env.get(s"CASSANDRA_SLAVE_${index}_PORT_9160_TCP_ADDR")
69 |     }
70 |   } flatten
71 | 
72 |   lazy val cassandraHosts = mkStringNodes(nodes = cassandraNodesValues,
73 |     propKey = "spark.cassandra.connection.host",
74 |     cfg = config,
75 |     configurationKeyList = "spark.cassandra.connection.host")
76 | 
77 |   lazy val sparkCassandraKeyspace: String = config.getString("spark.cassandra.keyspace")
78 | 
79 |   // APP HTTP Configuration keys:
80 | 
81 |   lazy val httpConfig = config.getConfig("http")
82 |   lazy val interface = httpConfig.getString("interface")
83 |   lazy val port = httpConfig.getInt("port")
84 | 
85 |   // Kafka Configuration keys:
86 | 
87 |   lazy val kafkaConfig = config.getConfig("kafka")
88 | 
89 |   lazy val kafkaNodesEnvVariables = 1 to 10 map { index =>
90 |     (sys.env.get(s"KAFKA_${index}_PORT_9092_TCP_ADDR"),
91 |       sys.env.get(s"KAFKA_${index}_PORT_9092_TCP_PORT"))
92 |   } toList
93 | 
94 |   lazy val kafkaNodesValues: List[String] = kafkaNodesEnvVariables flatMap {
95 |     case (Some(h), Some(p)) => Some(s"$h:$p")
96 |     case _ => None
97 |   }
98 | 
99 |   lazy val bootstrapServers = mkStringNodes(nodes = kafkaNodesValues,
100 |     propKey = "kafka.hosts",
101 |     cfg = kafkaConfig,
102 |     configurationKeyList = "hosts")
103 |   lazy val kafkaTopics = kafkaConfig.getString("topics").split(",").toSet
104 | 
105 |   lazy val zookeeperHost = kafkaConfig.getString("zookeeper.host")
106 |   lazy val zookeeperPort = kafkaConfig.getInt("zookeeper.port")
107 | 
108 |   lazy val kafkaGroupId = kafkaConfig.getString("group.id")
109 |   lazy val kafkaTopicRaw = kafkaConfig.getString("topic.raw")
110 | 
111 |   lazy val kafkaProducerKeySerializer = kafkaConfig.getString("producer.key.serializer")
112 |   lazy val 
kafkaProducerValueSerializer = kafkaConfig.getString("producer.value.serializer") 113 | 114 | // Helper methods: 115 | 116 | private[config] def getStringFromEnvOrConfig(configKey: String) = 117 | sys.props.get(configKey) getOrElse config.getString(configKey) 118 | 119 | private[config] def getIntFromEnvOrConfig(configKey: String) = 120 | sys.props.get(configKey) map (_.toInt) getOrElse config.getInt(configKey) 121 | 122 | private[config] def mkStringNodes(nodes: List[String], propKey: String, cfg: Config, configurationKeyList: String): String = 123 | if (nodes.nonEmpty) nodes.mkString(",") 124 | else sys.props.get(propKey) getOrElse { 125 | val hostList = cfg.getStringList(configurationKeyList).asScala 126 | hostList.mkString(",") 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /modules/persistence/src/main/resources/data/spark_on_spark.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS #KEYSPACE#; 2 | 3 | CREATE KEYSPACE #KEYSPACE# WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1}; 4 | 5 | CREATE TABLE #KEYSPACE#.streaming_tweets_by_day ( 6 | id text, 7 | user_id text, 8 | user_name text, 9 | user_screen_name text, 10 | created_timestamp text, 11 | created_day text, 12 | tweet_text text, 13 | lang text, 14 | retweet_count int, 15 | favorite_count int, 16 | latitude double, 17 | longitude double, 18 | PRIMARY KEY(created_day, id)); 19 | 20 | CREATE TABLE #KEYSPACE#.streaming_tweets_by_track ( 21 | track text, 22 | year int, 23 | month int, 24 | day int, 25 | hour int, 26 | minute int, 27 | count counter, 28 | PRIMARY KEY(track, year, month, day, hour, minute)) 29 | WITH CLUSTERING ORDER BY (year DESC, month DESC, day DESC, hour DESC, minute DESC); 30 | -------------------------------------------------------------------------------- /modules/persistence/src/main/scala/com/fortysevendeg/sparkon/persistence/schema/CassandraServices.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.persistence.schema 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import com.fortysevendeg.sparkon.persistence.schema.domain.PersistenceException 5 | import org.slf4j.LoggerFactory 6 | import scala.io.Source 7 | import scala.util.{Failure, Success, Try} 8 | 9 | trait CassandraServices extends Serializable { 10 | 11 | val logger = LoggerFactory.getLogger(this.getClass) 12 | 13 | val keyspacePattern = "#KEYSPACE#" 14 | 15 | def createSchema(keyspace: String, 16 | cassandraCQLPath: String)(implicit connector: CassandraConnector) = { 17 | 18 | val cqlStatements = Try { 19 | val url = getClass.getResource(cassandraCQLPath) 20 | val cql = Source.fromURL(url).mkString 21 | cql.split("\n\n").toList 22 | } 23 | 24 | cqlStatements match { 25 | case Success(cql) => 26 | val finalCQL = 27 | cql 28 | .filterNot(_.trim.isEmpty) 29 | .map(_.replaceAll(keyspacePattern, keyspace)) 30 | connector.withSessionDo { session => finalCQL foreach session.execute } 31 | case Failure(e) => 32 | logger.error("The Cassandra schema could not be loaded", e) 33 | throw PersistenceException(e.getMessage, Some(e)) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /modules/persistence/src/main/scala/com/fortysevendeg/sparkon/persistence/schema/domain/PersistenceException.scala: -------------------------------------------------------------------------------- 
1 | package com.fortysevendeg.sparkon.persistence.schema.domain 2 | 3 | case class PersistenceException(message: String, cause: Option[Throwable] = None) 4 | extends RuntimeException(message, cause.orNull) 5 | -------------------------------------------------------------------------------- /modules/persistence/src/main/scala/com/fortysevendeg/sparkon/persistence/schema/domain/TweetsModels.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.persistence.schema.domain 2 | 3 | case class TweetsByDay(id: String, 4 | userId: Long, userName: String, 5 | userScreenName: String, 6 | createdTimestamp: String, 7 | createdDay: String, 8 | tweetText: String, 9 | lang: String, 10 | retweetCount: Int, 11 | favoriteCount: Int, 12 | latitude: Option[Double], 13 | longitude: Option[Double]) 14 | 15 | case class TweetsByTrack( 16 | track: String, 17 | year: Int, 18 | month: Int, 19 | day: Int, 20 | hour: Int, 21 | minute: Int, 22 | count: Long) -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/TwitterReceiverActorStream.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import akka.actor.Actor 4 | import org.apache.spark.streaming.receiver.ActorHelper 5 | import twitter4j._ 6 | import twitter4j.auth.Authorization 7 | 8 | import scala.reflect.ClassTag 9 | 10 | class TwitterReceiverActorStream[T: ClassTag]( 11 | twitterAuth: Authorization, 12 | filters: List[String] 13 | ) extends Actor with ActorHelper { 14 | 15 | val twitterStream = new TwitterStreamFactory().getInstance(twitterAuth) 16 | val listener = new StatusListener() { 17 | 18 | def onStatus(status: Status) = self ! 
status 19 | def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) = {} 20 | def onTrackLimitationNotice(i: Int) = {} 21 | def onScrubGeo(l: Long, l1: Long) = {} 22 | def onStallWarning(stallWarning: StallWarning) = {} 23 | def onException(e: Exception) = e.printStackTrace() 24 | } 25 | 26 | override def preStart(): Unit = { 27 | twitterStream.addListener(listener) 28 | filters match { 29 | case Nil => twitterStream.sample() 30 | case _ => 31 | val query = new FilterQuery 32 | query.track(filters.toArray) 33 | twitterStream.filter(query) 34 | } 35 | } 36 | 37 | def receive = { 38 | case data => store(data.asInstanceOf[T]) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/TwitterServices.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 4 | import com.fortysevendeg.sparkon.services.twitter.domain._ 5 | import org.slf4j.LoggerFactory 6 | import scala.language.postfixOps 7 | import scala.util._ 8 | import twitter4j.{Twitter, TwitterFactory} 9 | import twitter4j.auth.OAuthAuthorization 10 | import twitter4j.conf.ConfigurationBuilder 11 | 12 | trait TwitterServices extends Serializable { 13 | 14 | val logger = LoggerFactory.getLogger(this.getClass) 15 | val woeid = 1 //Worldwide 16 | lazy val twitterClient: Twitter = 17 | new TwitterFactory().getInstance(buildAuthorization) 18 | 19 | def getTrendingTopics = { 20 | val trends = Try(twitterClient 21 | .trends() 22 | .getPlaceTrends(woeid) 23 | .getTrends 24 | .map(_.getName) 25 | .toSet) 26 | 27 | trends match { 28 | case Success(trendSet) => 29 | logger.info(s"Current Trending Topics => ${trendSet.mkString(", ")}") 30 | trendSet 31 | case Failure(e) => throw TwitterServiceException(e.getMessage(), e) 32 | } 33 | } 34 | 35 | def buildAuthorization = 36 | new OAuthAuthorization(new ConfigurationBuilder() 37 | .setOAuthConsumerKey(twitterAuth.consumerKey) 38 | .setOAuthConsumerSecret(twitterAuth.consumerSecret) 39 | .setOAuthAccessToken(twitterAuth.accessToken) 40 | .setOAuthAccessTokenSecret(twitterAuth.accessTokenSecret) 41 | .build()) 42 | } 43 | 44 | object TwitterServices extends TwitterServices 45 | -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/TwitterStreamingServices.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import java.util.Properties 4 | 5 | import akka.actor.Props 6 | import com.datastax.spark.connector.SomeColumns 7 | import com.datastax.spark.connector.cql.CassandraConnector 8 | import com.datastax.spark.connector.streaming._ 9 | import com.fortysevendeg.sparkon.common.StaticValues 10 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 11 | import com.fortysevendeg.sparkon.persistence.schema.CassandraServices 12 | import com.fortysevendeg.sparkon.persistence.schema.domain._ 13 | import com.fortysevendeg.sparkon.services.twitter.domain.Conversions._ 14 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 15 | import org.apache.spark.storage.StorageLevel 16 | import org.apache.spark.streaming.dstream.DStream 17 | import org.apache.spark.streaming.{Duration, StreamingContext} 
18 | import org.slf4j.LoggerFactory 19 | import twitter4j.Status 20 | import twitter4j.auth.OAuthAuthorization 21 | import twitter4j.conf.ConfigurationBuilder 22 | 23 | import scala.language.postfixOps 24 | 25 | trait TwitterStreamingServices extends Serializable { 26 | 27 | val logger = LoggerFactory.getLogger(this.getClass) 28 | val cassandraServices = new CassandraServices {} 29 | 30 | def createCassandraSchema(implicit cassandraConnector: CassandraConnector) = 31 | cassandraServices.createSchema(sparkCassandraKeyspace, cassandraCQLPath) 32 | 33 | def createTwitterStream( 34 | filters: List[String] = Nil, 35 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER)(implicit ssc: StreamingContext) = { 36 | val authorization = new OAuthAuthorization(new ConfigurationBuilder() 37 | .setOAuthConsumerKey(twitterAuth.consumerKey) 38 | .setOAuthConsumerSecret(twitterAuth.consumerSecret) 39 | .setOAuthAccessToken(twitterAuth.accessToken) 40 | .setOAuthAccessTokenSecret(twitterAuth.accessTokenSecret) 41 | .build()) 42 | 43 | ssc.actorStream[Status]( 44 | Props( 45 | new TwitterReceiverActorStream[Status]( 46 | twitterAuth = authorization, 47 | filters = filters)), 48 | "TwitterStreamingReceiverActor", 49 | storageLevel) 50 | } 51 | 52 | def ingestTweets(topics: Set[String], 53 | windowSize: Duration, 54 | slideDuration: Duration) 55 | (implicit ssc: StreamingContext, 56 | dsStream: DStream[Status]) = { 57 | 58 | val tweetsByDay: DStream[TweetsByDay] = getTweetsByDay(dsStream) 59 | 60 | val tweetsByTrack: DStream[TweetsByTrack] = getTweetsByTrack(dsStream, topics, windowSize, slideDuration) 61 | 62 | // tweetsByTrack -> kafka 63 | writeToKafka(tweetsByTrack) 64 | 65 | // tweetsByDay -> streaming_tweets_by_day 66 | tweetsByDay.saveToCassandra( 67 | sparkCassandraKeyspace, 68 | "streaming_tweets_by_day", 69 | SomeColumns( 70 | "id", 71 | "user_id", 72 | "user_name", 73 | "user_screen_name", 74 | "created_timestamp", 75 | "created_day", 76 | "tweet_text", 77 | "lang", 78 | "retweet_count", 79 | "favorite_count", 80 | "latitude", 81 | "longitude")) 82 | 83 | // tweetsByTrack -> streaming_tweets_by_track 84 | tweetsByTrack.saveToCassandra( 85 | sparkCassandraKeyspace, 86 | "streaming_tweets_by_track", 87 | SomeColumns( 88 | "track", 89 | "year", 90 | "month", 91 | "day", 92 | "hour", 93 | "minute", 94 | "count")) 95 | 96 | ssc.checkpoint(sparkCheckpoint) 97 | ssc.start() 98 | } 99 | 100 | def writeToKafka(dStream: DStream[TweetsByTrack]) = 101 | dStream.map(_.track).foreachRDD { rdd => 102 | rdd foreachPartition { partition => 103 | lazy val kafkaProducerParams = new Properties() 104 | 105 | val kafkaBootstrapServersFromEnv = sys.env.getOrElse("kafkaBootstrapServers", "") 106 | val kafkaProducerKeySerializerFromEnv = sys.env.getOrElse("kafkaProducerKeySerializer", "") 107 | val kafkaProducerValueSerializerFromEnv = sys.env.getOrElse("kafkaProducerValueSerializer", "") 108 | 109 | kafkaProducerParams.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, kafkaBootstrapServersFromEnv) 110 | kafkaProducerParams.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, kafkaProducerKeySerializerFromEnv) 111 | kafkaProducerParams.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, kafkaProducerValueSerializerFromEnv) 112 | val producer = new KafkaProducer[String, String](kafkaProducerParams) 113 | 114 | partition foreach { 115 | case m: String => 116 | val message = new ProducerRecord[String, String](kafkaTopicRaw, StaticValues.javaNull, m) 117 | producer.send(message) 118 | case _ => logger.warn("Unknown Partition 
Message!") 119 | } 120 | } 121 | } 122 | 123 | def getTweetsByDay(dsStream: DStream[Status]): DStream[TweetsByDay] = dsStream.map(toTweetsByDay) 124 | 125 | def getTweetsByTrack(dsStream: DStream[Status], 126 | topics: Set[String], 127 | windowSize: Duration, 128 | slideDuration: Duration): DStream[TweetsByTrack] = 129 | dsStream 130 | .flatMap(_.getText.toLowerCase.split( """\s+""")) 131 | .filter(topics.contains) 132 | .countByValueAndWindow(windowSize, slideDuration) 133 | .transform { 134 | (rdd, time) => 135 | val dateParts = formatTime(time, dateFormat) 136 | .split(dateFormatSplitter) 137 | .map(_.toInt) 138 | rdd map { 139 | case (track, count) => 140 | toTweetsByTrack(dateParts, track, count) 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/domain/Conversions.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter.domain 2 | 3 | import com.fortysevendeg.sparkon.persistence.schema.domain.{TweetsByDay, TweetsByTrack} 4 | import org.apache.spark.streaming.Time 5 | import org.joda.time.{DateTime, DateTimeZone} 6 | import twitter4j.Status 7 | 8 | object Conversions { 9 | 10 | def toTweetsByDay(statusRDD: Status): TweetsByDay = { 11 | val user = statusRDD.getUser 12 | val geoLocation = Option(statusRDD.getGeoLocation) 13 | TweetsByDay( 14 | id = statusRDD.getId.toString, 15 | userId = user.getId, 16 | userName = user.getName, 17 | userScreenName = user.getScreenName, 18 | createdTimestamp = formatMillis(user.getCreatedAt.getTime), 19 | createdDay = formatMillis(user.getCreatedAt.getTime, "yyyyMMdd"), 20 | tweetText = statusRDD.getText, 21 | lang = statusRDD.getLang, 22 | retweetCount = statusRDD.getRetweetCount, 23 | favoriteCount = statusRDD.getFavoriteCount, 24 | latitude = geoLocation map (_.getLatitude), 25 | longitude = geoLocation map (_.getLongitude)) 26 | } 27 | 28 | def toTweetsByTrack(dateParts: Array[Int], track: String, count: Long): TweetsByTrack = { 29 | TweetsByTrack( 30 | track = track, 31 | year = dateParts(0), 32 | month = dateParts(1), 33 | day = dateParts(2), 34 | hour = dateParts(3), 35 | minute = dateParts(4), 36 | count = count) 37 | } 38 | 39 | def formatTime(time: Time, format: String = "yyyyMMddHH:mm:ss.SSS"): String = 40 | formatMillis(time.milliseconds, format) 41 | 42 | def formatMillis(millis: Long, format: String = "yyyyMMddHH:mm:ss.SSS"): String = 43 | new DateTime(millis, DateTimeZone.UTC).toString(format) 44 | } -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/domain/TwitterServiceException.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter.domain 2 | 3 | case class TwitterServiceException(message: String, cause: Throwable) 4 | extends RuntimeException(message, cause) 5 | -------------------------------------------------------------------------------- /modules/test/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # output messages into a rolling log file as well as stdout 2 | log4j.rootLogger=INFO,stdout 3 | 4 | # stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | 
log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Avoid "no host ID found" when starting a fresh node 10 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 11 | 12 | # If running spark local, ignore block input exists warnings, which are expected. 13 | log4j.logger.org.apache.spark.storage.BlockManager=ERROR 14 | log4j.logger.com.datastax.spark.connector=INFO 15 | log4j.logger.org.apache.spark=WARN 16 | log4j.logger.com.datastax.driver.core=WARN -------------------------------------------------------------------------------- /modules/test/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 6 | 7 | 8 | 9 | 10 | ${log-file:-logs/api.log} 11 | 12 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /modules/test/src/test/resources/reference.conf: -------------------------------------------------------------------------------- 1 | spark-on { 2 | cassandraCQLPath = "/data/spark_on_spark.cql" 3 | windowSizeSeconds = 5 4 | filters = ["scala", "play", "akka", "spark" , "47", "global", "consulting"] 5 | spark.jars = ["./path/to/assembly.jar"] 6 | dateFormat: "yyyy_MM_dd_HH_mm" 7 | dateFormatSplitter: "_" 8 | } 9 | 10 | twitter { 11 | credentials { 12 | consumerKey = "" 13 | consumerSecret = "" 14 | accessToken = "" 15 | accessTokenSecret = "" 16 | } 17 | } 18 | 19 | spark { 20 | master = "local[*]" 21 | home = "/usr/local/spark" 22 | appName = "Spark On" 23 | checkpoint = "./checkpoint" 24 | 25 | streaming.batch.interval = 5 26 | 27 | executor.memory = 2g 28 | cores.max = 2 29 | akka.heartbeat.interval = 100 30 | serializer = "org.apache.spark.serializer.KryoSerializer" 31 | 32 | cassandra { 33 | connection.host = [localhost] 34 | keyspace = "spark_on_topics" 35 | } 36 | } 37 | 38 | http { 39 | interface = "0.0.0.0" 40 | port = 8080 41 | } 42 | 43 | kafka { 44 | hosts = ["localhost:9092"] 45 | topics = "sparkOn.raw" 46 | 47 | zookeeper { 48 | host = "localhost" 49 | port = 2181 50 | } 51 | 52 | group.id = "sparkOn.group" 53 | topic.raw = "sparkOn.raw" 54 | 55 | producer { 56 | value.serializer = "org.apache.kafka.common.serialization.StringSerializer" 57 | key.serializer = "org.apache.kafka.common.serialization.StringSerializer" 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/common/BaseServiceTest.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.common 2 | 3 | import org.specs2.mutable.Specification 4 | 5 | import scala.concurrent.duration.Duration 6 | import scala.concurrent.{Await, Future} 7 | 8 | trait BaseServiceTest extends Specification { 9 | 10 | def await[T](future: Future[T]) = Await.result(future, Duration.Inf) 11 | } 12 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/persistence/CassandraServicesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.persistence 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import 
com.fortysevendeg.sparkon.common.BaseServiceTest 5 | import com.fortysevendeg.sparkon.persistence.schema.CassandraServices 6 | import com.fortysevendeg.sparkon.persistence.schema.domain.PersistenceException 7 | import org.specs2.mock.Mockito 8 | 9 | class CassandraServicesSpec extends BaseServiceTest with Mockito { 10 | sequential 11 | 12 | "CassandraServices" should { 13 | "create the cassandra schema given a valid CQL Path" in { 14 | val cassandraServices: CassandraServices = new CassandraServices {} 15 | implicit val connector = mock[CassandraConnector] 16 | 17 | cassandraServices.createSchema( 18 | keyspace = "spark_on", 19 | cassandraCQLPath= "/data/spark_on_spark.cql") 20 | 21 | there was one(connector).withSessionDo(_ => "") 22 | } 23 | 24 | "throw an exception when the cassandra cql script is not valid" in { 25 | val cassandraServices: CassandraServices = new CassandraServices {} 26 | implicit val connector = mock[CassandraConnector] 27 | 28 | cassandraServices.createSchema( 29 | keyspace = "spark_on", 30 | cassandraCQLPath= "/wrong/path/to.cql") must throwA[PersistenceException] 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/services/twitter/TwitterReceiverActorStreamSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import akka.actor.ActorSystem 4 | import akka.testkit.{ImplicitSender, TestActorRef, TestKitBase} 5 | import com.fortysevendeg.sparkon.common.BaseServiceTest 6 | import org.specs2.mock.Mockito 7 | import scala.reflect.ClassTag 8 | import twitter4j.{Status, StatusListener, TwitterStream} 9 | import twitter4j.auth.Authorization 10 | 11 | class TwitterReceiverActorStreamSpec 12 | extends BaseServiceTest 13 | with TestKitBase 14 | with ImplicitSender 15 | with Mockito { 16 | 17 | val twitterStreamMock = mock[TwitterStream] 18 | 19 | class TwitterReceiverActorStreamStub[T: ClassTag]( 20 | twitterAuth: Authorization, filters: List[String]) 21 | extends TwitterReceiverActorStream[T](twitterAuth, filters) { 22 | override val twitterStream = twitterStreamMock 23 | override val listener = mock[StatusListener] 24 | } 25 | 26 | implicit lazy val system = ActorSystem() 27 | 28 | "TwitterReceiverActorStream Actor" should { 29 | "process all the actor streaming messages" in { 30 | 31 | val twitterAuth = mock[Authorization] 32 | val filters = List("scala", "play", "akka", "spark", "47") 33 | val status = mock[Status] 34 | val actorRef = TestActorRef( 35 | new TwitterReceiverActorStreamStub[Status](twitterAuth, filters)) 36 | 37 | actorRef ! 
status 38 | 39 | there was one(status) 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/services/twitter/TwitterServicesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import com.fortysevendeg.sparkon.common.BaseServiceTest 4 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry 5 | import com.fortysevendeg.sparkon.services.twitter.domain.TwitterServiceException 6 | import org.specs2.mock.Mockito 7 | import twitter4j._ 8 | import twitter4j.api.TrendsResources 9 | 10 | class TwitterServicesSpec 11 | extends BaseServiceTest 12 | with Mockito { 13 | 14 | trait TwitterServicesStub extends TwitterServices { 15 | override lazy val twitterClient = mock[Twitter] 16 | } 17 | 18 | "Twitter Services" should { 19 | "build a twitter4j client to fetch the current trending topics " in { 20 | val twitterServices = new TwitterServicesStub {} 21 | 22 | val filters = List("scala", "akka") 23 | 24 | val trends = mock[TrendsResources] 25 | val placeTrends = mock[Trends] 26 | val mockTrend1: Trend = mock[Trend] 27 | val mockTrend2: Trend = mock[Trend] 28 | val mockTrend3: Trend = mock[Trend] 29 | 30 | val trendsArray = Array(mockTrend1, mockTrend2, mockTrend3) 31 | 32 | mockTrend1.getName returns "scala" 33 | mockTrend2.getName returns "play" 34 | mockTrend3.getName returns "spark" 35 | 36 | twitterServices.twitterClient.trends() returns trends 37 | trends.getPlaceTrends(anyInt) returns placeTrends 38 | placeTrends.getTrends returns trendsArray 39 | 40 | val result = twitterServices.getTrendingTopics 41 | 42 | result.size must_== 3 43 | } 44 | 45 | "return a new custom exception when twitter4j.TwitterException is thrown" in { 46 | val twitterServices = new TwitterServicesStub {} 47 | 48 | val filters = List("scala", "akka") 49 | 50 | twitterServices.twitterClient.trends() throws new RuntimeException("something wrong") 51 | 52 | twitterServices.getTrendingTopics must throwA[TwitterServiceException] 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/services/twitter/TwitterStreamingServicesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import akka.actor.{ActorRef, ActorSystem} 4 | import akka.testkit.TestKitBase 5 | import com.datastax.spark.connector.cql.CassandraConnector 6 | import com.fortysevendeg.sparkon.common.BaseServiceTest 7 | import com.fortysevendeg.sparkon.persistence.schema.CassandraServices 8 | import com.fortysevendeg.sparkon.persistence.schema.domain.PersistenceException 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming.{Seconds, StreamingContext} 11 | import org.specs2.mock.Mockito 12 | import org.specs2.specification.Scope 13 | 14 | class TwitterStreamingServicesSpec 15 | extends BaseServiceTest 16 | with TestKitBase 17 | with Mockito { 18 | 19 | implicit lazy val system = ActorSystem() 20 | 21 | val batchDuration = Seconds(1) 22 | 23 | private val master: String = "local[4]" 24 | 25 | private val framework: String = this.getClass.getSimpleName 26 | 27 | implicit val ssc = new StreamingContext(master = master, appName = framework, batchDuration = batchDuration) 28 | 29 | trait CreateCassandraSchemaScope extends Scope { 
30 | 31 | val cassandraServicesMock = mock[CassandraServices] 32 | 33 | val twitterStreamingServices = new TwitterStreamingServicesStub {} 34 | 35 | implicit val connector = mock[CassandraConnector] 36 | 37 | class TwitterStreamingServicesStub extends TwitterStreamingServices { 38 | override val cassandraServices = cassandraServicesMock 39 | } 40 | } 41 | 42 | trait CreateTwitterStreamScope extends Scope { 43 | implicit val receiverActor = mock[ActorRef] 44 | 45 | val twitterStreamingServices = new TwitterStreamingServices {} 46 | } 47 | 48 | "TwitterStreamingServices.createCassandraSchema" should { 49 | 50 | "create Cassandra works fine doing pass through to persistence " + 51 | "module" in new CreateCassandraSchemaScope { 52 | 53 | twitterStreamingServices.createCassandraSchema(connector) 54 | 55 | there was one(cassandraServicesMock).createSchema(any, any)(any) 56 | } 57 | 58 | "create Cassandra returns a Persistence Exception when a " + 59 | "new exception is thrown" in new CreateCassandraSchemaScope { 60 | 61 | cassandraServicesMock.createSchema(any, any)(any) throws new PersistenceException("any message") 62 | 63 | twitterStreamingServices.createCassandraSchema(connector) must throwA[PersistenceException] 64 | } 65 | } 66 | 67 | "TwitterStreamingServices.createTwitterStream" should { 68 | 69 | "create a new twitter streaming for an empty filters set and using " + 70 | "the storageLevel default value" in new CreateTwitterStreamScope { 71 | 72 | twitterStreamingServices.createTwitterStream(filters = Nil) 73 | 74 | "All combinations looks good" must endWith("good") 75 | } 76 | 77 | "create a new twitter streaming for some filters and using " + 78 | "the storageLevel default value" in new CreateTwitterStreamScope { 79 | val filters = List("scala", "akka") 80 | 81 | twitterStreamingServices.createTwitterStream(filters = filters) 82 | 83 | "All combinations looks good" must endWith("good") 84 | } 85 | 86 | "create a new twitter streaming for some filters and a " + 87 | "specified storage level" in new CreateTwitterStreamScope { 88 | val filters = List("scala", "akka") 89 | 90 | twitterStreamingServices.createTwitterStream(filters = filters, 91 | storageLevel = StorageLevel.MEMORY_AND_DISK_SER_2) 92 | 93 | "All combinations looks good" must endWith("good") 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /project/Build.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbtdocker.DockerPlugin 3 | 4 | object Build extends Build with Settings with SettingsDocker with Dependencies { 5 | 6 | lazy val root = project 7 | .in(file(".")) 8 | .aggregate(common, persistence, services, api, test) 9 | 10 | lazy val common = project 11 | .in(file("modules/common")) 12 | .settings(projectSettings ++ commonDeps) 13 | 14 | lazy val persistence = project 15 | .in(file("modules/persistence")) 16 | .dependsOn(common % "test->test;compile->compile") 17 | .settings(projectSettings ++ persistenceDeps) 18 | 19 | lazy val services = project.in(file("modules/services")) 20 | .dependsOn( 21 | common % "test->test;compile->compile", 22 | persistence) 23 | .settings(projectSettings ++ servicesDeps) 24 | 25 | lazy val api = project.in(file("modules/api")) 26 | .enablePlugins(DockerPlugin) 27 | .dependsOn( 28 | common % "test->test;compile->compile", 29 | services) 30 | .settings(apiSettings ++ apiDeps) 31 | 32 | lazy val test = project.in(file("modules/test")) 33 | .dependsOn( 34 | common % 
"test->test;compile->compile", 35 | persistence % "test->test;compile->compile", 36 | services % "test->test;compile->compile", 37 | api % "test->test;compile->compile") 38 | .settings(projectSettings ++ testDeps) 39 | } 40 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | 4 | trait Dependencies extends Excludes { 5 | this: Build => 6 | 7 | val akkaActor = "com.typesafe.akka" %% "akka-actor" % V.akka 8 | val akkaHttp = "com.typesafe.akka" %% "akka-http-experimental" % V.akkaStreams 9 | val akkaHttpCore = "com.typesafe.akka" %% "akka-http-core-experimental" % V.akkaStreams 10 | val akkaHttpJson = "com.typesafe.akka" %% "akka-http-spray-json-experimental" % V.akkaStreams 11 | val akkaHttpXml = "com.typesafe.akka" %% "akka-http-xml-experimental" % V.akkaStreams 12 | val akkaHttpTestkit = "com.typesafe.akka" %% "akka-http-testkit-experimental" % V.akkaStreams 13 | val akkaParsing = "com.typesafe.akka" %% "akka-parsing-experimental" % V.akkaStreams 14 | val akkaRemote = "com.typesafe.akka" %% "akka-remote" % V.akka 15 | val akkaSlf4j = "com.typesafe.akka" %% "akka-slf4j" % V.akka 16 | val akkaStreams = "com.typesafe.akka" %% "akka-stream-experimental" % V.akkaStreams 17 | val akkaTestkit = "com.typesafe.akka" %% "akka-testkit" % V.akka 18 | val cassandraSpark = "com.datastax.spark" %% "spark-cassandra-connector" % V.cassandraSpark 19 | val config = "com.typesafe" % "config" % V.config 20 | val commonsCodec = "commons-codec" % "commons-codec" % V.commonsCodec 21 | val hadoopClient = "org.apache.hadoop" % "hadoop-client" % V.hadoopClient 22 | val jodaTime = "joda-time" % "joda-time" % V.jodaTime 23 | val jodaConvert = "org.joda" % "joda-convert" % V.jodaConvert 24 | val kafka = "org.apache.kafka" %% "kafka" % V.kafka 25 | val logback = "ch.qos.logback" % "logback-classic" % V.logback 26 | val phantomDsl = "com.websudos" %% "phantom-dsl" % V.phantom 27 | val phantomTestkit = "com.websudos" %% "phantom-testkit" % V.phantom 28 | val reactiveKafka = "com.softwaremill.reactivekafka" %% "reactive-kafka-core" % V.reactiveKafka 29 | val sparkCore = "org.apache.spark" %% "spark-core" % V.spark 30 | val sparkStreaming = "org.apache.spark" %% "spark-streaming" % V.spark 31 | val sparkStreamingKafka = "org.apache.spark" %% "spark-streaming-kafka" % V.spark 32 | val specs2Core = "org.specs2" %% "specs2-core" % V.specs2 33 | val specs2Mock = "org.specs2" %% "specs2-mock" % V.specs2 34 | val sprayHttp = "io.spray" %% "spray-http" % V.spray 35 | val sprayHttpx = "io.spray" %% "spray-httpx" % V.spray 36 | val sprayUtil = "io.spray" %% "spray-util" % V.spray 37 | val sprayClient = "io.spray" %% "spray-client" % V.spray 38 | val sprayCan = "io.spray" %% "spray-can" % V.spray 39 | val sprayCaching = "io.spray" %% "spray-caching" % V.spray 40 | val sprayRouting = "io.spray" %% "spray-routing" % V.spray 41 | val sprayJson = "io.spray" %% "spray-json" % V.sprayJson 42 | val sprayTestKit = "io.spray" %% "spray-testkit" % V.sprayJson 43 | val twitter4jCore = "org.twitter4j" % "twitter4j-core" % V.twitter4j 44 | val twitter4jStream = "org.twitter4j" % "twitter4j-stream" % V.twitter4j 45 | 46 | val baseDepts = Seq(specs2Core % "test", specs2Mock % "test") 47 | 48 | val commonDeps = Seq(libraryDependencies ++= Seq(config, logback)) 49 | 50 | val persistenceDeps = Seq(libraryDependencies ++= Seq( 51 | akkaRemote, 52 | akkaSlf4j, 53 | 
cassandraSpark exclude("org.apache.spark", "*"), 54 | phantomDsl, 55 | phantomTestkit, 56 | sparkCore exclude("org.spark-project.akka", "*"))) 57 | 58 | val servicesDeps = Seq(libraryDependencies ++= Seq( 59 | kafka exclusionsForKafka, 60 | sparkStreaming intransitive(), 61 | sparkStreamingKafka intransitive(), 62 | twitter4jCore, 63 | twitter4jStream, 64 | akkaTestkit % "test")) 65 | 66 | val testDeps = Seq(libraryDependencies ++= baseDepts ++ Seq( 67 | twitter4jCore, 68 | akkaHttpTestkit % "test")) 69 | 70 | val apiDeps = Seq(libraryDependencies ++= Seq( 71 | akkaHttp, 72 | akkaHttpCore, 73 | akkaHttpJson, 74 | akkaStreams, 75 | hadoopClient, 76 | reactiveKafka, 77 | sprayJson)) 78 | } 79 | -------------------------------------------------------------------------------- /project/Excludes.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | trait Excludes { 4 | 5 | implicit class Exclude(module: ModuleID) { 6 | 7 | def excludingLog4j: ModuleID = 8 | module excludeAll ExclusionRule("log4j") 9 | 10 | def excludingSlf4j: ModuleID = 11 | module excludeAll ExclusionRule("org.slf4j") 12 | 13 | def excludingGuava: ModuleID = 14 | module exclude("com.google.guava", "guava") 15 | 16 | def excludingSpark: ModuleID = 17 | module 18 | .excludingGuava 19 | .exclude("org.apache.spark", s"spark-core_${V.scala}") 20 | .exclude("org.apache.spark", s"spark-streaming_${V.scala}") 21 | .exclude("org.apache.spark", s"spark-sql_${V.scala}") 22 | .exclude("org.apache.spark", s"spark-streaming_${V.scala}") 23 | 24 | def excludingLogback: ModuleID = module 25 | .exclude("ch.qos.logback", "logback-classic") 26 | .exclude("ch.qos.logback", "logback-core") 27 | 28 | def excludingAkka: ModuleID = module 29 | .exclude("com.typesafe.akka", "akka-actor") 30 | 31 | def exclusionsForKafka: ModuleID = 32 | module 33 | .excludingLog4j 34 | .excludingSlf4j 35 | .exclude("com.sun.jmx", "jmxri") 36 | .exclude("com.sun.jdmk", "jmxtools") 37 | .exclude("net.sf.jopt-simple", "jopt-simple") 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /project/Settings.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbt.Keys._ 3 | import sbtassembly.AssemblyPlugin.autoImport._ 4 | import spray.revolver.RevolverPlugin.Revolver 5 | import sbtassembly.AssemblyPlugin._ 6 | import sbtassembly.MergeStrategy._ 7 | 8 | trait Settings { 9 | this: Build with SettingsDocker => 10 | 11 | lazy val projectSettings: Seq[Def.Setting[_]] = Seq( 12 | scalaVersion := V.scala, 13 | scalaVersion in ThisBuild := V.scala, 14 | organization := "com.fortysevendeg", 15 | organizationName := "47 Degrees", 16 | organizationHomepage := Some(new URL("http://47deg.com")), 17 | version := V.buildVersion, 18 | conflictWarning := ConflictWarning.disable, 19 | scalacOptions ++= Seq("-deprecation", "-unchecked", "-feature", "-Ywarn-unused-import"), 20 | javaOptions in Test ++= Seq("-XX:MaxPermSize=128m", "-Xms512m", "-Xmx512m"), 21 | ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) }, 22 | sbt.Keys.fork := true, 23 | publishMavenStyle := true, 24 | publishArtifact in(Test, packageSrc) := true, 25 | logLevel := Level.Info, 26 | resolvers ++= Seq( 27 | Resolver.mavenLocal, 28 | Resolver.defaultLocal, 29 | Classpaths.typesafeReleases, 30 | DefaultMavenRepository, 31 | Resolver.typesafeIvyRepo("snapshots"), 32 | Resolver.sonatypeRepo("releases"), 33 | Resolver.sonatypeRepo("snapshots"), 
34 | "Sonatype staging" at "http://oss.sonatype.org/content/repositories/staging", 35 | "Java.net Maven2 Repository" at "http://download.java.net/maven/2/", 36 | "Twitter Repository" at "http://maven.twttr.com", 37 | "mvnrepository" at "http://mvnrepository.com/artifact/", 38 | Resolver.bintrayRepo("scalaz", "releases"), 39 | Resolver.bintrayRepo("websudos", "oss-releases") 40 | ), 41 | doc in Compile <<= target.map(_ / "none"), 42 | unmanagedResourceDirectories in Compile <+= baseDirectory(_ / "src/main/scala") 43 | ) 44 | 45 | lazy val apiSettings = projectSettings ++ assemblySettings ++ Seq( 46 | scalaVersion in ThisBuild := V.scala, 47 | assemblyJarName in assembly := "sparkOn-1.0.0.jar", 48 | assembleArtifact in assemblyPackageScala := true, 49 | Keys.test in assembly := {}, 50 | assemblyMergeStrategy in assembly := { 51 | case "application.conf" => concat 52 | case "reference.conf" => concat 53 | case "unwanted.txt" => discard 54 | case entry => 55 | val oldStrategy = (assemblyMergeStrategy in assembly).value 56 | val mergeStrategy = oldStrategy(entry) 57 | mergeStrategy == deduplicate match { 58 | case true => first 59 | case _ => mergeStrategy 60 | } 61 | }, 62 | publishArtifact in(Test, packageBin) := false 63 | ) ++ Revolver.settings ++ dockerSettings 64 | } 65 | -------------------------------------------------------------------------------- /project/SettingsDocker.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | import sbtassembly.AssemblyPlugin.autoImport._ 4 | import sbtdocker.DockerPlugin.autoImport._ 5 | 6 | trait SettingsDocker { 7 | this: Build => 8 | 9 | lazy val dockerSettings = Seq( 10 | docker <<= docker dependsOn assembly, 11 | imageNames in docker := Seq(ImageName("47deg/sparkon")), 12 | dockerfile in docker := { 13 | val workingDir = s"/opt/sparkOn" 14 | val artifact = (assemblyOutputPath in assembly).value 15 | 16 | val artifactTargetPath = s"/opt/sparkOn/${artifact.name}" 17 | val sparkPath = "/usr/local/spark/assembly/target/scala-2.11/spark-assembly-1.5.1-hadoop2.4.0.jar" 18 | 19 | val mainclass = mainClass.in(Compile, packageBin).value.getOrElse(sys.error("Expected exactly one main class")) 20 | val classpathString = s"$sparkPath:$artifactTargetPath" 21 | 22 | new Dockerfile { 23 | // Base image 24 | from("47deg/spark:1.5.1") 25 | // Mantainer 26 | maintainer("47 Degrees", "juanpedro.m@47deg.com>") 27 | 28 | // Set working directory 29 | workDir(workingDir) 30 | 31 | // Add the JAR file 32 | add(artifact, artifactTargetPath) 33 | 34 | cmdRaw(s"java " + 35 | s"-verbose:gc " + 36 | s"-XX:+PrintGCDetails " + 37 | s"-XX:+PrintGCTimeStamps " + 38 | s"-Xmx2G " + 39 | s"-XX:MaxPermSize=1G -cp $classpathString $mainclass") 40 | } 41 | } 42 | ) 43 | } -------------------------------------------------------------------------------- /project/V.scala: -------------------------------------------------------------------------------- 1 | object V { 2 | 3 | // Build version 4 | val buildVersion = "1.0.0-SNAPSHOT" 5 | 6 | // Core Libs 7 | val akka = "2.3.12" 8 | val akkaStreams = "1.0" 9 | val cassandraSpark = "1.5.0-M1" 10 | val commonsCodec = "1.10" 11 | val config = "1.2.1" 12 | val hadoopClient = "2.4.0" 13 | val jodaConvert = "1.7" 14 | val jodaTime = "2.7" 15 | val json4s = "3.2.11" 16 | val kafka = "0.8.2.1" 17 | val logback = "1.1.3" 18 | val phantom = "1.8.12" 19 | val reactiveKafka = "0.8.1" 20 | val scala = "2.11.7" 21 | val scalaUri = "0.4.7" 22 | val scalaz = "7.1.2" 23 | val spark = 
"1.5.1" 24 | val spray = "1.3.3" 25 | val sprayJson = "1.3.2" 26 | val twitter4j = "4.0.3" 27 | 28 | // Testing libs 29 | val specs2 = "3.6.2" 30 | } 31 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Typesafe Repository" at "https://repo.typesafe.com/typesafe/releases/" 2 | 3 | resolvers += Classpaths.sbtPluginReleases 4 | 5 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.7.2") 6 | 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 8 | 9 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0") 10 | 11 | addSbtPlugin("com.codacy" % "sbt-codacy-coverage" % "1.1.0") 12 | 13 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.0.3") 14 | 15 | addSbtPlugin("se.marcuslonnberg" % "sbt-docker" % "1.2.0") -------------------------------------------------------------------------------- /scripts/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sbt ";project api;docker" 4 | 5 | cd scripts && docker-compose up -d 6 | 7 | sleep 60 8 | 9 | docker exec -t namenode /usr/local/hadoop/bin/hadoop fs -mkdir /checkpoint 10 | 11 | # Scaling out Spark: 12 | docker-compose scale spark_worker=2 13 | -------------------------------------------------------------------------------- /scripts/docker-compose.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: 47deg/zookeeper 3 | ports: 4 | - "2181:2181" 5 | kafka_1: 6 | image: 47deg/kafka 7 | ports: 8 | - "9092" 9 | links: 10 | - zookeeper:zk 11 | environment: 12 | KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 13 | volumes: 14 | - /var/run/docker.sock:/var/run/docker.sock 15 | kafka_2: 16 | image: 47deg/kafka 17 | ports: 18 | - "9092" 19 | links: 20 | - zookeeper:zk 21 | environment: 22 | KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 23 | volumes: 24 | - /var/run/docker.sock:/var/run/docker.sock 25 | kafka_3: 26 | image: 47deg/kafka 27 | ports: 28 | - "9092" 29 | links: 30 | - zookeeper:zk 31 | environment: 32 | KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 33 | volumes: 34 | - /var/run/docker.sock:/var/run/docker.sock 35 | opscenter: 36 | image: 47deg/opscenter 37 | ports: 38 | - "8888:8888" 39 | container_name: opscenter 40 | cassandra_seed: 41 | image: 47deg/cassandra 42 | ports: 43 | - "9042:9042" 44 | links: 45 | - opscenter 46 | container_name: cassandra_seed 47 | environment: 48 | - OPS_IP=opscenter 49 | cassandra_slave: 50 | image: 47deg/cassandra 51 | links: 52 | - opscenter 53 | - cassandra_seed 54 | environment: 55 | - OPS_IP=opscenter 56 | - SEED=cassandra_seed 57 | opscenter_checkpoint: 58 | image: 47deg/java8 59 | working_dir: /src 60 | volumes: 61 | - .:/src 62 | command: sh initOpscenter.sh 63 | links: 64 | - opscenter 65 | - cassandra_seed 66 | - cassandra_slave 67 | environment: 68 | - OPS_IP=opscenter 69 | - SEED=cassandra_seed 70 | - CASS_SLAVE=cassandra_slave 71 | - WAIT_SLEEP=10 72 | - WAIT_LOOPS=10 73 | spark_master: 74 | image: 47deg/spark:1.5.1 75 | ports: 76 | - "7077:7077" 77 | - "8080:8080" 78 | container_name: spark_master 79 | tty: true 80 | command: /start-master.sh 81 | spark_worker: 82 | image: 47deg/spark:1.5.1 83 | links: 84 | - 
spark_master 85 | command: /start-worker.sh 86 | namenode: 87 | image: 47deg/yarn-cluster 88 | working_dir: /usr/local/hadoop 89 | ports: 90 | - "8088:8088" 91 | - "50070:50070" 92 | - "50075:50075" 93 | container_name: namenode 94 | command: bash -c "/etc/bootstrap.sh -d -namenode" 95 | datanode: 96 | image: 47deg/yarn-cluster 97 | working_dir: /usr/local/hadoop 98 | links: 99 | - namenode 100 | command: /etc/bootstrap.sh -d -datanode 101 | sparkon: 102 | image: 47deg/sparkon 103 | ports: 104 | - "9090:9090" 105 | - "4040:4040" 106 | container_name: sparkon 107 | links: 108 | - spark_master 109 | - cassandra_seed 110 | - cassandra_slave 111 | - namenode 112 | - zookeeper 113 | - kafka_1 114 | - kafka_2 115 | - kafka_3 116 | env_file: sparkOn.env -------------------------------------------------------------------------------- /scripts/initOpscenter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export OPSCENTER_IP=$(getent hosts "$OPS_IP" | awk '{print $1 ; exit}') 4 | export SEED_IP=$(getent hosts "$SEED" | awk '{print $1 ; exit}') 5 | export CASS_SLAVE_IP=$(getent hosts "$CASS_SLAVE" | awk '{print $1 ; exit}') 6 | 7 | echo "OPSCENTER_IP = $OPSCENTER_IP" 8 | echo "SEED = $SEED_IP" 9 | echo "CASS_SLAVE = $CASS_SLAVE_IP" 10 | 11 | WAIT_COMMAND_COND= 12 | 13 | is_ready() { 14 | eval [ $(curl --write-out %{http_code} --silent --output /dev/null http://$OPSCENTER_IP:8888/cluster-configs) = 200 ] 15 | } 16 | 17 | # wait until is ready 18 | i=0 19 | while ! is_ready; do 20 | i=`expr $i + 1` 21 | if [ $i -ge $WAIT_LOOPS ]; then 22 | echo "$(date) - still not ready, giving up" 23 | exit 1 24 | fi 25 | echo "$(date) - waiting to be ready" 26 | sleep $WAIT_SLEEP 27 | done 28 | 29 | #start the script 30 | echo "Registering cluster with OpsCenter" 31 | curl \ 32 | http://${OPSCENTER_IP}:8888/cluster-configs \ 33 | -X POST \ 34 | -d \ 35 | "{ 36 | \"cassandra\": { 37 | \"seed_hosts\": \"$SEED_IP, $CASS_SLAVE_IP\" 38 | }, 39 | \"cassandra_metrics\": {}, 40 | \"jmx\": { 41 | \"port\": \"7199\" 42 | } 43 | }" > /dev/null -------------------------------------------------------------------------------- /scripts/sparkOn.env: -------------------------------------------------------------------------------- 1 | # Set Spark On ENV 2 | CASSANDRA_HOSTS=cassandra_seed 3 | SPARK_HOME=/usr/local/spark 4 | SPARK_CHECKPOINT=hdfs://namenode:9000/checkpoint 5 | CONSUMER_KEY= 6 | CONSUMER_SECRET= 7 | ACCESS_TOKEN= 8 | ACCESS_TOKEN_SECRET= 9 | SPARK_APP_JARS=/opt/sparkOn/sparkOn-1.0.0.jar 10 | KAFKA_HOSTS=scripts_kafka_1:9092 11 | KAFKA_TOPIC=sparkOn.raw 12 | ZOOKEEPER_HOST=zk 13 | ZOOKEEPER_PORT=2181 14 | HTTP_INTERFACE=0.0.0.0 15 | HTTP_PORT=9090 --------------------------------------------------------------------------------
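As a rough illustration of how the services above fit together, here is a minimal, hypothetical driver sketch; it is not a file from this repository. It builds a local StreamingContext, creates the Cassandra schema, opens the Twitter actor stream, and hands the resulting DStream to ingestTweets. The object name, master URL, app name, Cassandra host, window sizes, and tracking terms are illustrative assumptions; in the project these values are read from ConfigRegistry and the configuration files shown above.

// Hypothetical wiring sketch, not part of the repository above.
// The object name and all hard-coded values are illustrative assumptions;
// the project itself reads them from ConfigRegistry / its .conf files.
import com.datastax.spark.connector.cql.CassandraConnector
import com.fortysevendeg.sparkon.services.twitter.TwitterStreamingServices
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import twitter4j.Status

object StreamingDriverSketch extends TwitterStreamingServices {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")                               // assumption: local run
      .setAppName("Spark On")
      .set("spark.cassandra.connection.host", "localhost") // assumption: local Cassandra

    val sc = new SparkContext(conf)
    implicit val ssc = new StreamingContext(sc, Seconds(5))
    implicit val connector = CassandraConnector(conf)

    // 1. Create the keyspace and tables from the bundled CQL script.
    createCassandraSchema

    // 2. Open the Twitter receiver stream with some tracking terms
    //    (declared implicit so ingestTweets can pick it up).
    implicit val tweets: DStream[Status] =
      createTwitterStream(filters = List("scala", "akka", "spark"))

    // 3. Persist per-day and per-track aggregates to Cassandra and publish
    //    track counts to Kafka; ingestTweets also checkpoints and starts the
    //    StreamingContext, so only awaitTermination is left to do here.
    ingestTweets(
      topics = Set("scala", "akka", "spark"),
      windowSize = Seconds(60),
      slideDuration = Seconds(5))

    ssc.awaitTermination()
  }
}

In the docker-compose setup under scripts/, the equivalent wiring presumably runs inside the sparkon container from the assembled jar, with hosts, checkpoint directory, and Twitter credentials supplied through scripts/sparkOn.env rather than hard-coded as in this sketch.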