├── .gitignore ├── .travis.yml ├── README.md ├── activator ├── activator-launch-1.3.5.jar ├── activator.bat ├── modules ├── api │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ ├── log4j.properties │ │ └── logback.xml │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── api │ │ └── http │ │ ├── ApiHttpService.scala │ │ ├── Boot.scala │ │ └── Protocols.scala ├── common │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── common │ │ ├── StaticValues.scala │ │ └── config │ │ └── ConfigRegistry.scala ├── persistence │ └── src │ │ └── main │ │ ├── resources │ │ └── data │ │ │ └── spark_on_spark.cql │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── persistence │ │ └── schema │ │ ├── CassandraServices.scala │ │ └── domain │ │ ├── PersistenceException.scala │ │ └── TweetsModels.scala ├── services │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── fortysevendeg │ │ └── sparkon │ │ └── services │ │ └── twitter │ │ ├── TwitterReceiverActorStream.scala │ │ ├── TwitterServices.scala │ │ ├── TwitterStreamingServices.scala │ │ └── domain │ │ ├── Conversions.scala │ │ └── TwitterServiceException.scala └── test │ └── src │ └── test │ ├── resources │ ├── log4j.properties │ ├── logback.xml │ └── reference.conf │ └── scala │ └── com │ └── fortysevendeg │ └── sparkon │ ├── common │ └── BaseServiceTest.scala │ ├── persistence │ └── CassandraServicesSpec.scala │ └── services │ └── twitter │ ├── TwitterReceiverActorStreamSpec.scala │ ├── TwitterServicesSpec.scala │ └── TwitterStreamingServicesSpec.scala ├── project ├── Build.scala ├── Dependencies.scala ├── Excludes.scala ├── Settings.scala ├── SettingsDocker.scala ├── V.scala ├── build.properties └── plugins.sbt └── scripts ├── deploy.sh ├── docker-compose.yml ├── initOpscenter.sh └── sparkOn.env /.gitignore: -------------------------------------------------------------------------------- 1 | checkpoint 2 | logs 3 | target 4 | tmp 5 | .history 6 | dist 7 | /out 8 | /RUNNING_PID 9 | /.ivy* 10 | 11 | # sbt specific 12 | /.sbt 13 | .cache/ 14 | .history/ 15 | .lib/ 16 | dist/* 17 | target/ 18 | lib_managed/ 19 | src_managed/ 20 | project/boot/ 21 | project/plugins/project/ 22 | project/project/target 23 | project/project/project* 24 | project/target 25 | /.activator 26 | 27 | # Scala-IDE specific 28 | .scala_dependencies 29 | .worksheet 30 | 31 | #Eclipse specific 32 | .classpath 33 | .project 34 | .cache 35 | .settings/ 36 | 37 | #IntelliJ IDEA specific 38 | .idea/ 39 | /.idea_modules 40 | /.idea 41 | /*.iml 42 | 43 | #LevelDB specific 44 | journal/ 45 | snapshots/ 46 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.7 4 | script: 5 | - sbt ++$TRAVIS_SCALA_VERSION coverage test 6 | after_script: 7 | - sbt ++$TRAVIS_SCALA_VERSION coverageReport 8 | - sbt ++$TRAVIS_SCALA_VERSION coverageAggregate 9 | - sbt ++$TRAVIS_SCALA_VERSION codacyCoverage -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/47deg/spark-on-lets-code.svg?branch=master)](https://travis-ci.org/47deg/spark-on-lets-code) 2 | [![Codacy Badge](https://api.codacy.com/project/badge/a7ac855c47cc46ea80b6c69907415f5c)](https://www.codacy.com/app/47deg/spark-on-lets-code) 
3 | 
4 | # Spark On
5 | 
6 | This small Spark project provides the sample code discussed in the `Spark On` blog post series on the [47 Degrees Blog](http://www.47deg.com/blog/tags/sparkonletscode).
7 | 
8 | ## App Requirements
9 | 
10 | * Twitter credentials to connect to the Twitter API. Read more about them [here](https://dev.twitter.com/overview/documentation).
11 | * In this README.md file you will see the IP address `192.168.99.100`. If you are using [docker-machine](https://docs.docker.com/machine/), the `docker-machine ip` command returns your Docker host's IP address. Replace `192.168.99.100` with the IP address in your environment.
12 | * The whole infrastructure has been tested on an Apple MacBook Pro (2.7 GHz Intel Core i5, 16 GB 1867 MHz DDR3).
13 | 
14 | To start off, we need to define a few environment variables in this [config file](https://github.com/47deg/spark-on-lets-code/blob/master/scripts/sparkOn.env#L5).
15 | 
16 | ## Deploy Docker Infrastructure
17 | 
18 | ### Start Cluster
19 | 
20 | We've defined a bash script that deploys all of the cluster dependencies, including the Spark Streaming application, so we can bring everything up with:
21 | 
22 |     scripts/deploy.sh
23 | 
24 | By default, the infrastructure deployed will be:
25 | 
26 | - Spark Cluster:
27 |     - 1 Spark Master
28 |     - 2 Spark Worker nodes
29 | - Cassandra Cluster:
30 |     - 2 Cassandra Docker containers
31 |     - 1 Docker container with [DataStax OpsCenter](http://www.datastax.com/products/datastax-enterprise-visual-admin)
32 | - Kafka Cluster:
33 |     - 1 Docker container running Zookeeper
34 |     - 3 Docker containers running as Kafka brokers
35 | - Hadoop HDFS Cluster:
36 |     - 1 Docker container running as namenode
37 |     - 1 Docker container running as datanode
38 | - 1 Docker container for our Streaming App
39 | 
40 | ### Scaling Out Services
41 | 
42 | For instance, to increase the number of Spark Workers available:
43 | 
44 |     docker-compose scale spark_worker=5
45 | 
46 | ### Start the Streaming
47 | 
48 | If everything is running correctly, we can start the Twitter streaming as follows:
49 | 
50 |     curl -X "POST" "http://192.168.99.100:9090/twitter-streaming" \
51 |         -H "Content-Type: application/json" \
52 |         -d $'{
53 |       "recreateDatabaseSchema": true,
54 |       "filters": [
55 |         "lambda",
56 |         "scala",
57 |         "akka",
58 |         "spray",
59 |         "play2",
60 |         "playframework",
61 |         "spark",
62 |         "java",
63 |         "python",
64 |         "cassandra",
65 |         "bigdata",
66 |         "47 Degrees",
67 |         "47Degrees",
68 |         "47Deg",
69 |         "programming",
70 |         "chicharrones",
71 |         "cat",
72 |         "dog"
73 |       ]
74 |     }'
75 | 
76 | ### Connect to the Web Socket
77 | 
78 | For instance, you could use the [Simple WebSocket Client](https://goo.gl/8Jw6K) extension for Google Chrome, opening a connection to `ws://192.168.99.100:9090/trending-topics`.
79 | 
80 | ### Stop Cluster
81 | 
82 | We can stop the streaming gracefully before stopping the cluster:
83 | 
84 |     curl -X "DELETE" "http://192.168.99.100:9090/twitter-streaming"
85 | 
86 | And then, from the shell:
87 | 
88 |     cd scripts
89 |     docker-compose stop
90 |     docker-compose rm
91 | 
92 | # HTTP Application API - FORMAT: 1A
93 | 
94 | ## Spark Streaming Status Endpoint [/twitter-streaming]
95 | 
96 | Start, stop, and fetch the status of the Spark Streaming Context in the application. Note: once you have stopped the context, you cannot start it again.
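For example, assuming the same docker-machine IP used throughout this README, the current status can be checked with a plain GET request (an illustrative request; the possible responses are documented below):

    curl -X "GET" "http://192.168.99.100:9090/twitter-streaming"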
97 | 
98 | ### Get Streaming Status [GET]
99 | 
100 | + Response 200 (application/json)
101 | 
102 |         {
103 |             "message": "The streaming has been created, but not been started yet"
104 |         }
105 | 
106 | ### Start Streaming [POST]
107 | 
108 | This action allows you to start the Spark Streaming Context.
109 | 
110 | + Response 200 (application/json)
111 | 
112 |         {
113 |             "message": "Started"
114 |         }
115 | 
116 | + Response 400
117 | 
118 | ### Stop Streaming [DELETE]
119 | 
120 | This action allows you to stop the Spark Streaming Context.
121 | 
122 | + Response 200 (application/json)
123 | 
124 |         {
125 |             "message": "The streaming has been stopped"
126 |         }
127 | 
128 | + Response 400
129 | 
130 | ## WS Filtered Twitter Word Tracks [WS /trending-topics]
131 | 
132 | Open a WebSocket connection to receive each new filtered track word as it is found.
133 | 
134 | # License
135 | 
136 | Copyright (C) 2015 47 Degrees, LLC [http://47deg.com](http://47deg.com) [hello@47deg.com](mailto:hello@47deg.com)
137 | 
138 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
139 | 
140 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
141 | 
--------------------------------------------------------------------------------
/activator:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | ### ------------------------------- ###
4 | ### Helper methods for BASH scripts ###
5 | ### ------------------------------- ###
6 | 
7 | realpath () {
8 | (
9 |   TARGET_FILE="$1"
10 | 
11 |   cd "$(dirname "$TARGET_FILE")"
12 |   TARGET_FILE=$(basename "$TARGET_FILE")
13 | 
14 |   COUNT=0
15 |   while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
16 |   do
17 |     TARGET_FILE=$(readlink "$TARGET_FILE")
18 |     cd "$(dirname "$TARGET_FILE")"
19 |     TARGET_FILE=$(basename "$TARGET_FILE")
20 |     COUNT=$(($COUNT + 1))
21 |   done
22 | 
23 |   if [ "$TARGET_FILE" == "." -o "$TARGET_FILE" == ".." ]; then
24 |     cd "$TARGET_FILE"
25 |     TARGET_FILEPATH=
26 |   else
27 |     TARGET_FILEPATH=/$TARGET_FILE
28 |   fi
29 | 
30 |   # make sure we grab the actual windows path, instead of cygwin's path.
31 |   if ! is_cygwin; then
32 |     echo "$(pwd -P)/$TARGET_FILE"
33 |   else
34 |     echo $(cygwinpath "$(pwd -P)/$TARGET_FILE")
35 |   fi
36 | )
37 | }
38 | 
39 | # TODO - Do we need to detect msys?
40 | 
41 | # Uses uname to detect if we're in the odd cygwin environment.
42 | is_cygwin() {
43 |   local os=$(uname -s)
44 |   case "$os" in
45 |     CYGWIN*) return 0 ;;
46 |     *) return 1 ;;
47 |   esac
48 | }
49 | 
50 | # This can fix cygwin style /cygdrive paths so we get the
51 | # windows style paths.
52 | cygwinpath() {
53 |   local file="$1"
54 |   if is_cygwin; then
55 |     echo $(cygpath -w $file)
56 |   else
57 |     echo $file
58 |   fi
59 | }
60 | 
61 | # Make something URI friendly
62 | make_url() {
63 |   url="$1"
64 |   local nospaces=${url// /%20}
65 |   if is_cygwin; then
66 |     echo "/${nospaces//\\//}"
67 |   else
68 |     echo "$nospaces"
69 |   fi
70 | }
71 | 
72 | # Detect if we should use JAVA_HOME or just try PATH.
73 | get_java_cmd() { 74 | if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then 75 | echo "$JAVA_HOME/bin/java" 76 | else 77 | echo "java" 78 | fi 79 | } 80 | 81 | echoerr () { 82 | echo 1>&2 "$@" 83 | } 84 | vlog () { 85 | [[ $verbose || $debug ]] && echoerr "$@" 86 | } 87 | dlog () { 88 | [[ $debug ]] && echoerr "$@" 89 | } 90 | execRunner () { 91 | # print the arguments one to a line, quoting any containing spaces 92 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 93 | for arg; do 94 | if printf "%s\n" "$arg" | grep -q ' '; then 95 | printf "\"%s\"\n" "$arg" 96 | else 97 | printf "%s\n" "$arg" 98 | fi 99 | done 100 | echo "" 101 | } 102 | 103 | exec "$@" 104 | } 105 | addJava () { 106 | dlog "[addJava] arg = '$1'" 107 | java_args=( "${java_args[@]}" "$1" ) 108 | } 109 | addApp () { 110 | dlog "[addApp] arg = '$1'" 111 | sbt_commands=( "${app_commands[@]}" "$1" ) 112 | } 113 | addResidual () { 114 | dlog "[residual] arg = '$1'" 115 | residual_args=( "${residual_args[@]}" "$1" ) 116 | } 117 | addDebugger () { 118 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 119 | } 120 | addConfigOpts () { 121 | dlog "[addConfigOpts] arg = '$*'" 122 | for item in $* 123 | do 124 | addJava "$item" 125 | done 126 | } 127 | # a ham-fisted attempt to move some memory settings in concert 128 | # so they need not be messed around with individually. 129 | get_mem_opts () { 130 | local mem=${1:-1024} 131 | local meta=$(( $mem / 4 )) 132 | (( $meta > 256 )) || meta=256 133 | (( $meta < 1024 )) || meta=1024 134 | 135 | # default is to set memory options but this can be overridden by code section below 136 | memopts="-Xms${mem}m -Xmx${mem}m" 137 | if [[ "${java_version}" > "1.8" ]]; then 138 | extmemopts="-XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=${meta}m" 139 | else 140 | extmemopts="-XX:PermSize=64m -XX:MaxPermSize=${meta}m" 141 | fi 142 | 143 | if [[ "${java_opts}" == *-Xmx* ]] || [[ "${java_opts}" == *-Xms* ]] || [[ "${java_opts}" == *-XX:MaxPermSize* ]] || [[ "${java_opts}" == *-XX:ReservedCodeCacheSize* ]] || [[ "${java_opts}" == *-XX:MaxMetaspaceSize* ]]; then 144 | # if we detect any of these settings in ${java_opts} we need to NOT output our settings. 145 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 146 | memopts="" 147 | extmemopts="" 148 | fi 149 | 150 | echo "${memopts} ${extmemopts}" 151 | } 152 | require_arg () { 153 | local type="$1" 154 | local opt="$2" 155 | local arg="$3" 156 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 157 | die "$opt requires <$type> argument" 158 | fi 159 | } 160 | is_function_defined() { 161 | declare -f "$1" > /dev/null 162 | } 163 | 164 | # If we're *not* running in a terminal, and we don't have any arguments, then we need to add the 'ui' parameter 165 | detect_terminal_for_ui() { 166 | [[ ! -t 0 ]] && [[ "${#residual_args}" == "0" ]] && { 167 | addResidual "ui" 168 | } 169 | # SPECIAL TEST FOR MAC 170 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args}" == "0" ]] && { 171 | echo "Detected MAC OSX launched script...." 172 | echo "Swapping to UI" 173 | addResidual "ui" 174 | } 175 | } 176 | 177 | # Processes incoming arguments and places them in appropriate global variables. called by the run method. 
178 | process_args () { 179 | while [[ $# -gt 0 ]]; do 180 | case "$1" in 181 | -h|-help) usage; exit 1 ;; 182 | -v|-verbose) verbose=1 && shift ;; 183 | -d|-debug) debug=1 && shift ;; 184 | -mem) require_arg integer "$1" "$2" && app_mem="$2" && shift 2 ;; 185 | -jvm-debug) 186 | if echo "$2" | grep -E ^[0-9]+$ > /dev/null; then 187 | addDebugger "$2" && shift 188 | else 189 | addDebugger 9999 190 | fi 191 | shift ;; 192 | -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;; 193 | -D*) addJava "$1" && shift ;; 194 | -J*) addJava "${1:2}" && shift ;; 195 | *) addResidual "$1" && shift ;; 196 | esac 197 | done 198 | 199 | is_function_defined process_my_args && { 200 | myargs=("${residual_args[@]}") 201 | residual_args=() 202 | process_my_args "${myargs[@]}" 203 | } 204 | } 205 | 206 | # Actually runs the script. 207 | run() { 208 | # TODO - check for sane environment 209 | 210 | # process the combined args, then reset "$@" to the residuals 211 | process_args "$@" 212 | detect_terminal_for_ui 213 | set -- "${residual_args[@]}" 214 | argumentCount=$# 215 | 216 | #check for jline terminal fixes on cygwin 217 | if is_cygwin; then 218 | stty -icanon min 1 -echo > /dev/null 2>&1 219 | addJava "-Djline.terminal=jline.UnixTerminal" 220 | addJava "-Dsbt.cygwin=true" 221 | fi 222 | 223 | # run sbt 224 | execRunner "$java_cmd" \ 225 | "-Dactivator.home=$(make_url "$activator_home")" \ 226 | $(get_mem_opts $app_mem) \ 227 | ${java_opts[@]} \ 228 | ${java_args[@]} \ 229 | -jar "$app_launcher" \ 230 | "${app_commands[@]}" \ 231 | "${residual_args[@]}" 232 | 233 | local exit_code=$? 234 | if is_cygwin; then 235 | stty icanon echo > /dev/null 2>&1 236 | fi 237 | exit $exit_code 238 | } 239 | 240 | # Loads a configuration file full of default command line options for this script. 241 | loadConfigFile() { 242 | cat "$1" | sed '/^\#/d' 243 | } 244 | 245 | ### ------------------------------- ### 246 | ### Start of customized settings ### 247 | ### ------------------------------- ### 248 | usage() { 249 | cat < [options] 251 | 252 | Command: 253 | ui Start the Activator UI 254 | new [name] [template-id] Create a new project with [name] using template [template-id] 255 | list-templates Print all available template names 256 | -h | -help Print this message 257 | 258 | Options: 259 | -v | -verbose Make this runner chattier 260 | -d | -debug Set sbt log level to debug 261 | -mem Set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 262 | -jvm-debug Turn on JVM debugging, open at the given port. 263 | 264 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 265 | -java-home Alternate JAVA_HOME 266 | 267 | # jvm options and output control 268 | -Dkey=val Pass -Dkey=val directly to the java runtime 269 | -J-X Pass option -X directly to the java runtime 270 | (-J is stripped) 271 | 272 | # environment variables (read from context) 273 | JAVA_OPTS Environment variable, if unset uses "" 274 | SBT_OPTS Environment variable, if unset uses "" 275 | ACTIVATOR_OPTS Environment variable, if unset uses "" 276 | 277 | In the case of duplicated or conflicting options, the order above 278 | shows precedence: environment variables lowest, command line options highest. 
279 | EOM 280 | } 281 | 282 | ### ------------------------------- ### 283 | ### Main script ### 284 | ### ------------------------------- ### 285 | 286 | declare -a residual_args 287 | declare -a java_args 288 | declare -a app_commands 289 | declare -r real_script_path="$(realpath "$0")" 290 | declare -r activator_home="$(realpath "$(dirname "$real_script_path")")" 291 | declare -r app_version="1.3.5" 292 | 293 | declare -r app_launcher="${activator_home}/activator-launch-${app_version}.jar" 294 | declare -r script_name=activator 295 | java_cmd=$(get_java_cmd) 296 | declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" ) 297 | userhome="$HOME" 298 | if is_cygwin; then 299 | # cygwin sets home to something f-d up, set to real windows homedir 300 | userhome="$USERPROFILE" 301 | fi 302 | declare -r activator_user_home_dir="${userhome}/.activator" 303 | declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt" 304 | declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt" 305 | 306 | # Now check to see if it's a good enough version 307 | declare -r java_version=$("$java_cmd" -version 2>&1 | awk -F '"' '/version/ {print $2}') 308 | if [[ "$java_version" == "" ]]; then 309 | echo 310 | echo No java installations was detected. 311 | echo Please go to http://www.java.com/getjava/ and download 312 | echo 313 | exit 1 314 | elif [[ ! "$java_version" > "1.6" ]]; then 315 | echo 316 | echo The java installation you have is not up to date 317 | echo Activator requires at least version 1.6+, you have 318 | echo version $java_version 319 | echo 320 | echo Please go to http://www.java.com/getjava/ and download 321 | echo a valid Java Runtime and install before running Activator. 322 | echo 323 | exit 1 324 | fi 325 | 326 | # if configuration files exist, prepend their contents to the java args so it can be processed by this runner 327 | # a "versioned" config trumps one on the top level 328 | if [[ -f "$java_opts_config_version" ]]; then 329 | addConfigOpts $(loadConfigFile "$java_opts_config_version") 330 | elif [[ -f "$java_opts_config_home" ]]; then 331 | addConfigOpts $(loadConfigFile "$java_opts_config_home") 332 | fi 333 | 334 | run "$@" 335 | -------------------------------------------------------------------------------- /activator-launch-1.3.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xebia-functional/spark-on-lets-code/3df49fb88e4b7477d14a4aded89d163a24b30632/activator-launch-1.3.5.jar -------------------------------------------------------------------------------- /activator.bat: -------------------------------------------------------------------------------- 1 | @REM activator launcher script 2 | @REM 3 | @REM Environment: 4 | @REM In order for Activator to work you must have Java available on the classpath 5 | @REM JAVA_HOME - location of a JDK home dir (optional if java on path) 6 | @REM CFG_OPTS - JVM options (optional) 7 | @REM Configuration: 8 | @REM activatorconfig.txt found in the ACTIVATOR_HOME or ACTIVATOR_HOME/ACTIVATOR_VERSION 9 | @setlocal enabledelayedexpansion 10 | 11 | @echo off 12 | 13 | set "var1=%~1" 14 | if defined var1 ( 15 | if "%var1%"=="help" ( 16 | echo. 17 | echo Usage activator [options] [command] 18 | echo. 
19 | echo Commands: 20 | echo ui Start the Activator UI 21 | echo new [name] [template-id] Create a new project with [name] using template [template-id] 22 | echo list-templates Print all available template names 23 | echo help Print this message 24 | echo. 25 | echo Options: 26 | echo -jvm-debug [port] Turn on JVM debugging, open at the given port. Defaults to 9999 if no port given. 27 | echo. 28 | echo Environment variables ^(read from context^): 29 | echo JAVA_OPTS Environment variable, if unset uses "" 30 | echo SBT_OPTS Environment variable, if unset uses "" 31 | echo ACTIVATOR_OPTS Environment variable, if unset uses "" 32 | echo. 33 | echo Please note that in order for Activator to work you must have Java available on the classpath 34 | echo. 35 | goto :end 36 | ) 37 | ) 38 | 39 | if "%ACTIVATOR_HOME%"=="" ( 40 | set "ACTIVATOR_HOME=%~dp0" 41 | @REM remove trailing "\" from path 42 | set ACTIVATOR_HOME=!ACTIVATOR_HOME:~0,-1! 43 | ) 44 | 45 | set ERROR_CODE=0 46 | set APP_VERSION=1.3.5 47 | set ACTIVATOR_LAUNCH_JAR=activator-launch-%APP_VERSION%.jar 48 | 49 | rem Detect if we were double clicked, although theoretically A user could 50 | rem manually run cmd /c 51 | for %%x in (%cmdcmdline%) do if %%~x==/c set DOUBLECLICKED=1 52 | 53 | rem FIRST we load a config file of extra options (if there is one) 54 | set "CFG_FILE_HOME=%UserProfile%\.activator\activatorconfig.txt" 55 | set "CFG_FILE_VERSION=%UserProfile%\.activator\%APP_VERSION%\activatorconfig.txt" 56 | set CFG_OPTS= 57 | if exist %CFG_FILE_VERSION% ( 58 | FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_VERSION%") DO ( 59 | set DO_NOT_REUSE_ME=%%i 60 | rem ZOMG (Part #2) WE use !! here to delay the expansion of 61 | rem CFG_OPTS, otherwise it remains "" for this loop. 62 | set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! 63 | ) 64 | ) 65 | if "%CFG_OPTS%"=="" ( 66 | if exist %CFG_FILE_HOME% ( 67 | FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_HOME%") DO ( 68 | set DO_NOT_REUSE_ME=%%i 69 | rem ZOMG (Part #2) WE use !! here to delay the expansion of 70 | rem CFG_OPTS, otherwise it remains "" for this loop. 71 | set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! 72 | ) 73 | ) 74 | ) 75 | 76 | rem We use the value of the JAVACMD environment variable if defined 77 | set _JAVACMD=%JAVACMD% 78 | 79 | if "%_JAVACMD%"=="" ( 80 | if not "%JAVA_HOME%"=="" ( 81 | if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" 82 | 83 | rem if there is a java home set we make sure it is the first picked up when invoking 'java' 84 | SET "PATH=%JAVA_HOME%\bin;%PATH%" 85 | ) 86 | ) 87 | 88 | if "%_JAVACMD%"=="" set _JAVACMD=java 89 | 90 | rem Detect if this java is ok to use. 91 | for /F %%j in ('"%_JAVACMD%" -version 2^>^&1') do ( 92 | if %%~j==java set JAVAINSTALLED=1 93 | if %%~j==openjdk set JAVAINSTALLED=1 94 | ) 95 | 96 | rem Detect the same thing about javac 97 | if "%_JAVACCMD%"=="" ( 98 | if not "%JAVA_HOME%"=="" ( 99 | if exist "%JAVA_HOME%\bin\javac.exe" set "_JAVACCMD=%JAVA_HOME%\bin\javac.exe" 100 | ) 101 | ) 102 | if "%_JAVACCMD%"=="" set _JAVACCMD=javac 103 | for /F %%j in ('"%_JAVACCMD%" -version 2^>^&1') do ( 104 | if %%~j==javac set JAVACINSTALLED=1 105 | ) 106 | 107 | rem BAT has no logical or, so we do it OLD SCHOOL! Oppan Redmond Style 108 | set JAVAOK=true 109 | if not defined JAVAINSTALLED set JAVAOK=false 110 | if not defined JAVACINSTALLED set JAVAOK=false 111 | 112 | if "%JAVAOK%"=="false" ( 113 | echo. 114 | echo A Java JDK is not installed or can't be found. 
115 | if not "%JAVA_HOME%"=="" ( 116 | echo JAVA_HOME = "%JAVA_HOME%" 117 | ) 118 | echo. 119 | echo Please go to 120 | echo http://www.oracle.com/technetwork/java/javase/downloads/index.html 121 | echo and download a valid Java JDK and install before running Activator. 122 | echo. 123 | echo If you think this message is in error, please check 124 | echo your environment variables to see if "java.exe" and "javac.exe" are 125 | echo available via JAVA_HOME or PATH. 126 | echo. 127 | if defined DOUBLECLICKED pause 128 | exit /B 1 129 | ) 130 | 131 | rem Check what Java version is being used to determine what memory options to use 132 | for /f "tokens=3" %%g in ('java -version 2^>^&1 ^| findstr /i "version"') do ( 133 | set JAVA_VERSION=%%g 134 | ) 135 | 136 | rem Strips away the " characters 137 | set JAVA_VERSION=%JAVA_VERSION:"=% 138 | 139 | rem TODO Check if there are existing mem settings in JAVA_OPTS/CFG_OPTS and use those instead of the below 140 | for /f "delims=. tokens=1-3" %%v in ("%JAVA_VERSION%") do ( 141 | set MAJOR=%%v 142 | set MINOR=%%w 143 | set BUILD=%%x 144 | 145 | set META_SIZE=-XX:MetaspaceSize=64M -XX:MaxMetaspaceSize=256M 146 | if "!MINOR!" LSS "8" ( 147 | set META_SIZE=-XX:PermSize=64M -XX:MaxPermSize=256M 148 | ) 149 | 150 | set MEM_OPTS=!META_SIZE! 151 | ) 152 | 153 | rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. 154 | set _JAVA_OPTS=%JAVA_OPTS% 155 | if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% 156 | 157 | set DEBUG_OPTS= 158 | 159 | rem Loop through the arguments, building remaining args in args variable 160 | set args= 161 | :argsloop 162 | if not "%~1"=="" ( 163 | rem Checks if the argument contains "-D" and if true, adds argument 1 with 2 and puts an equal sign between them. 164 | rem This is done since batch considers "=" to be a delimiter so we need to circumvent this behavior with a small hack. 165 | set arg1=%~1 166 | if "!arg1:~0,2!"=="-D" ( 167 | set "args=%args% "%~1"="%~2"" 168 | shift 169 | shift 170 | goto argsloop 171 | ) 172 | 173 | if "%~1"=="-jvm-debug" ( 174 | if not "%~2"=="" ( 175 | rem This piece of magic somehow checks that an argument is a number 176 | for /F "delims=0123456789" %%i in ("%~2") do ( 177 | set var="%%i" 178 | ) 179 | if defined var ( 180 | rem Not a number, assume no argument given and default to 9999 181 | set JPDA_PORT=9999 182 | ) else ( 183 | rem Port was given, shift arguments 184 | set JPDA_PORT=%~2 185 | shift 186 | ) 187 | ) else ( 188 | set JPDA_PORT=9999 189 | ) 190 | shift 191 | 192 | set DEBUG_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=!JPDA_PORT! 193 | goto argsloop 194 | ) 195 | rem else 196 | set "args=%args% "%~1"" 197 | shift 198 | goto argsloop 199 | ) 200 | 201 | :run 202 | 203 | if "!args!"=="" ( 204 | if defined DOUBLECLICKED ( 205 | set CMDS="ui" 206 | ) else set CMDS=!args! 207 | ) else set CMDS=!args! 208 | 209 | rem We add a / in front, so we get file:///C: instead of file://C: 210 | rem Java considers the later a UNC path. 211 | rem We also attempt a solid effort at making it URI friendly. 212 | rem We don't even bother with UNC paths. 213 | set JAVA_FRIENDLY_HOME_1=/!ACTIVATOR_HOME:\=/! 214 | set JAVA_FRIENDLY_HOME=/!JAVA_FRIENDLY_HOME_1: =%%20! 
215 | 216 | rem Checks if the command contains spaces to know if it should be wrapped in quotes or not 217 | set NON_SPACED_CMD=%_JAVACMD: =% 218 | if "%_JAVACMD%"=="%NON_SPACED_CMD%" %_JAVACMD% %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\%ACTIVATOR_LAUNCH_JAR%" %CMDS% 219 | if NOT "%_JAVACMD%"=="%NON_SPACED_CMD%" "%_JAVACMD%" %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\%ACTIVATOR_LAUNCH_JAR%" %CMDS% 220 | 221 | if ERRORLEVEL 1 goto error 222 | goto end 223 | 224 | :error 225 | set ERROR_CODE=1 226 | 227 | :end 228 | 229 | @endlocal 230 | 231 | exit /B %ERROR_CODE% 232 | -------------------------------------------------------------------------------- /modules/api/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | spark-on { 2 | cassandraCQLPath = "/data/spark_on_spark.cql" 3 | windowSizeSeconds = 30 4 | slideDuration = 10 5 | filters = [ 6 | "scala", 7 | "akka", 8 | "spray", 9 | "play2", 10 | "playframework", 11 | "spark", 12 | "java", 13 | "python", 14 | "cassandra", 15 | "bigdata", 16 | "47 Degrees", 17 | "47Degrees", 18 | "47Deg", 19 | "programming", 20 | "lambda", 21 | "chicharrones", 22 | "cat", 23 | "dog"] 24 | spark.jars = ["./modules/api/target/scala-2.11/sparkOn-1.0.0.jar"] 25 | spark.jars = [${?SPARK_APP_JARS}] 26 | dateFormat: "yyyy_MM_dd_HH_mm" 27 | dateFormatSplitter: "_" 28 | } 29 | 30 | twitter { 31 | credentials { 32 | consumerKey = "" 33 | consumerKey = ${?CONSUMER_KEY} 34 | consumerSecret = "" 35 | consumerSecret = ${?CONSUMER_SECRET} 36 | accessToken = "" 37 | accessToken = ${?ACCESS_TOKEN} 38 | accessTokenSecret = "" 39 | accessTokenSecret = ${?ACCESS_TOKEN_SECRET} 40 | } 41 | } 42 | 43 | spark { 44 | master = "local[*]" 45 | master = ${?SPARK_MASTER_PORT_7077_TCP_ADDR} 46 | port = 7077 47 | port = ${?SPARK_MASTER_ENV_SPARK_MASTER_PORT} 48 | home = "/usr/local/spark" 49 | home = ${?SPARK_HOME} 50 | appName = "Spark On" 51 | checkpoint = "./checkpoint" 52 | checkpoint = ${?SPARK_CHECKPOINT} 53 | 54 | streaming.batch.interval = 10 55 | 56 | executor.memory = 2g 57 | cores.max = 2 58 | akka.heartbeat.interval = 100 59 | serializer = "org.apache.spark.serializer.KryoSerializer" 60 | 61 | cassandra { 62 | connection.host = [192.168.99.100] 63 | # Development: 64 | # connection.host = ${?CASSANDRA_HOSTS} 65 | # Production: 66 | connection.host = [${?CASSANDRA_HOSTS}] 67 | keyspace = "spark_on_topics" 68 | } 69 | } 70 | 71 | http { 72 | interface = "0.0.0.0" 73 | interface = ${?HTTP_INTERFACE} 74 | port = 8080 75 | port = ${?HTTP_PORT} 76 | } 77 | 78 | kafka { 79 | hosts = ["192.168.99.100:9092"] 80 | # Development: 81 | # hosts = ${?KAFKA_HOSTS} 82 | # Production: 83 | hosts = [${?KAFKA_HOSTS}] 84 | topics = "sparkOn.raw" 85 | topics = ${?KAFKA_TOPIC} 86 | 87 | zookeeper { 88 | host = "192.168.99.100" 89 | host = ${?ZOOKEEPER_PORT_2181_TCP_ADDR} 90 | port = 2181 91 | port = ${?ZOOKEEPER_ENV_ZOOKEEPER_PORT} 92 | } 93 | 94 | group.id = "sparkOn.group" 95 | topic.raw = "sparkOn.raw" 96 | 97 | producer { 98 | value.serializer = "org.apache.kafka.common.serialization.StringSerializer" 99 | key.serializer = "org.apache.kafka.common.serialization.StringSerializer" 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /modules/api/src/main/resources/log4j.properties: 
-------------------------------------------------------------------------------- 1 | # output messages into a rolling log file as well as stdout 2 | log4j.rootLogger=INFO,stdout 3 | 4 | # stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Avoid "no host ID found" when starting a fresh node 10 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 11 | 12 | # If running spark local, ignore block input exists warnings, which are expected. 13 | log4j.logger.org.apache.spark.storage.BlockManager=ERROR 14 | log4j.logger.com.datastax.spark.connector=INFO 15 | log4j.logger.org.apache.spark=WARN 16 | log4j.logger.com.datastax.driver.core=WARN -------------------------------------------------------------------------------- /modules/api/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 6 | 7 | 8 | 9 | 10 | ${log-file:-logs/api.log} 11 | 12 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /modules/api/src/main/scala/com/fortysevendeg/sparkon/api/http/ApiHttpService.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.api.http 2 | 3 | import akka.actor.ActorSystem 4 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._ 5 | import akka.http.scaladsl.model.StatusCodes._ 6 | import akka.http.scaladsl.model.ws.{Message, TextMessage} 7 | import akka.http.scaladsl.server.Directives._ 8 | import akka.stream.Materializer 9 | import akka.stream.scaladsl.{Flow, Keep, Sink, Source} 10 | import com.datastax.spark.connector.cql.CassandraConnector 11 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 12 | import com.fortysevendeg.sparkon.services.twitter._ 13 | import com.softwaremill.react.kafka.KafkaMessages._ 14 | import com.softwaremill.react.kafka.{ConsumerProperties, ReactiveKafka} 15 | import kafka.serializer.StringDecoder 16 | import org.apache.spark.SparkContext 17 | import org.apache.spark.streaming.{Seconds, StreamingContext, StreamingContextState} 18 | import org.reactivestreams.Publisher 19 | import org.slf4j.LoggerFactory 20 | 21 | import scala.concurrent.ExecutionContextExecutor 22 | 23 | case class Info(message: String) 24 | 25 | case class ApiStreamingRequest(recreateDatabaseSchema: Boolean, filters: List[String]) 26 | 27 | trait ApiHttpService extends Protocols { 28 | 29 | val logger = LoggerFactory.getLogger(this.getClass) 30 | 31 | implicit val system: ActorSystem 32 | implicit def executor: ExecutionContextExecutor 33 | implicit val materializer: Materializer 34 | implicit val sparkContext: SparkContext 35 | implicit val ssc: StreamingContext 36 | implicit val cassandraConnector: CassandraConnector 37 | implicit val twitterStreamingServices: TwitterStreamingServices 38 | 39 | val routes = { 40 | logRequestResult("web-socket-services") { 41 | pathPrefix("trending-topics") { 42 | get { 43 | handleWebsocketMessages(handler = kafkaServiceFlow) 44 | } 45 | } 46 | } ~ { 47 | logRequestResult("twitter-streaming-services") { 48 | 
pathPrefix("twitter-streaming") { 49 | get { 50 | complete { 51 | Info(message = ssc.getState() match { 52 | case StreamingContextState.INITIALIZED => "The streaming has been created, but not been started yet" 53 | case StreamingContextState.ACTIVE => "The streaming has been started and running" 54 | case StreamingContextState.STOPPED => "The streaming has been stopped" 55 | }) 56 | } 57 | } ~ 58 | post { 59 | implicit val apiStreamingRequestFormat = jsonFormat2(ApiStreamingRequest) 60 | entity(as[ApiStreamingRequest]) { request => 61 | complete { 62 | ssc.getState() match { 63 | case StreamingContextState.INITIALIZED => 64 | if (request.recreateDatabaseSchema) { 65 | twitterStreamingServices.createCassandraSchema 66 | } 67 | val filters = TwitterServices.getTrendingTopics ++ request.filters 68 | 69 | logger.info(s"Streaming Filters [${filters.mkString(",\n")}]") 70 | 71 | implicit val dsStream = twitterStreamingServices.createTwitterStream() 72 | twitterStreamingServices.ingestTweets(topics = filters, 73 | windowSize = Seconds(windowSizeSeconds), 74 | slideDuration = Seconds(slideDuration)) 75 | Info(message = "Started") 76 | case StreamingContextState.ACTIVE => 77 | BadRequest -> "The streaming has already started" 78 | case StreamingContextState.STOPPED => 79 | BadRequest -> "The streaming has already stopped" 80 | } 81 | } 82 | } 83 | } ~ 84 | delete { 85 | complete { 86 | ssc.getState() match { 87 | case StreamingContextState.INITIALIZED => 88 | Info(message = "The streaming has been created, but not been started yet") 89 | case StreamingContextState.ACTIVE => 90 | ssc.stop(stopSparkContext = false, stopGracefully = true) 91 | ssc.awaitTermination() 92 | Info(message = "The streaming has been stopped") 93 | case StreamingContextState.STOPPED => 94 | BadRequest -> "The streaming has already stopped" 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | def kafkaServiceFlow: Flow[Message, Message, _] = { 104 | 105 | val kafka = new ReactiveKafka() 106 | val publisher: Publisher[StringKafkaMessage] = 107 | kafka.consume( 108 | ConsumerProperties( 109 | brokerList = bootstrapServers, 110 | zooKeeperHost = s"$zookeeperHost:$zookeeperPort", 111 | topic = kafkaTopicRaw, 112 | groupId = kafkaGroupId, 113 | decoder = new StringDecoder() 114 | ) 115 | ) 116 | 117 | Flow.wrap(Sink.ignore, Source(publisher) map toMessage)(Keep.none) 118 | } 119 | 120 | def toMessage(t: KafkaMessage[String]) = TextMessage("Received: " + t.message) 121 | } 122 | -------------------------------------------------------------------------------- /modules/api/src/main/scala/com/fortysevendeg/sparkon/api/http/Boot.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.api.http 2 | 3 | import akka.actor.ActorSystem 4 | import akka.http.scaladsl.Http 5 | import akka.stream.ActorMaterializer 6 | import com.datastax.spark.connector.cql.CassandraConnector 7 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 8 | import com.fortysevendeg.sparkon.services.twitter.TwitterStreamingServices 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | import org.apache.spark.{SparkConf, SparkContext} 11 | 12 | object Boot extends App with ApiHttpService { 13 | 14 | val sparkConf = new SparkConf() 15 | .setMaster(sparkMaster) 16 | .setAppName(sparkAppName) 17 | .setSparkHome(sparkHome) 18 | .setJars(sparkOnJars) 19 | .set("spark.executor.memory", sparkExecutorMemory.toString) 20 | .set("spark.cores.max", sparkCoresMax.toString) 21 
| .set("spark.cassandra.connection.host", cassandraHosts) 22 | .set("spark.akka.heartbeat.interval", sparkAkkaHeartbeatInterval.toString) 23 | .set("spark.serializer", sparkSerializer) 24 | .set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory") 25 | .set("spark.executorEnv.kafkaBootstrapServers", bootstrapServers) 26 | .set("spark.executorEnv.kafkaProducerKeySerializer", kafkaProducerKeySerializer) 27 | .set("spark.executorEnv.kafkaProducerValueSerializer", kafkaProducerValueSerializer) 28 | .set("spark.streaming.backpressure.enabled", "true") 29 | 30 | override implicit val system = ActorSystem("ReactiveSparkOn") 31 | override implicit val executor = system.dispatcher 32 | override implicit val materializer = ActorMaterializer() 33 | override implicit val sparkContext = createSparkContext 34 | override implicit val ssc: StreamingContext = createStreamingContext(sparkContext) 35 | override implicit val cassandraConnector: CassandraConnector = CassandraConnector(sparkConf) 36 | override implicit val twitterStreamingServices = new TwitterStreamingServices {} 37 | 38 | Http().bindAndHandle(routes, interface, port) 39 | logger.info(s"Server started at http://$interface:$port") 40 | 41 | def createSparkContext: SparkContext = new SparkContext(sparkConf) 42 | 43 | def createStreamingContext(sparkContext: SparkContext): StreamingContext = 44 | new StreamingContext(sparkContext = sparkContext, batchDuration = Seconds(streamingBatchInterval)) 45 | } 46 | -------------------------------------------------------------------------------- /modules/api/src/main/scala/com/fortysevendeg/sparkon/api/http/Protocols.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.api.http 2 | 3 | import spray.json.DefaultJsonProtocol 4 | 5 | trait Protocols extends DefaultJsonProtocol { 6 | implicit val infoFormat = jsonFormat1(Info.apply) 7 | } -------------------------------------------------------------------------------- /modules/common/src/main/scala/com/fortysevendeg/sparkon/common/StaticValues.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.common 2 | 3 | object StaticValues { 4 | val javaNull = None.orNull 5 | } 6 | -------------------------------------------------------------------------------- /modules/common/src/main/scala/com/fortysevendeg/sparkon/common/config/ConfigRegistry.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.common.config 2 | 3 | import com.typesafe.config.{Config, ConfigFactory} 4 | 5 | import scala.collection.JavaConverters._ 6 | import scala.language.postfixOps 7 | 8 | case class TwitterAuth(consumerKey: String, 9 | consumerSecret: String, 10 | accessToken: String, 11 | accessTokenSecret: String) 12 | 13 | object ConfigRegistry { 14 | 15 | val config = ConfigFactory.load() 16 | 17 | // APP Configuration keys: 18 | 19 | lazy val sparkOnConfig = config.getConfig("spark-on") 20 | 21 | lazy val sparkOnFilters = sparkOnConfig.getStringList("filters").asScala.toSet 22 | lazy val windowSizeSeconds = sparkOnConfig.getLong("windowSizeSeconds") 23 | lazy val slideDuration = sparkOnConfig.getLong("slideDuration") 24 | lazy val cassandraCQLPath = sparkOnConfig.getString("cassandraCQLPath") 25 | lazy val sparkOnJars = sparkOnConfig.getStringList("spark.jars").asScala.toList 26 | lazy val dateFormat = sparkOnConfig.getString("dateFormat") 27 | lazy val 
dateFormatSplitter = sparkOnConfig.getString("dateFormatSplitter")
28 | 
29 |   // Twitter Configuration keys:
30 | 
31 |   lazy val twitterConfig = config.getConfig("twitter")
32 |   lazy val twitterCredentials = twitterConfig.getConfig("credentials")
33 | 
34 |   lazy val consumerKey = twitterCredentials.getString("consumerKey")
35 |   lazy val consumerSecret = twitterCredentials.getString("consumerSecret")
36 |   lazy val accessToken = twitterCredentials.getString("accessToken")
37 |   lazy val accessTokenSecret = twitterCredentials.getString("accessTokenSecret")
38 | 
39 |   lazy val twitterAuth = TwitterAuth(
40 |     consumerKey,
41 |     consumerSecret,
42 |     accessToken,
43 |     accessTokenSecret)
44 | 
45 |   // Spark Configuration keys:
46 | 
47 |   lazy val sparkMasterHost = getStringFromEnvOrConfig("spark.master")
48 |   lazy val sparkMasterPort = getStringFromEnvOrConfig("spark.port")
49 |   lazy val sparkMaster = sparkMasterHost.contains("local") match {
50 |     case true => sparkMasterHost
51 |     case _ => s"spark://$sparkMasterHost:$sparkMasterPort"
52 |   }
53 | 
54 |   lazy val sparkAppName = config.getString("spark.appName")
55 |   lazy val sparkHome = config.getString("spark.home")
56 |   lazy val sparkCheckpoint = config.getString("spark.checkpoint")
57 |   lazy val streamingBatchInterval = config.getLong("spark.streaming.batch.interval")
58 |   lazy val sparkExecutorMemory = config.getBytes("spark.executor.memory")
59 |   lazy val sparkCoresMax = getIntFromEnvOrConfig("spark.cores.max")
60 |   lazy val sparkSerializer = getStringFromEnvOrConfig("spark.serializer")
61 | 
62 |   lazy val sparkAkkaHeartbeatInterval = getIntFromEnvOrConfig("spark.akka.heartbeat.interval")
63 | 
64 |   // Cassandra Configuration keys:
65 | 
66 |   lazy val cassandraNodesValues: List[String] = List(sys.env.get(s"CASSANDRA_SEED_PORT_9160_TCP_ADDR")) ++ {
67 |     1 to 10 map { index =>
68 |       sys.env.get(s"CASSANDRA_SLAVE_${index}_PORT_9160_TCP_ADDR")
69 |     }
70 |   } flatten
71 | 
72 |   lazy val cassandraHosts = mkStringNodes(nodes = cassandraNodesValues,
73 |     propKey = "spark.cassandra.connection.host",
74 |     cfg = config,
75 |     configurationKeyList = "spark.cassandra.connection.host")
76 | 
77 |   lazy val sparkCassandraKeyspace: String = config.getString("spark.cassandra.keyspace")
78 | 
79 |   // APP HTTP Configuration keys:
80 | 
81 |   lazy val httpConfig = config.getConfig("http")
82 |   lazy val interface = httpConfig.getString("interface")
83 |   lazy val port = httpConfig.getInt("port")
84 | 
85 |   // Kafka Configuration keys:
86 | 
87 |   lazy val kafkaConfig = config.getConfig("kafka")
88 | 
89 |   lazy val kafkaNodesEnvVariables = 1 to 10 map { index =>
90 |     (sys.env.get(s"KAFKA_${index}_PORT_9092_TCP_ADDR"),
91 |       sys.env.get(s"KAFKA_${index}_PORT_9092_TCP_PORT"))
92 |   } toList
93 | 
94 |   lazy val kafkaNodesValues: List[String] = kafkaNodesEnvVariables flatMap {
95 |     case (Some(h), Some(p)) => Some(s"$h:$p")
96 |     case _ => None
97 |   }
98 | 
99 |   lazy val bootstrapServers = mkStringNodes(nodes = kafkaNodesValues,
100 |     propKey = "kafka.hosts",
101 |     cfg = kafkaConfig,
102 |     configurationKeyList = "hosts")
103 |   lazy val kafkaTopics = kafkaConfig.getString("topics").split(",").toSet
104 | 
105 |   lazy val zookeeperHost = kafkaConfig.getString("zookeeper.host")
106 |   lazy val zookeeperPort = kafkaConfig.getInt("zookeeper.port")
107 | 
108 |   lazy val kafkaGroupId = kafkaConfig.getString("group.id")
109 |   lazy val kafkaTopicRaw = kafkaConfig.getString("topic.raw")
110 | 
111 |   lazy val kafkaProducerKeySerializer = kafkaConfig.getString("producer.key.serializer")
112 |   lazy val 
kafkaProducerValueSerializer = kafkaConfig.getString("producer.value.serializer") 113 | 114 | // Helper methods: 115 | 116 | private[config] def getStringFromEnvOrConfig(configKey: String) = 117 | sys.props.get(configKey) getOrElse config.getString(configKey) 118 | 119 | private[config] def getIntFromEnvOrConfig(configKey: String) = 120 | sys.props.get(configKey) map (_.toInt) getOrElse config.getInt(configKey) 121 | 122 | private[config] def mkStringNodes(nodes: List[String], propKey: String, cfg: Config, configurationKeyList: String): String = 123 | if (nodes.nonEmpty) nodes.mkString(",") 124 | else sys.props.get(propKey) getOrElse { 125 | val hostList = cfg.getStringList(configurationKeyList).asScala 126 | hostList.mkString(",") 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /modules/persistence/src/main/resources/data/spark_on_spark.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS #KEYSPACE#; 2 | 3 | CREATE KEYSPACE #KEYSPACE# WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1}; 4 | 5 | CREATE TABLE #KEYSPACE#.streaming_tweets_by_day ( 6 | id text, 7 | user_id text, 8 | user_name text, 9 | user_screen_name text, 10 | created_timestamp text, 11 | created_day text, 12 | tweet_text text, 13 | lang text, 14 | retweet_count int, 15 | favorite_count int, 16 | latitude double, 17 | longitude double, 18 | PRIMARY KEY(created_day, id)); 19 | 20 | CREATE TABLE #KEYSPACE#.streaming_tweets_by_track ( 21 | track text, 22 | year int, 23 | month int, 24 | day int, 25 | hour int, 26 | minute int, 27 | count counter, 28 | PRIMARY KEY(track, year, month, day, hour, minute)) 29 | WITH CLUSTERING ORDER BY (year DESC, month DESC, day DESC, hour DESC, minute DESC); 30 | -------------------------------------------------------------------------------- /modules/persistence/src/main/scala/com/fortysevendeg/sparkon/persistence/schema/CassandraServices.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.persistence.schema 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import com.fortysevendeg.sparkon.persistence.schema.domain.PersistenceException 5 | import org.slf4j.LoggerFactory 6 | import scala.io.Source 7 | import scala.util.{Failure, Success, Try} 8 | 9 | trait CassandraServices extends Serializable { 10 | 11 | val logger = LoggerFactory.getLogger(this.getClass) 12 | 13 | val keyspacePattern = "#KEYSPACE#" 14 | 15 | def createSchema(keyspace: String, 16 | cassandraCQLPath: String)(implicit connector: CassandraConnector) = { 17 | 18 | val cqlStatements = Try { 19 | val url = getClass.getResource(cassandraCQLPath) 20 | val cql = Source.fromURL(url).mkString 21 | cql.split("\n\n").toList 22 | } 23 | 24 | cqlStatements match { 25 | case Success(cql) => 26 | val finalCQL = 27 | cql 28 | .filterNot(_.trim.isEmpty) 29 | .map(_.replaceAll(keyspacePattern, keyspace)) 30 | connector.withSessionDo { session => finalCQL foreach session.execute } 31 | case Failure(e) => 32 | logger.error("The Cassandra schema could not be loaded", e) 33 | throw PersistenceException(e.getMessage, Some(e)) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /modules/persistence/src/main/scala/com/fortysevendeg/sparkon/persistence/schema/domain/PersistenceException.scala: -------------------------------------------------------------------------------- 
1 | package com.fortysevendeg.sparkon.persistence.schema.domain 2 | 3 | case class PersistenceException(message: String, cause: Option[Throwable] = None) 4 | extends RuntimeException(message, cause.orNull) 5 | -------------------------------------------------------------------------------- /modules/persistence/src/main/scala/com/fortysevendeg/sparkon/persistence/schema/domain/TweetsModels.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.persistence.schema.domain 2 | 3 | case class TweetsByDay(id: String, 4 | userId: Long, userName: String, 5 | userScreenName: String, 6 | createdTimestamp: String, 7 | createdDay: String, 8 | tweetText: String, 9 | lang: String, 10 | retweetCount: Int, 11 | favoriteCount: Int, 12 | latitude: Option[Double], 13 | longitude: Option[Double]) 14 | 15 | case class TweetsByTrack( 16 | track: String, 17 | year: Int, 18 | month: Int, 19 | day: Int, 20 | hour: Int, 21 | minute: Int, 22 | count: Long) -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/TwitterReceiverActorStream.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import akka.actor.Actor 4 | import org.apache.spark.streaming.receiver.ActorHelper 5 | import twitter4j._ 6 | import twitter4j.auth.Authorization 7 | 8 | import scala.reflect.ClassTag 9 | 10 | class TwitterReceiverActorStream[T: ClassTag]( 11 | twitterAuth: Authorization, 12 | filters: List[String] 13 | ) extends Actor with ActorHelper { 14 | 15 | val twitterStream = new TwitterStreamFactory().getInstance(twitterAuth) 16 | val listener = new StatusListener() { 17 | 18 | def onStatus(status: Status) = self ! 
status 19 | def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) = {} 20 | def onTrackLimitationNotice(i: Int) = {} 21 | def onScrubGeo(l: Long, l1: Long) = {} 22 | def onStallWarning(stallWarning: StallWarning) = {} 23 | def onException(e: Exception) = e.printStackTrace() 24 | } 25 | 26 | override def preStart(): Unit = { 27 | twitterStream.addListener(listener) 28 | filters match { 29 | case Nil => twitterStream.sample() 30 | case _ => 31 | val query = new FilterQuery 32 | query.track(filters.toArray) 33 | twitterStream.filter(query) 34 | } 35 | } 36 | 37 | def receive = { 38 | case data => store(data.asInstanceOf[T]) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/TwitterServices.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 4 | import com.fortysevendeg.sparkon.services.twitter.domain._ 5 | import org.slf4j.LoggerFactory 6 | import scala.language.postfixOps 7 | import scala.util._ 8 | import twitter4j.{Twitter, TwitterFactory} 9 | import twitter4j.auth.OAuthAuthorization 10 | import twitter4j.conf.ConfigurationBuilder 11 | 12 | trait TwitterServices extends Serializable { 13 | 14 | val logger = LoggerFactory.getLogger(this.getClass) 15 | val woeid = 1 //Worldwide 16 | lazy val twitterClient: Twitter = 17 | new TwitterFactory().getInstance(buildAuthorization) 18 | 19 | def getTrendingTopics = { 20 | val trends = Try(twitterClient 21 | .trends() 22 | .getPlaceTrends(woeid) 23 | .getTrends 24 | .map(_.getName) 25 | .toSet) 26 | 27 | trends match { 28 | case Success(trendSet) => 29 | logger.info(s"Current Trending Topics => ${trendSet.mkString(", ")}") 30 | trendSet 31 | case Failure(e) => throw TwitterServiceException(e.getMessage(), e) 32 | } 33 | } 34 | 35 | def buildAuthorization = 36 | new OAuthAuthorization(new ConfigurationBuilder() 37 | .setOAuthConsumerKey(twitterAuth.consumerKey) 38 | .setOAuthConsumerSecret(twitterAuth.consumerSecret) 39 | .setOAuthAccessToken(twitterAuth.accessToken) 40 | .setOAuthAccessTokenSecret(twitterAuth.accessTokenSecret) 41 | .build()) 42 | } 43 | 44 | object TwitterServices extends TwitterServices 45 | -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/TwitterStreamingServices.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import java.util.Properties 4 | 5 | import akka.actor.Props 6 | import com.datastax.spark.connector.SomeColumns 7 | import com.datastax.spark.connector.cql.CassandraConnector 8 | import com.datastax.spark.connector.streaming._ 9 | import com.fortysevendeg.sparkon.common.StaticValues 10 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry._ 11 | import com.fortysevendeg.sparkon.persistence.schema.CassandraServices 12 | import com.fortysevendeg.sparkon.persistence.schema.domain._ 13 | import com.fortysevendeg.sparkon.services.twitter.domain.Conversions._ 14 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 15 | import org.apache.spark.storage.StorageLevel 16 | import org.apache.spark.streaming.dstream.DStream 17 | import org.apache.spark.streaming.{Duration, StreamingContext} 
18 | import org.slf4j.LoggerFactory 19 | import twitter4j.Status 20 | import twitter4j.auth.OAuthAuthorization 21 | import twitter4j.conf.ConfigurationBuilder 22 | 23 | import scala.language.postfixOps 24 | 25 | trait TwitterStreamingServices extends Serializable { 26 | 27 | val logger = LoggerFactory.getLogger(this.getClass) 28 | val cassandraServices = new CassandraServices {} 29 | 30 | def createCassandraSchema(implicit cassandraConnector: CassandraConnector) = 31 | cassandraServices.createSchema(sparkCassandraKeyspace, cassandraCQLPath) 32 | 33 | def createTwitterStream( 34 | filters: List[String] = Nil, 35 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER)(implicit ssc: StreamingContext) = { 36 | val authorization = new OAuthAuthorization(new ConfigurationBuilder() 37 | .setOAuthConsumerKey(twitterAuth.consumerKey) 38 | .setOAuthConsumerSecret(twitterAuth.consumerSecret) 39 | .setOAuthAccessToken(twitterAuth.accessToken) 40 | .setOAuthAccessTokenSecret(twitterAuth.accessTokenSecret) 41 | .build()) 42 | 43 | ssc.actorStream[Status]( 44 | Props( 45 | new TwitterReceiverActorStream[Status]( 46 | twitterAuth = authorization, 47 | filters = filters)), 48 | "TwitterStreamingReceiverActor", 49 | storageLevel) 50 | } 51 | 52 | def ingestTweets(topics: Set[String], 53 | windowSize: Duration, 54 | slideDuration: Duration) 55 | (implicit ssc: StreamingContext, 56 | dsStream: DStream[Status]) = { 57 | 58 | val tweetsByDay: DStream[TweetsByDay] = getTweetsByDay(dsStream) 59 | 60 | val tweetsByTrack: DStream[TweetsByTrack] = getTweetsByTrack(dsStream, topics, windowSize, slideDuration) 61 | 62 | // tweetsByTrack -> kafka 63 | writeToKafka(tweetsByTrack) 64 | 65 | // tweetsByDay -> streaming_tweets_by_day 66 | tweetsByDay.saveToCassandra( 67 | sparkCassandraKeyspace, 68 | "streaming_tweets_by_day", 69 | SomeColumns( 70 | "id", 71 | "user_id", 72 | "user_name", 73 | "user_screen_name", 74 | "created_timestamp", 75 | "created_day", 76 | "tweet_text", 77 | "lang", 78 | "retweet_count", 79 | "favorite_count", 80 | "latitude", 81 | "longitude")) 82 | 83 | // tweetsByTrack -> streaming_tweets_by_track 84 | tweetsByTrack.saveToCassandra( 85 | sparkCassandraKeyspace, 86 | "streaming_tweets_by_track", 87 | SomeColumns( 88 | "track", 89 | "year", 90 | "month", 91 | "day", 92 | "hour", 93 | "minute", 94 | "count")) 95 | 96 | ssc.checkpoint(sparkCheckpoint) 97 | ssc.start() 98 | } 99 | 100 | def writeToKafka(dStream: DStream[TweetsByTrack]) = 101 | dStream.map(_.track).foreachRDD { rdd => 102 | rdd foreachPartition { partition => 103 | lazy val kafkaProducerParams = new Properties() 104 | 105 | val kafkaBootstrapServersFromEnv = sys.env.getOrElse("kafkaBootstrapServers", "") 106 | val kafkaProducerKeySerializerFromEnv = sys.env.getOrElse("kafkaProducerKeySerializer", "") 107 | val kafkaProducerValueSerializerFromEnv = sys.env.getOrElse("kafkaProducerValueSerializer", "") 108 | 109 | kafkaProducerParams.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, kafkaBootstrapServersFromEnv) 110 | kafkaProducerParams.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, kafkaProducerKeySerializerFromEnv) 111 | kafkaProducerParams.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, kafkaProducerValueSerializerFromEnv) 112 | val producer = new KafkaProducer[String, String](kafkaProducerParams) 113 | 114 | partition foreach { 115 | case m: String => 116 | val message = new ProducerRecord[String, String](kafkaTopicRaw, StaticValues.javaNull, m) 117 | producer.send(message) 118 | case _ => logger.warn("Unknown Partition 
Message!") 119 | } 120 | } 121 | } 122 | 123 | def getTweetsByDay(dsStream: DStream[Status]): DStream[TweetsByDay] = dsStream.map(toTweetsByDay) 124 | 125 | def getTweetsByTrack(dsStream: DStream[Status], 126 | topics: Set[String], 127 | windowSize: Duration, 128 | slideDuration: Duration): DStream[TweetsByTrack] = 129 | dsStream 130 | .flatMap(_.getText.toLowerCase.split( """\s+""")) 131 | .filter(topics.contains) 132 | .countByValueAndWindow(windowSize, slideDuration) 133 | .transform { 134 | (rdd, time) => 135 | val dateParts = formatTime(time, dateFormat) 136 | .split(dateFormatSplitter) 137 | .map(_.toInt) 138 | rdd map { 139 | case (track, count) => 140 | toTweetsByTrack(dateParts, track, count) 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/domain/Conversions.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter.domain 2 | 3 | import com.fortysevendeg.sparkon.persistence.schema.domain.{TweetsByDay, TweetsByTrack} 4 | import org.apache.spark.streaming.Time 5 | import org.joda.time.{DateTime, DateTimeZone} 6 | import twitter4j.Status 7 | 8 | object Conversions { 9 | 10 | def toTweetsByDay(statusRDD: Status): TweetsByDay = { 11 | val user = statusRDD.getUser 12 | val geoLocation = Option(statusRDD.getGeoLocation) 13 | TweetsByDay( 14 | id = statusRDD.getId.toString, 15 | userId = user.getId, 16 | userName = user.getName, 17 | userScreenName = user.getScreenName, 18 | createdTimestamp = formatMillis(user.getCreatedAt.getTime), 19 | createdDay = formatMillis(user.getCreatedAt.getTime, "yyyyMMdd"), 20 | tweetText = statusRDD.getText, 21 | lang = statusRDD.getLang, 22 | retweetCount = statusRDD.getRetweetCount, 23 | favoriteCount = statusRDD.getFavoriteCount, 24 | latitude = geoLocation map (_.getLatitude), 25 | longitude = geoLocation map (_.getLongitude)) 26 | } 27 | 28 | def toTweetsByTrack(dateParts: Array[Int], track: String, count: Long): TweetsByTrack = { 29 | TweetsByTrack( 30 | track = track, 31 | year = dateParts(0), 32 | month = dateParts(1), 33 | day = dateParts(2), 34 | hour = dateParts(3), 35 | minute = dateParts(4), 36 | count = count) 37 | } 38 | 39 | def formatTime(time: Time, format: String = "yyyyMMddHH:mm:ss.SSS"): String = 40 | formatMillis(time.milliseconds, format) 41 | 42 | def formatMillis(millis: Long, format: String = "yyyyMMddHH:mm:ss.SSS"): String = 43 | new DateTime(millis, DateTimeZone.UTC).toString(format) 44 | } -------------------------------------------------------------------------------- /modules/services/src/main/scala/com/fortysevendeg/sparkon/services/twitter/domain/TwitterServiceException.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter.domain 2 | 3 | case class TwitterServiceException(message: String, cause: Throwable) 4 | extends RuntimeException(message, cause) 5 | -------------------------------------------------------------------------------- /modules/test/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # output messages into a rolling log file as well as stdout 2 | log4j.rootLogger=INFO,stdout 3 | 4 | # stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | 
log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Avoid "no host ID found" when starting a fresh node 10 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 11 | 12 | # If running spark local, ignore block input exists warnings, which are expected. 13 | log4j.logger.org.apache.spark.storage.BlockManager=ERROR 14 | log4j.logger.com.datastax.spark.connector=INFO 15 | log4j.logger.org.apache.spark=WARN 16 | log4j.logger.com.datastax.driver.core=WARN -------------------------------------------------------------------------------- /modules/test/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 6 | 7 | 8 | 9 | 10 | ${log-file:-logs/api.log} 11 | 12 | %date{HH:mm:ss} %-5level %logger{0} {%class %method} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /modules/test/src/test/resources/reference.conf: -------------------------------------------------------------------------------- 1 | spark-on { 2 | cassandraCQLPath = "/data/spark_on_spark.cql" 3 | windowSizeSeconds = 5 4 | filters = ["scala", "play", "akka", "spark" , "47", "global", "consulting"] 5 | spark.jars = ["./path/to/assembly.jar"] 6 | dateFormat: "yyyy_MM_dd_HH_mm" 7 | dateFormatSplitter: "_" 8 | } 9 | 10 | twitter { 11 | credentials { 12 | consumerKey = "" 13 | consumerSecret = "" 14 | accessToken = "" 15 | accessTokenSecret = "" 16 | } 17 | } 18 | 19 | spark { 20 | master = "local[*]" 21 | home = "/usr/local/spark" 22 | appName = "Spark On" 23 | checkpoint = "./checkpoint" 24 | 25 | streaming.batch.interval = 5 26 | 27 | executor.memory = 2g 28 | cores.max = 2 29 | akka.heartbeat.interval = 100 30 | serializer = "org.apache.spark.serializer.KryoSerializer" 31 | 32 | cassandra { 33 | connection.host = [localhost] 34 | keyspace = "spark_on_topics" 35 | } 36 | } 37 | 38 | http { 39 | interface = "0.0.0.0" 40 | port = 8080 41 | } 42 | 43 | kafka { 44 | hosts = ["localhost:9092"] 45 | topics = "sparkOn.raw" 46 | 47 | zookeeper { 48 | host = "localhost" 49 | port = 2181 50 | } 51 | 52 | group.id = "sparkOn.group" 53 | topic.raw = "sparkOn.raw" 54 | 55 | producer { 56 | value.serializer = "org.apache.kafka.common.serialization.StringSerializer" 57 | key.serializer = "org.apache.kafka.common.serialization.StringSerializer" 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/common/BaseServiceTest.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.common 2 | 3 | import org.specs2.mutable.Specification 4 | 5 | import scala.concurrent.duration.Duration 6 | import scala.concurrent.{Await, Future} 7 | 8 | trait BaseServiceTest extends Specification { 9 | 10 | def await[T](future: Future[T]) = Await.result(future, Duration.Inf) 11 | } 12 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/persistence/CassandraServicesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.persistence 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import 
com.fortysevendeg.sparkon.common.BaseServiceTest 5 | import com.fortysevendeg.sparkon.persistence.schema.CassandraServices 6 | import com.fortysevendeg.sparkon.persistence.schema.domain.PersistenceException 7 | import org.specs2.mock.Mockito 8 | 9 | class CassandraServicesSpec extends BaseServiceTest with Mockito { 10 | sequential 11 | 12 | "CassandraServices" should { 13 | "create the cassandra schema given a valid CQL Path" in { 14 | val cassandraServices: CassandraServices = new CassandraServices {} 15 | implicit val connector = mock[CassandraConnector] 16 | 17 | cassandraServices.createSchema( 18 | keyspace = "spark_on", 19 | cassandraCQLPath= "/data/spark_on_spark.cql") 20 | 21 | there was one(connector).withSessionDo(_ => "") 22 | } 23 | 24 | "throw an exception when the cassandra cql script is not valid" in { 25 | val cassandraServices: CassandraServices = new CassandraServices {} 26 | implicit val connector = mock[CassandraConnector] 27 | 28 | cassandraServices.createSchema( 29 | keyspace = "spark_on", 30 | cassandraCQLPath= "/wrong/path/to.cql") must throwA[PersistenceException] 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/services/twitter/TwitterReceiverActorStreamSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import akka.actor.ActorSystem 4 | import akka.testkit.{ImplicitSender, TestActorRef, TestKitBase} 5 | import com.fortysevendeg.sparkon.common.BaseServiceTest 6 | import org.specs2.mock.Mockito 7 | import scala.reflect.ClassTag 8 | import twitter4j.{Status, StatusListener, TwitterStream} 9 | import twitter4j.auth.Authorization 10 | 11 | class TwitterReceiverActorStreamSpec 12 | extends BaseServiceTest 13 | with TestKitBase 14 | with ImplicitSender 15 | with Mockito { 16 | 17 | val twitterStreamMock = mock[TwitterStream] 18 | 19 | class TwitterReceiverActorStreamStub[T: ClassTag]( 20 | twitterAuth: Authorization, filters: List[String]) 21 | extends TwitterReceiverActorStream[T](twitterAuth, filters) { 22 | override val twitterStream = twitterStreamMock 23 | override val listener = mock[StatusListener] 24 | } 25 | 26 | implicit lazy val system = ActorSystem() 27 | 28 | "TwitterReceiverActorStream Actor" should { 29 | "process all the actor streaming messages" in { 30 | 31 | val twitterAuth = mock[Authorization] 32 | val filters = List("scala", "play", "akka", "spark", "47") 33 | val status = mock[Status] 34 | val actorRef = TestActorRef( 35 | new TwitterReceiverActorStreamStub[Status](twitterAuth, filters)) 36 | 37 | actorRef ! 
status 38 | 39 | there was one(status) 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/services/twitter/TwitterServicesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import com.fortysevendeg.sparkon.common.BaseServiceTest 4 | import com.fortysevendeg.sparkon.common.config.ConfigRegistry 5 | import com.fortysevendeg.sparkon.services.twitter.domain.TwitterServiceException 6 | import org.specs2.mock.Mockito 7 | import twitter4j._ 8 | import twitter4j.api.TrendsResources 9 | 10 | class TwitterServicesSpec 11 | extends BaseServiceTest 12 | with Mockito { 13 | 14 | trait TwitterServicesStub extends TwitterServices { 15 | override lazy val twitterClient = mock[Twitter] 16 | } 17 | 18 | "Twitter Services" should { 19 | "build a twitter4j client to fetch the current trending topics " in { 20 | val twitterServices = new TwitterServicesStub {} 21 | 22 | val filters = List("scala", "akka") 23 | 24 | val trends = mock[TrendsResources] 25 | val placeTrends = mock[Trends] 26 | val mockTrend1: Trend = mock[Trend] 27 | val mockTrend2: Trend = mock[Trend] 28 | val mockTrend3: Trend = mock[Trend] 29 | 30 | val trendsArray = Array(mockTrend1, mockTrend2, mockTrend3) 31 | 32 | mockTrend1.getName returns "scala" 33 | mockTrend2.getName returns "play" 34 | mockTrend3.getName returns "spark" 35 | 36 | twitterServices.twitterClient.trends() returns trends 37 | trends.getPlaceTrends(anyInt) returns placeTrends 38 | placeTrends.getTrends returns trendsArray 39 | 40 | val result = twitterServices.getTrendingTopics 41 | 42 | result.size must_== 3 43 | } 44 | 45 | "return a new custom exception when twitter4j.TwitterException is thrown" in { 46 | val twitterServices = new TwitterServicesStub {} 47 | 48 | val filters = List("scala", "akka") 49 | 50 | twitterServices.twitterClient.trends() throws new RuntimeException("something wrong") 51 | 52 | twitterServices.getTrendingTopics must throwA[TwitterServiceException] 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /modules/test/src/test/scala/com/fortysevendeg/sparkon/services/twitter/TwitterStreamingServicesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.fortysevendeg.sparkon.services.twitter 2 | 3 | import akka.actor.{ActorRef, ActorSystem} 4 | import akka.testkit.TestKitBase 5 | import com.datastax.spark.connector.cql.CassandraConnector 6 | import com.fortysevendeg.sparkon.common.BaseServiceTest 7 | import com.fortysevendeg.sparkon.persistence.schema.CassandraServices 8 | import com.fortysevendeg.sparkon.persistence.schema.domain.PersistenceException 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming.{Seconds, StreamingContext} 11 | import org.specs2.mock.Mockito 12 | import org.specs2.specification.Scope 13 | 14 | class TwitterStreamingServicesSpec 15 | extends BaseServiceTest 16 | with TestKitBase 17 | with Mockito { 18 | 19 | implicit lazy val system = ActorSystem() 20 | 21 | val batchDuration = Seconds(1) 22 | 23 | private val master: String = "local[4]" 24 | 25 | private val framework: String = this.getClass.getSimpleName 26 | 27 | implicit val ssc = new StreamingContext(master = master, appName = framework, batchDuration = batchDuration) 28 | 29 | trait CreateCassandraSchemaScope extends Scope { 
30 | 31 | val cassandraServicesMock = mock[CassandraServices] 32 | 33 | val twitterStreamingServices = new TwitterStreamingServicesStub {} 34 | 35 | implicit val connector = mock[CassandraConnector] 36 | 37 | class TwitterStreamingServicesStub extends TwitterStreamingServices { 38 | override val cassandraServices = cassandraServicesMock 39 | } 40 | } 41 | 42 | trait CreateTwitterStreamScope extends Scope { 43 | implicit val receiverActor = mock[ActorRef] 44 | 45 | val twitterStreamingServices = new TwitterStreamingServices {} 46 | } 47 | 48 | "TwitterStreamingServices.createCassandraSchema" should { 49 | 50 | "create Cassandra works fine doing pass through to persistence " + 51 | "module" in new CreateCassandraSchemaScope { 52 | 53 | twitterStreamingServices.createCassandraSchema(connector) 54 | 55 | there was one(cassandraServicesMock).createSchema(any, any)(any) 56 | } 57 | 58 | "create Cassandra returns a Persistence Exception when a " + 59 | "new exception is thrown" in new CreateCassandraSchemaScope { 60 | 61 | cassandraServicesMock.createSchema(any, any)(any) throws new PersistenceException("any message") 62 | 63 | twitterStreamingServices.createCassandraSchema(connector) must throwA[PersistenceException] 64 | } 65 | } 66 | 67 | "TwitterStreamingServices.createTwitterStream" should { 68 | 69 | "create a new twitter streaming for an empty filters set and using " + 70 | "the storageLevel default value" in new CreateTwitterStreamScope { 71 | 72 | twitterStreamingServices.createTwitterStream(filters = Nil) 73 | 74 | "All combinations looks good" must endWith("good") 75 | } 76 | 77 | "create a new twitter streaming for some filters and using " + 78 | "the storageLevel default value" in new CreateTwitterStreamScope { 79 | val filters = List("scala", "akka") 80 | 81 | twitterStreamingServices.createTwitterStream(filters = filters) 82 | 83 | "All combinations looks good" must endWith("good") 84 | } 85 | 86 | "create a new twitter streaming for some filters and a " + 87 | "specified storage level" in new CreateTwitterStreamScope { 88 | val filters = List("scala", "akka") 89 | 90 | twitterStreamingServices.createTwitterStream(filters = filters, 91 | storageLevel = StorageLevel.MEMORY_AND_DISK_SER_2) 92 | 93 | "All combinations looks good" must endWith("good") 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /project/Build.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbtdocker.DockerPlugin 3 | 4 | object Build extends Build with Settings with SettingsDocker with Dependencies { 5 | 6 | lazy val root = project 7 | .in(file(".")) 8 | .aggregate(common, persistence, services, api, test) 9 | 10 | lazy val common = project 11 | .in(file("modules/common")) 12 | .settings(projectSettings ++ commonDeps) 13 | 14 | lazy val persistence = project 15 | .in(file("modules/persistence")) 16 | .dependsOn(common % "test->test;compile->compile") 17 | .settings(projectSettings ++ persistenceDeps) 18 | 19 | lazy val services = project.in(file("modules/services")) 20 | .dependsOn( 21 | common % "test->test;compile->compile", 22 | persistence) 23 | .settings(projectSettings ++ servicesDeps) 24 | 25 | lazy val api = project.in(file("modules/api")) 26 | .enablePlugins(DockerPlugin) 27 | .dependsOn( 28 | common % "test->test;compile->compile", 29 | services) 30 | .settings(apiSettings ++ apiDeps) 31 | 32 | lazy val test = project.in(file("modules/test")) 33 | .dependsOn( 34 | common % 
"test->test;compile->compile", 35 | persistence % "test->test;compile->compile", 36 | services % "test->test;compile->compile", 37 | api % "test->test;compile->compile") 38 | .settings(projectSettings ++ testDeps) 39 | } 40 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | 4 | trait Dependencies extends Excludes { 5 | this: Build => 6 | 7 | val akkaActor = "com.typesafe.akka" %% "akka-actor" % V.akka 8 | val akkaHttp = "com.typesafe.akka" %% "akka-http-experimental" % V.akkaStreams 9 | val akkaHttpCore = "com.typesafe.akka" %% "akka-http-core-experimental" % V.akkaStreams 10 | val akkaHttpJson = "com.typesafe.akka" %% "akka-http-spray-json-experimental" % V.akkaStreams 11 | val akkaHttpXml = "com.typesafe.akka" %% "akka-http-xml-experimental" % V.akkaStreams 12 | val akkaHttpTestkit = "com.typesafe.akka" %% "akka-http-testkit-experimental" % V.akkaStreams 13 | val akkaParsing = "com.typesafe.akka" %% "akka-parsing-experimental" % V.akkaStreams 14 | val akkaRemote = "com.typesafe.akka" %% "akka-remote" % V.akka 15 | val akkaSlf4j = "com.typesafe.akka" %% "akka-slf4j" % V.akka 16 | val akkaStreams = "com.typesafe.akka" %% "akka-stream-experimental" % V.akkaStreams 17 | val akkaTestkit = "com.typesafe.akka" %% "akka-testkit" % V.akka 18 | val cassandraSpark = "com.datastax.spark" %% "spark-cassandra-connector" % V.cassandraSpark 19 | val config = "com.typesafe" % "config" % V.config 20 | val commonsCodec = "commons-codec" % "commons-codec" % V.commonsCodec 21 | val hadoopClient = "org.apache.hadoop" % "hadoop-client" % V.hadoopClient 22 | val jodaTime = "joda-time" % "joda-time" % V.jodaTime 23 | val jodaConvert = "org.joda" % "joda-convert" % V.jodaConvert 24 | val kafka = "org.apache.kafka" %% "kafka" % V.kafka 25 | val logback = "ch.qos.logback" % "logback-classic" % V.logback 26 | val phantomDsl = "com.websudos" %% "phantom-dsl" % V.phantom 27 | val phantomTestkit = "com.websudos" %% "phantom-testkit" % V.phantom 28 | val reactiveKafka = "com.softwaremill.reactivekafka" %% "reactive-kafka-core" % V.reactiveKafka 29 | val sparkCore = "org.apache.spark" %% "spark-core" % V.spark 30 | val sparkStreaming = "org.apache.spark" %% "spark-streaming" % V.spark 31 | val sparkStreamingKafka = "org.apache.spark" %% "spark-streaming-kafka" % V.spark 32 | val specs2Core = "org.specs2" %% "specs2-core" % V.specs2 33 | val specs2Mock = "org.specs2" %% "specs2-mock" % V.specs2 34 | val sprayHttp = "io.spray" %% "spray-http" % V.spray 35 | val sprayHttpx = "io.spray" %% "spray-httpx" % V.spray 36 | val sprayUtil = "io.spray" %% "spray-util" % V.spray 37 | val sprayClient = "io.spray" %% "spray-client" % V.spray 38 | val sprayCan = "io.spray" %% "spray-can" % V.spray 39 | val sprayCaching = "io.spray" %% "spray-caching" % V.spray 40 | val sprayRouting = "io.spray" %% "spray-routing" % V.spray 41 | val sprayJson = "io.spray" %% "spray-json" % V.sprayJson 42 | val sprayTestKit = "io.spray" %% "spray-testkit" % V.sprayJson 43 | val twitter4jCore = "org.twitter4j" % "twitter4j-core" % V.twitter4j 44 | val twitter4jStream = "org.twitter4j" % "twitter4j-stream" % V.twitter4j 45 | 46 | val baseDepts = Seq(specs2Core % "test", specs2Mock % "test") 47 | 48 | val commonDeps = Seq(libraryDependencies ++= Seq(config, logback)) 49 | 50 | val persistenceDeps = Seq(libraryDependencies ++= Seq( 51 | akkaRemote, 52 | akkaSlf4j, 53 | 
cassandraSpark exclude("org.apache.spark", "*"), 54 | phantomDsl, 55 | phantomTestkit, 56 | sparkCore exclude("org.spark-project.akka", "*"))) 57 | 58 | val servicesDeps = Seq(libraryDependencies ++= Seq( 59 | kafka exclusionsForKafka, 60 | sparkStreaming intransitive(), 61 | sparkStreamingKafka intransitive(), 62 | twitter4jCore, 63 | twitter4jStream, 64 | akkaTestkit % "test")) 65 | 66 | val testDeps = Seq(libraryDependencies ++= baseDepts ++ Seq( 67 | twitter4jCore, 68 | akkaHttpTestkit % "test")) 69 | 70 | val apiDeps = Seq(libraryDependencies ++= Seq( 71 | akkaHttp, 72 | akkaHttpCore, 73 | akkaHttpJson, 74 | akkaStreams, 75 | hadoopClient, 76 | reactiveKafka, 77 | sprayJson)) 78 | } 79 | -------------------------------------------------------------------------------- /project/Excludes.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | trait Excludes { 4 | 5 | implicit class Exclude(module: ModuleID) { 6 | 7 | def excludingLog4j: ModuleID = 8 | module excludeAll ExclusionRule("log4j") 9 | 10 | def excludingSlf4j: ModuleID = 11 | module excludeAll ExclusionRule("org.slf4j") 12 | 13 | def excludingGuava: ModuleID = 14 | module exclude("com.google.guava", "guava") 15 | 16 | def excludingSpark: ModuleID = 17 | module 18 | .excludingGuava 19 | .exclude("org.apache.spark", s"spark-core_${V.scala}") 20 | .exclude("org.apache.spark", s"spark-streaming_${V.scala}") 21 | .exclude("org.apache.spark", s"spark-sql_${V.scala}") 22 | .exclude("org.apache.spark", s"spark-streaming_${V.scala}") 23 | 24 | def excludingLogback: ModuleID = module 25 | .exclude("ch.qos.logback", "logback-classic") 26 | .exclude("ch.qos.logback", "logback-core") 27 | 28 | def excludingAkka: ModuleID = module 29 | .exclude("com.typesafe.akka", "akka-actor") 30 | 31 | def exclusionsForKafka: ModuleID = 32 | module 33 | .excludingLog4j 34 | .excludingSlf4j 35 | .exclude("com.sun.jmx", "jmxri") 36 | .exclude("com.sun.jdmk", "jmxtools") 37 | .exclude("net.sf.jopt-simple", "jopt-simple") 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /project/Settings.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbt.Keys._ 3 | import sbtassembly.AssemblyPlugin.autoImport._ 4 | import spray.revolver.RevolverPlugin.Revolver 5 | import sbtassembly.AssemblyPlugin._ 6 | import sbtassembly.MergeStrategy._ 7 | 8 | trait Settings { 9 | this: Build with SettingsDocker => 10 | 11 | lazy val projectSettings: Seq[Def.Setting[_]] = Seq( 12 | scalaVersion := V.scala, 13 | scalaVersion in ThisBuild := V.scala, 14 | organization := "com.fortysevendeg", 15 | organizationName := "47 Degrees", 16 | organizationHomepage := Some(new URL("http://47deg.com")), 17 | version := V.buildVersion, 18 | conflictWarning := ConflictWarning.disable, 19 | scalacOptions ++= Seq("-deprecation", "-unchecked", "-feature", "-Ywarn-unused-import"), 20 | javaOptions in Test ++= Seq("-XX:MaxPermSize=128m", "-Xms512m", "-Xmx512m"), 21 | ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) }, 22 | sbt.Keys.fork := true, 23 | publishMavenStyle := true, 24 | publishArtifact in(Test, packageSrc) := true, 25 | logLevel := Level.Info, 26 | resolvers ++= Seq( 27 | Resolver.mavenLocal, 28 | Resolver.defaultLocal, 29 | Classpaths.typesafeReleases, 30 | DefaultMavenRepository, 31 | Resolver.typesafeIvyRepo("snapshots"), 32 | Resolver.sonatypeRepo("releases"), 33 | Resolver.sonatypeRepo("snapshots"), 
34 | "Sonatype staging" at "http://oss.sonatype.org/content/repositories/staging", 35 | "Java.net Maven2 Repository" at "http://download.java.net/maven/2/", 36 | "Twitter Repository" at "http://maven.twttr.com", 37 | "mvnrepository" at "http://mvnrepository.com/artifact/", 38 | Resolver.bintrayRepo("scalaz", "releases"), 39 | Resolver.bintrayRepo("websudos", "oss-releases") 40 | ), 41 | doc in Compile <<= target.map(_ / "none"), 42 | unmanagedResourceDirectories in Compile <+= baseDirectory(_ / "src/main/scala") 43 | ) 44 | 45 | lazy val apiSettings = projectSettings ++ assemblySettings ++ Seq( 46 | scalaVersion in ThisBuild := V.scala, 47 | assemblyJarName in assembly := "sparkOn-1.0.0.jar", 48 | assembleArtifact in assemblyPackageScala := true, 49 | Keys.test in assembly := {}, 50 | assemblyMergeStrategy in assembly := { 51 | case "application.conf" => concat 52 | case "reference.conf" => concat 53 | case "unwanted.txt" => discard 54 | case entry => 55 | val oldStrategy = (assemblyMergeStrategy in assembly).value 56 | val mergeStrategy = oldStrategy(entry) 57 | mergeStrategy == deduplicate match { 58 | case true => first 59 | case _ => mergeStrategy 60 | } 61 | }, 62 | publishArtifact in(Test, packageBin) := false 63 | ) ++ Revolver.settings ++ dockerSettings 64 | } 65 | -------------------------------------------------------------------------------- /project/SettingsDocker.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | import sbtassembly.AssemblyPlugin.autoImport._ 4 | import sbtdocker.DockerPlugin.autoImport._ 5 | 6 | trait SettingsDocker { 7 | this: Build => 8 | 9 | lazy val dockerSettings = Seq( 10 | docker <<= docker dependsOn assembly, 11 | imageNames in docker := Seq(ImageName("47deg/sparkon")), 12 | dockerfile in docker := { 13 | val workingDir = s"/opt/sparkOn" 14 | val artifact = (assemblyOutputPath in assembly).value 15 | 16 | val artifactTargetPath = s"/opt/sparkOn/${artifact.name}" 17 | val sparkPath = "/usr/local/spark/assembly/target/scala-2.11/spark-assembly-1.5.1-hadoop2.4.0.jar" 18 | 19 | val mainclass = mainClass.in(Compile, packageBin).value.getOrElse(sys.error("Expected exactly one main class")) 20 | val classpathString = s"$sparkPath:$artifactTargetPath" 21 | 22 | new Dockerfile { 23 | // Base image 24 | from("47deg/spark:1.5.1") 25 | // Mantainer 26 | maintainer("47 Degrees", "juanpedro.m@47deg.com>") 27 | 28 | // Set working directory 29 | workDir(workingDir) 30 | 31 | // Add the JAR file 32 | add(artifact, artifactTargetPath) 33 | 34 | cmdRaw(s"java " + 35 | s"-verbose:gc " + 36 | s"-XX:+PrintGCDetails " + 37 | s"-XX:+PrintGCTimeStamps " + 38 | s"-Xmx2G " + 39 | s"-XX:MaxPermSize=1G -cp $classpathString $mainclass") 40 | } 41 | } 42 | ) 43 | } -------------------------------------------------------------------------------- /project/V.scala: -------------------------------------------------------------------------------- 1 | object V { 2 | 3 | // Build version 4 | val buildVersion = "1.0.0-SNAPSHOT" 5 | 6 | // Core Libs 7 | val akka = "2.3.12" 8 | val akkaStreams = "1.0" 9 | val cassandraSpark = "1.5.0-M1" 10 | val commonsCodec = "1.10" 11 | val config = "1.2.1" 12 | val hadoopClient = "2.4.0" 13 | val jodaConvert = "1.7" 14 | val jodaTime = "2.7" 15 | val json4s = "3.2.11" 16 | val kafka = "0.8.2.1" 17 | val logback = "1.1.3" 18 | val phantom = "1.8.12" 19 | val reactiveKafka = "0.8.1" 20 | val scala = "2.11.7" 21 | val scalaUri = "0.4.7" 22 | val scalaz = "7.1.2" 23 | val spark = 
"1.5.1" 24 | val spray = "1.3.3" 25 | val sprayJson = "1.3.2" 26 | val twitter4j = "4.0.3" 27 | 28 | // Testing libs 29 | val specs2 = "3.6.2" 30 | } 31 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Typesafe Repository" at "https://repo.typesafe.com/typesafe/releases/" 2 | 3 | resolvers += Classpaths.sbtPluginReleases 4 | 5 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.7.2") 6 | 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 8 | 9 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0") 10 | 11 | addSbtPlugin("com.codacy" % "sbt-codacy-coverage" % "1.1.0") 12 | 13 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.0.3") 14 | 15 | addSbtPlugin("se.marcuslonnberg" % "sbt-docker" % "1.2.0") -------------------------------------------------------------------------------- /scripts/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sbt ";project api;docker" 4 | 5 | cd scripts && docker-compose up -d 6 | 7 | sleep 60 8 | 9 | docker exec -t namenode /usr/local/hadoop/bin/hadoop fs -mkdir /checkpoint 10 | 11 | # Scaling out Spark: 12 | docker-compose scale spark_worker=2 13 | -------------------------------------------------------------------------------- /scripts/docker-compose.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: 47deg/zookeeper 3 | ports: 4 | - "2181:2181" 5 | kafka_1: 6 | image: 47deg/kafka 7 | ports: 8 | - "9092" 9 | links: 10 | - zookeeper:zk 11 | environment: 12 | KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 13 | volumes: 14 | - /var/run/docker.sock:/var/run/docker.sock 15 | kafka_2: 16 | image: 47deg/kafka 17 | ports: 18 | - "9092" 19 | links: 20 | - zookeeper:zk 21 | environment: 22 | KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 23 | volumes: 24 | - /var/run/docker.sock:/var/run/docker.sock 25 | kafka_3: 26 | image: 47deg/kafka 27 | ports: 28 | - "9092" 29 | links: 30 | - zookeeper:zk 31 | environment: 32 | KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 33 | volumes: 34 | - /var/run/docker.sock:/var/run/docker.sock 35 | opscenter: 36 | image: 47deg/opscenter 37 | ports: 38 | - "8888:8888" 39 | container_name: opscenter 40 | cassandra_seed: 41 | image: 47deg/cassandra 42 | ports: 43 | - "9042:9042" 44 | links: 45 | - opscenter 46 | container_name: cassandra_seed 47 | environment: 48 | - OPS_IP=opscenter 49 | cassandra_slave: 50 | image: 47deg/cassandra 51 | links: 52 | - opscenter 53 | - cassandra_seed 54 | environment: 55 | - OPS_IP=opscenter 56 | - SEED=cassandra_seed 57 | opscenter_checkpoint: 58 | image: 47deg/java8 59 | working_dir: /src 60 | volumes: 61 | - .:/src 62 | command: sh initOpscenter.sh 63 | links: 64 | - opscenter 65 | - cassandra_seed 66 | - cassandra_slave 67 | environment: 68 | - OPS_IP=opscenter 69 | - SEED=cassandra_seed 70 | - CASS_SLAVE=cassandra_slave 71 | - WAIT_SLEEP=10 72 | - WAIT_LOOPS=10 73 | spark_master: 74 | image: 47deg/spark:1.5.1 75 | ports: 76 | - "7077:7077" 77 | - "8080:8080" 78 | container_name: spark_master 79 | tty: true 80 | command: /start-master.sh 81 | spark_worker: 82 | image: 47deg/spark:1.5.1 83 | links: 84 | - 
spark_master 85 | command: /start-worker.sh 86 | namenode: 87 | image: 47deg/yarn-cluster 88 | working_dir: /usr/local/hadoop 89 | ports: 90 | - "8088:8088" 91 | - "50070:50070" 92 | - "50075:50075" 93 | container_name: namenode 94 | command: bash -c "/etc/bootstrap.sh -d -namenode" 95 | datanode: 96 | image: 47deg/yarn-cluster 97 | working_dir: /usr/local/hadoop 98 | links: 99 | - namenode 100 | command: /etc/bootstrap.sh -d -datanode 101 | sparkon: 102 | image: 47deg/sparkon 103 | ports: 104 | - "9090:9090" 105 | - "4040:4040" 106 | container_name: sparkon 107 | links: 108 | - spark_master 109 | - cassandra_seed 110 | - cassandra_slave 111 | - namenode 112 | - zookeeper 113 | - kafka_1 114 | - kafka_2 115 | - kafka_3 116 | env_file: sparkOn.env -------------------------------------------------------------------------------- /scripts/initOpscenter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export OPSCENTER_IP=$(getent hosts "$OPS_IP" | awk '{print $1 ; exit}') 4 | export SEED_IP=$(getent hosts "$SEED" | awk '{print $1 ; exit}') 5 | export CASS_SLAVE_IP=$(getent hosts "$CASS_SLAVE" | awk '{print $1 ; exit}') 6 | 7 | echo "OPSCENTER_IP = $OPSCENTER_IP" 8 | echo "SEED = $SEED_IP" 9 | echo "CASS_SLAVE = $CASS_SLAVE_IP" 10 | 11 | WAIT_COMMAND_COND= 12 | 13 | is_ready() { 14 | eval [ $(curl --write-out %{http_code} --silent --output /dev/null http://$OPSCENTER_IP:8888/cluster-configs) = 200 ] 15 | } 16 | 17 | # wait until is ready 18 | i=0 19 | while ! is_ready; do 20 | i=`expr $i + 1` 21 | if [ $i -ge $WAIT_LOOPS ]; then 22 | echo "$(date) - still not ready, giving up" 23 | exit 1 24 | fi 25 | echo "$(date) - waiting to be ready" 26 | sleep $WAIT_SLEEP 27 | done 28 | 29 | #start the script 30 | echo "Registering cluster with OpsCenter" 31 | curl \ 32 | http://${OPSCENTER_IP}:8888/cluster-configs \ 33 | -X POST \ 34 | -d \ 35 | "{ 36 | \"cassandra\": { 37 | \"seed_hosts\": \"$SEED_IP, $CASS_SLAVE_IP\" 38 | }, 39 | \"cassandra_metrics\": {}, 40 | \"jmx\": { 41 | \"port\": \"7199\" 42 | } 43 | }" > /dev/null -------------------------------------------------------------------------------- /scripts/sparkOn.env: -------------------------------------------------------------------------------- 1 | # Set Spark On ENV 2 | CASSANDRA_HOSTS=cassandra_seed 3 | SPARK_HOME=/usr/local/spark 4 | SPARK_CHECKPOINT=hdfs://namenode:9000/checkpoint 5 | CONSUMER_KEY= 6 | CONSUMER_SECRET= 7 | ACCESS_TOKEN= 8 | ACCESS_TOKEN_SECRET= 9 | SPARK_APP_JARS=/opt/sparkOn/sparkOn-1.0.0.jar 10 | KAFKA_HOSTS=scripts_kafka_1:9092 11 | KAFKA_TOPIC=sparkOn.raw 12 | ZOOKEEPER_HOST=zk 13 | ZOOKEEPER_PORT=2181 14 | HTTP_INTERFACE=0.0.0.0 15 | HTTP_PORT=9090 --------------------------------------------------------------------------------
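As a rough illustration of how the services above fit together, here is a minimal, hypothetical driver sketch; it is not a file from this repository. It builds a local StreamingContext, creates the Cassandra schema, opens the Twitter actor stream, and hands the resulting DStream to ingestTweets. The object name, master URL, app name, Cassandra host, window sizes, and tracking terms are illustrative assumptions; in the project these values are read from ConfigRegistry and the configuration files shown above.

// Hypothetical wiring sketch, not part of the repository above.
// The object name and all hard-coded values are illustrative assumptions;
// the project itself reads them from ConfigRegistry / its .conf files.
import com.datastax.spark.connector.cql.CassandraConnector
import com.fortysevendeg.sparkon.services.twitter.TwitterStreamingServices
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import twitter4j.Status

object StreamingDriverSketch extends TwitterStreamingServices {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")                               // assumption: local run
      .setAppName("Spark On")
      .set("spark.cassandra.connection.host", "localhost") // assumption: local Cassandra

    val sc = new SparkContext(conf)
    implicit val ssc = new StreamingContext(sc, Seconds(5))
    implicit val connector = CassandraConnector(conf)

    // 1. Create the keyspace and tables from the bundled CQL script.
    createCassandraSchema

    // 2. Open the Twitter receiver stream with some tracking terms
    //    (declared implicit so ingestTweets can pick it up).
    implicit val tweets: DStream[Status] =
      createTwitterStream(filters = List("scala", "akka", "spark"))

    // 3. Persist per-day and per-track aggregates to Cassandra and publish
    //    track counts to Kafka; ingestTweets also checkpoints and starts the
    //    StreamingContext, so only awaitTermination is left to do here.
    ingestTweets(
      topics = Set("scala", "akka", "spark"),
      windowSize = Seconds(60),
      slideDuration = Seconds(5))

    ssc.awaitTermination()
  }
}

In the docker-compose setup under scripts/, the equivalent wiring presumably runs inside the sparkon container from the assembled jar, with hosts, checkpoint directory, and Twitter credentials supplied through scripts/sparkOn.env rather than hard-coded as in this sketch.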