├── .gitignore
├── README.md
├── SparkTuning.md
├── Vagrantfile
├── build.sbt
├── project
│   ├── assembly.sbt
│   ├── build.properties
│   └── plugins.sbt
└── src
    ├── main
    │   ├── resources
    │   │   ├── GeoLite2-City.mmdb
    │   │   ├── GeoLite2-Country.mmdb
    │   │   ├── broker-defaults.properties
    │   │   ├── consumer-defaults.properties
    │   │   ├── log4j.properties
    │   │   ├── producer-defaults.properties
    │   │   └── reference.conf
    │   └── scala
    │       └── com
    │           └── cloudwick
    │               ├── cassandra
    │               │   ├── Cassandra.scala
    │               │   ├── CassandraExecutionContext.scala
    │               │   ├── CassandraLocationVisitServiceModule.scala
    │               │   ├── CassandraLogVolumeServiceModule.scala
    │               │   ├── CassandraService.scala
    │               │   ├── CassandraStatusCountServiceModule.scala
    │               │   ├── ConfigurableCassandraManager.scala
    │               │   ├── schema
    │               │   │   ├── LocationVisit.scala
    │               │   │   ├── LogVolume.scala
    │               │   │   └── StatusCount.scala
    │               │   └── service
    │               │       ├── LocationVisitServiceModule.scala
    │               │       ├── LogVolumeServiceModule.scala
    │               │       └── StatusCountServiceModule.scala
    │               ├── logging
    │               │   └── Logging.scala
    │               └── spark
    │                   ├── embedded
    │                   │   ├── KafkaServer.scala
    │                   │   └── ZookeeperServer.scala
    │                   ├── examples
    │                   │   ├── core
    │                   │   │   ├── WordCount.scala
    │                   │   │   ├── WordCountRunner.scala
    │                   │   │   └── package.scala
    │                   │   └── streaming
    │                   │       ├── kafka
    │                   │       │   ├── KafkaWordCount.scala
    │                   │       │   └── StatefulKafkaWordCount.scala
    │                   │       ├── kinesis
    │                   │       │   └── KinesisWordCount.scala
    │                   │       └── local
    │                   │           ├── NetworkWordCount.scala
    │                   │           ├── NetworkWordCountRunner.scala
    │                   │           ├── NetworkWordCountWindowed.scala
    │                   │           ├── NetworkWordCountWindowedRunner.scala
    │                   │           └── RecoverableNetworkWordCount.scala
    │                   └── loganalysis
    │                       ├── LogAnalyzer.scala
    │                       ├── LogAnalyzerRunner.scala
    │                       ├── LogAnalyzerStreamingRunner.scala
    │                       └── LogEvent.scala
    └── test
        └── scala
            ├── com
            │   └── cloudwick
            │       └── spark
            │           ├── examples
            │           │   ├── core
            │           │   │   └── WordCountSpec.scala
            │           │   └── streaming
            │           │       └── local
            │           │           ├── NetworkWordCountSpec.scala
            │           │           └── NetworkWordCountWindowedSpec.scala
            │           ├── loganalysis
            │           │   └── LogAnalyzerSpec.scala
            │           └── sparkspec
            │               ├── SparkSpec.scala
            │               ├── SparkSqlSpec.scala
            │               └── SparkStreamingSpec.scala
            └── org
                └── apache
                    └── spark
                        └── streaming
                            └── ClockWrapper.scala
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log
wiki
*.sc
src/main/scala/com/cloudwick/Random.scala

# sbt specific
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/
project/project
project/target
.project/
.cache
.classpath
.settings

# IDE specific
.idea
.idea_modules
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Cloudwick Spark CodeBase

This repository is a collection of Spark examples and use-case implementations for various components of the Spark ecosystem, including Spark Core, Spark Streaming, Spark SQL, and Spark MLlib.

## What does this repository contain?

* Spark core examples
  * [WordCount](src/main/scala/com/cloudwick/spark/examples/core/WordCountRunner.scala)
* Spark streaming examples
  * [NetworkWordCount](src/main/scala/com/cloudwick/spark/examples/streaming/local/NetworkWordCount.scala)
  * [NetworkWordCountWindowed](src/main/scala/com/cloudwick/spark/examples/streaming/local/NetworkWordCountWindowed.scala)
  * [StatefulKafkaWordCount](src/main/scala/com/cloudwick/spark/examples/streaming/kafka/StatefulKafkaWordCount.scala)
* Spark core use-cases
  * [LogAnalytics](src/main/scala/com/cloudwick/spark/loganalysis/LogAnalyzerRunner.scala)
* Spark streaming use-cases
  * [LogAnalytics](src/main/scala/com/cloudwick/spark/loganalysis/LogAnalyzerStreamingRunner.scala)

    A simple Spark Streaming use-case that performs Apache log analysis: it can read data from Kafka or Kinesis, performs some analysis, and persists the results to Cassandra.
* Testing
  * ScalaTest spec traits for the Spark core, streaming, and SQL APIs
  * Embedded [Kafka](src/main/scala/com/cloudwick/spark/embedded/KafkaServer.scala) and [Zookeeper](src/main/scala/com/cloudwick/spark/embedded/ZookeeperServer.scala) server instances for testing

## How to download?

The simplest way is to clone the repository:

```
git clone https://github.com/cloudwicklabs/spark_codebase.git
```

## How to run these?

To run any of these examples or use-cases, you have to package them as an uber-jar (most of the examples depend on external dependencies, hence they have to be packaged as an assembly jar).

### Building an assembly jar

From the project's home directory:

```
sbt assembly
```

### Running using `spark-submit`

[`spark-submit`](https://spark.apache.org/docs/latest/submitting-applications.html) is the simplest way to submit a Spark application to a cluster, and it supports all of the cluster managers: standalone, YARN, and Mesos.

Each main class has documentation on how to run it.
--------------------------------------------------------------------------------
/SparkTuning.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudwicklabs/spark_codebase/e75b066165056f0a169690cf3f4e1b311a3bae8b/SparkTuning.md
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
# -*- mode: ruby -*-
# vi: set ft=ruby :

$script = <