├── docs
│   ├── architecture.dia
│   └── architecture.png
├── config
│   ├── kafka-server1.properties
│   └── zookeeper.properties
├── project
│   └── plugins.sbt
├── .gitignore
├── src
│   └── main
│       └── scala
│           └── com
│               └── chimpler
│                   └── sparkstreaminglogaggregation
│                       ├── MongoConversions.scala
│                       ├── Constants.scala
│                       ├── Models.scala
│                       ├── Codec.scala
│                       ├── RandomLogGenerator.scala
│                       └── LogAggregator.scala
└── README.md

/docs/architecture.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chimpler/blog-spark-streaming-log-aggregation/HEAD/docs/architecture.dia
--------------------------------------------------------------------------------
/docs/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chimpler/blog-spark-streaming-log-aggregation/HEAD/docs/architecture.png
--------------------------------------------------------------------------------
/config/kafka-server1.properties:
--------------------------------------------------------------------------------
broker.id=1
port=9093
log.dir=/tmp/kafka-logs-1
zookeeper.connect=localhost:2181
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
logLevel := Level.Warn

addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")

addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.5.0")

addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.4.0")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log

# sbt specific
.cache/
.history/
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet
--------------------------------------------------------------------------------
/config/zookeeper.properties:
--------------------------------------------------------------------------------
# the directory where the snapshot is stored.
dataDir=/tmp/zookeeper
# the port at which the clients will connect
clientPort=2181
# disable the per-ip limit on the number of connections since this is a non-production config
maxClientCnxns=0
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/MongoConversions.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import org.joda.time.DateTime
import reactivemongo.bson._

object MongoConversions {
  implicit object DateTimeHandler extends BSONHandler[BSONDateTime, DateTime] {
    def read(t: BSONDateTime) = new DateTime(t.value)

    def write(t: DateTime) = BSONDateTime(t.getMillis)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/Constants.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

object Constants {
  val NumPublishers = 5
  val NumAdvertisers = 3

  // "until" so that the number of generated names matches NumPublishers / NumAdvertisers
  val Publishers = (0 until NumPublishers).map("publisher_" +)
  val Advertisers = (0 until NumAdvertisers).map("advertiser_" +)
  val UnknownGeo = "unknown"
  val Geos = Seq("NY", "CA", "FL", "MI", "HI", UnknownGeo)
  val NumWebsites = 10000
  val NumCookies = 10000

  val KafkaTopic = "adnetwork-topic"
}
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/Models.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import com.twitter.algebird.HLL
import org.joda.time.DateTime

case class ImpressionLog(timestamp: Long, publisher: String, advertiser: String, website: String, geo: String, bid: Double, cookie: String)

// intermediate result used in the reducer
case class AggregationLog(timestamp: Long, sumBids: Double, imps: Int = 1, uniquesHll: HLL)

// result to be stored in MongoDB
case class AggregationResult(date: DateTime, publisher: String, geo: String, imps: Int, uniques: Int, avgBids: Double)

case class PublisherGeoKey(publisher: String, geo: String)
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/Codec.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import com.novus.salat
import com.novus.salat.global._
import kafka.serializer.{Decoder, Encoder}
import kafka.utils.VerifiableProperties
import org.apache.commons.io.Charsets

// Encode and decode logs as JSON (used in this tutorial for readability; a binary format such as Avro or Protocol Buffers would be a better choice)
class ImpressionLogDecoder(props: VerifiableProperties) extends Decoder[ImpressionLog] {
  def fromBytes(bytes: Array[Byte]): ImpressionLog = {
    salat.grater[ImpressionLog].fromJSON(new String(bytes, Charsets.UTF_8))
  }
}

class ImpressionLogEncoder(props: VerifiableProperties) extends Encoder[ImpressionLog] {
  def toBytes(impressionLog: ImpressionLog): Array[Byte] = {
    salat.grater[ImpressionLog].toCompactJSON(impressionLog).getBytes(Charsets.UTF_8)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/RandomLogGenerator.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import java.util.Properties

import kafka.javaapi.producer.Producer
import kafka.producer.{KeyedMessage, ProducerConfig}
import Constants._

import scala.collection.JavaConversions._
import scala.util.Random

/**
 * Publish random logs to Kafka
 */
object RandomLogGenerator extends App {
  val random = new Random()

  val props = new Properties()
  props ++= Map(
    "serializer.class" -> "com.chimpler.sparkstreaminglogaggregation.ImpressionLogEncoder",
    "metadata.broker.list" -> "127.0.0.1:9093"
  )

  val config = new ProducerConfig(props)
  val producer = new Producer[String, ImpressionLog](config)

  println("Sending messages...")
  var i = 0
  // infinite loop
  while (true) {
    val timestamp = System.currentTimeMillis()
    val publisher = Publishers(random.nextInt(NumPublishers))
    val advertiser = Advertisers(random.nextInt(NumAdvertisers))
    val website = s"website_${random.nextInt(Constants.NumWebsites)}.com"
    val cookie = s"cookie_${random.nextInt(Constants.NumCookies)}"
    val geo = Geos(random.nextInt(Geos.size))
    // nextDouble() already returns a value in [0, 1)
    val bid = random.nextDouble()
    val log = ImpressionLog(timestamp, publisher, advertiser, website, geo, bid, cookie)
    producer.send(new KeyedMessage[String, ImpressionLog](Constants.KafkaTopic, log))
    i = i + 1
    if (i % 10000 == 0) {
      println(s"Sent $i messages!")
    }
  }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A simple example that consumes an ad server log stream from Kafka, aggregates it with Spark Streaming and stores the results in MongoDB.
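
Each message produced by the generator is an `ImpressionLog` (see `Models.scala`) serialized as JSON by the Salat-based codec in `Codec.scala`. A record looks roughly like this (field values are illustrative):

    {"timestamp": 1404181479679, "publisher": "publisher_2", "advertiser": "advertiser_1", "website": "website_4242.com", "geo": "NY", "bid": 0.42, "cookie": "cookie_1234"}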

More information on our blog: http://chimpler.wordpress.com/2014/07/01/implementing-a-real-time-data-pipeline-with-spark-streaming/

To run the example, you need to install the following:

* [Scala 2.10+](http://www.scala-lang.org/)
* [SBT](http://www.scala-sbt.org/)
* [Apache Zookeeper](http://zookeeper.apache.org/)
* [Apache Kafka](http://kafka.apache.org/)
* [MongoDB](http://www.mongodb.org/)


Build the example:

    $ sbt pack

Start Zookeeper:

    $ zookeeper-server-start.sh config/zookeeper.properties

Start Kafka:

    $ kafka-server-start.sh config/kafka-server1.properties

Create the topic `adnetwork-topic` (Zookeeper and Kafka must already be running):

    $ kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic adnetwork-topic

Start MongoDB:

    $ sudo mongod

In one terminal, run the aggregator:

    $ target/pack/bin/aggregator

In another terminal, run the ad server log generator:

    $ target/pack/bin/generator

You can also watch the messages being sent using the Kafka console consumer:

    $ kafka-console-consumer.sh --topic adnetwork-topic --zookeeper localhost:2181

After a few seconds, you should see the results in MongoDB:

    $ mongoexport -d adlogdb -c impsPerPubGeo --csv -f date,publisher,geo,imps,uniques,avgBids
    connected to: 127.0.0.1

    date,publisher,geo,imps,uniques,avgBids
    2014-07-01T03:24:39.679Z,"publisher_4","CA",3980,3248,0.50062253292876
    2014-07-01T03:24:39.681Z,"publisher_4","MI",3958,3229,0.505213545705667
    2014-07-01T03:24:39.681Z,"publisher_1","HI",3886,3218,0.4984981221446526
    2014-07-01T03:24:39.681Z,"publisher_3","CA",3937,3226,0.5038157362872939
    2014-07-01T03:24:39.679Z,"publisher_4","NY",3894,3200,0.5022389599376207
    2014-07-01T03:24:39.679Z,"publisher_2","HI",3906,3240,0.4988378174961185
    2014-07-01T03:24:39.679Z,"publisher_3","HI",3989,3309,0.4975347625823641
    2014-07-01T03:24:39.681Z,"publisher_3","FL",3957,3167,0.4993339490605483
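
You can also inspect the collection from the mongo shell (the database and collection names come from `LogAggregator.scala`), for example:

    $ mongo adlogdb
    > db.impsPerPubGeo.find().sort({date: -1}).limit(5).pretty()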
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/LogAggregator.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import com.github.nscala_time.time.Imports._
import com.twitter.algebird.HyperLogLogMonoid
import kafka.serializer.StringDecoder
import org.apache.commons.io.Charsets
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import reactivemongo.api._
import reactivemongo.api.collections.default.BSONCollection
import reactivemongo.bson._
import MongoConversions._

import scala.concurrent.ExecutionContext.Implicits.global

object LogAggregator extends App {
  val BatchDuration = Seconds(10)

  val driver = new MongoDriver
  val connection = driver.connection(List("localhost"))

  implicit val aggHandler = Macros.handler[AggregationResult]

  val db = connection("adlogdb")
  val collection = db[BSONCollection]("impsPerPubGeo")

  val sparkContext = new SparkContext("local[4]", "logAggregator")

  // discretize the stream into batches of BatchDuration seconds
  val streamingContext = new StreamingContext(sparkContext, BatchDuration)

  val kafkaParams = Map(
    "zookeeper.connect" -> "localhost:2181",
    "zookeeper.connection.timeout.ms" -> "10000",
    "group.id" -> "myGroup"
  )

  val topics = Map(
    Constants.KafkaTopic -> 1
  )

  // stream of (topic, ImpressionLog)
  val messages = KafkaUtils.createStream[String, ImpressionLog, StringDecoder, ImpressionLogDecoder](streamingContext, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK)

  // to count uniques
  lazy val hyperLogLog = new HyperLogLogMonoid(12)

  // filter out logs with an unresolved geo (unknown) and map each one to a (PublisherGeoKey, AggregationLog) pair to be reduced
  val logsByPubGeo = messages.map(_._2).filter(_.geo != Constants.UnknownGeo).map {
    log =>
      val key = PublisherGeoKey(log.publisher, log.geo)
      val agg = AggregationLog(
        timestamp = log.timestamp,
        sumBids = log.bid,
        imps = 1,
        uniquesHll = hyperLogLog(log.cookie.getBytes(Charsets.UTF_8))
      )
      (key, agg)
  }

  // reduce to compute imps, uniques and sumBids per publisher and geo for each interval of BatchDuration seconds
  import org.apache.spark.streaming.StreamingContext._
  val aggLogs = logsByPubGeo.reduceByKeyAndWindow(reduceAggregationLogs, BatchDuration)

  // Store in MongoDB
  aggLogs.foreachRDD(saveLogs(_))

  // start rolling!
  streamingContext.start()
  // block so the application keeps running until the streaming context is stopped
  streamingContext.awaitTermination()

  private def saveLogs(logRdd: RDD[(PublisherGeoKey, AggregationLog)]) {
    val logs = logRdd.map {
      case (PublisherGeoKey(pub, geo), AggregationLog(timestamp, sumBids, imps, uniquesHll)) =>
        AggregationResult(new DateTime(timestamp), pub, geo, imps, uniquesHll.estimatedSize.toInt, sumBids / imps)
    }.collect()

    // save in MongoDB
    logs.foreach(collection.save(_))
  }

  private def reduceAggregationLogs(aggLog1: AggregationLog, aggLog2: AggregationLog) = {
    aggLog1.copy(
      timestamp = math.min(aggLog1.timestamp, aggLog2.timestamp),
      sumBids = aggLog1.sumBids + aggLog2.sumBids,
      imps = aggLog1.imps + aggLog2.imps,
      uniquesHll = aggLog1.uniquesHll + aggLog2.uniquesHll
    )
  }
}
--------------------------------------------------------------------------------
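
A note on the "uniques" column: LogAggregator.scala estimates unique cookies with Algebird's HyperLogLog, built with new HyperLogLogMonoid(12), i.e. 2^12 = 4096 registers, which gives a standard error of about 1.04 / sqrt(4096) ≈ 1.6%. The sketch below is only an illustration (the cookie ranges are made up) and reuses the same Algebird calls as LogAggregator.scala to show how per-batch HLLs are merged and how close the estimate lands to the true count of 8000 distinct cookies:

    import com.twitter.algebird.HyperLogLogMonoid

    object HllSketch extends App {
      // 12 bits -> 4096 registers -> roughly 1.6% standard error
      val hll = new HyperLogLogMonoid(12)

      // build one HLL per batch of cookies, as the aggregator does per impression log
      val batch1 = (1 to 5000).map(i => hll(s"cookie_$i".getBytes("UTF-8"))).reduce(_ + _)
      val batch2 = (3001 to 8000).map(i => hll(s"cookie_$i".getBytes("UTF-8"))).reduce(_ + _)

      // merging HLLs with + is what reduceAggregationLogs does with uniquesHll
      val merged = batch1 + batch2
      println(merged.estimatedSize.toInt) // close to 8000, the true number of distinct cookies
    }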