├── docs
│   ├── architecture.dia
│   └── architecture.png
├── config
│   ├── kafka-server1.properties
│   └── zookeeper.properties
├── project
│   └── plugins.sbt
├── .gitignore
├── src
│   └── main
│       └── scala
│           └── com
│               └── chimpler
│                   └── sparkstreaminglogaggregation
│                       ├── MongoConversions.scala
│                       ├── Constants.scala
│                       ├── Models.scala
│                       ├── Codec.scala
│                       ├── RandomLogGenerator.scala
│                       └── LogAggregator.scala
└── README.md

/docs/architecture.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chimpler/blog-spark-streaming-log-aggregation/HEAD/docs/architecture.dia
--------------------------------------------------------------------------------
/docs/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chimpler/blog-spark-streaming-log-aggregation/HEAD/docs/architecture.png
--------------------------------------------------------------------------------
/config/kafka-server1.properties:
--------------------------------------------------------------------------------
broker.id=1
port=9093
log.dir=/tmp/kafka-logs-1
zookeeper.connect=localhost:2181
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
logLevel := Level.Warn

addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")

addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.5.0")

addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.4.0")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log

# sbt specific
.cache/
.history/
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet
--------------------------------------------------------------------------------
/config/zookeeper.properties:
--------------------------------------------------------------------------------
# the directory where the snapshot is stored.
dataDir=/tmp/zookeeper
# the port at which the clients will connect
clientPort=2181
# disable the per-ip limit on the number of connections since this is a non-production config
maxClientCnxns=0
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/MongoConversions.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import org.joda.time.DateTime
import reactivemongo.bson._

object MongoConversions {
  implicit object DateTimeHandler extends BSONHandler[BSONDateTime, DateTime] {
    def read(t: BSONDateTime) = new DateTime(t.value)

    def write(t: DateTime) = BSONDateTime(t.getMillis)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/Constants.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

object Constants {
  val NumPublishers = 5
  val NumAdvertisers = 3

  // "until" so that the number of generated names matches NumPublishers / NumAdvertisers
  val Publishers = (0 until NumPublishers).map("publisher_" +)
  val Advertisers = (0 until NumAdvertisers).map("advertiser_" +)
  val UnknownGeo = "unknown"
  val Geos = Seq("NY", "CA", "FL", "MI", "HI", UnknownGeo)
  val NumWebsites = 10000
  val NumCookies = 10000

  val KafkaTopic = "adnetwork-topic"
}
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/Models.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import com.twitter.algebird.HLL
import org.joda.time.DateTime

case class ImpressionLog(timestamp: Long, publisher: String, advertiser: String, website: String, geo: String, bid: Double, cookie: String)

// intermediate result used in the reducer
case class AggregationLog(timestamp: Long, sumBids: Double, imps: Int = 1, uniquesHll: HLL)

// result to be stored in MongoDB
case class AggregationResult(date: DateTime, publisher: String, geo: String, imps: Int, uniques: Int, avgBids: Double)

case class PublisherGeoKey(publisher: String, geo: String)
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/Codec.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import com.novus.salat
import com.novus.salat.global._
import kafka.serializer.{Decoder, Encoder}
import kafka.utils.VerifiableProperties
import org.apache.commons.io.Charsets

// Encode and decode logs as JSON (used in this tutorial for readability; a binary format such as Avro or Protocol Buffers would be a better choice)
class ImpressionLogDecoder(props: VerifiableProperties) extends Decoder[ImpressionLog] {
  def fromBytes(bytes: Array[Byte]): ImpressionLog = {
    salat.grater[ImpressionLog].fromJSON(new String(bytes, Charsets.UTF_8))
  }
}

class ImpressionLogEncoder(props: VerifiableProperties) extends Encoder[ImpressionLog] {
  def toBytes(impressionLog: ImpressionLog): Array[Byte] = {
    salat.grater[ImpressionLog].toCompactJSON(impressionLog).getBytes(Charsets.UTF_8)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/RandomLogGenerator.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import java.util.Properties

import kafka.javaapi.producer.Producer
import kafka.producer.{KeyedMessage, ProducerConfig}
import Constants._

import scala.collection.JavaConversions._
import scala.util.Random

/**
 * Publish random logs to Kafka
 */
object RandomLogGenerator extends App {
  val random = new Random()

  val props = new Properties()
  props ++= Map(
    "serializer.class" -> "com.chimpler.sparkstreaminglogaggregation.ImpressionLogEncoder",
    "metadata.broker.list" -> "127.0.0.1:9093"
  )

  val config = new ProducerConfig(props)
  val producer = new Producer[String, ImpressionLog](config)

  println("Sending messages...")
  var i = 0
  // infinite loop
  while (true) {
    val timestamp = System.currentTimeMillis()
    val publisher = Publishers(random.nextInt(NumPublishers))
    val advertiser = Advertisers(random.nextInt(NumAdvertisers))
    val website = s"website_${random.nextInt(Constants.NumWebsites)}.com"
    val cookie = s"cookie_${random.nextInt(Constants.NumCookies)}"
    val geo = Geos(random.nextInt(Geos.size))
    // nextDouble() already returns a value in [0, 1)
    val bid = random.nextDouble()
    val log = ImpressionLog(timestamp, publisher, advertiser, website, geo, bid, cookie)
    producer.send(new KeyedMessage[String, ImpressionLog](Constants.KafkaTopic, log))
    i = i + 1
    if (i % 10000 == 0) {
      println(s"Sent $i messages!")
    }
  }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A simple example that consumes an ad server log stream from Kafka, aggregates it with Spark Streaming and stores the results in MongoDB.
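
Each message produced by the generator is an `ImpressionLog` (see `Models.scala`) serialized as JSON by the Salat-based codec in `Codec.scala`. A record looks roughly like this (field values are illustrative):

    {"timestamp": 1404181479679, "publisher": "publisher_2", "advertiser": "advertiser_1", "website": "website_4242.com", "geo": "NY", "bid": 0.42, "cookie": "cookie_1234"}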

More information on our blog: http://chimpler.wordpress.com/2014/07/01/implementing-a-real-time-data-pipeline-with-spark-streaming/

To run the example, you need to install the following:

* [Scala 2.10+](http://www.scala-lang.org/)
* [SBT](http://www.scala-sbt.org/)
* [Apache Zookeeper](http://zookeeper.apache.org/)
* [Apache Kafka](http://kafka.apache.org/)
* [MongoDB](http://www.mongodb.org/)


Build the example:

    $ sbt pack

Start Zookeeper:

    $ zookeeper-server-start.sh config/zookeeper.properties

Start Kafka:

    $ kafka-server-start.sh config/kafka-server1.properties

Create the topic `adnetwork-topic` (Zookeeper and Kafka must already be running):

    $ kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic adnetwork-topic

Start MongoDB:

    $ sudo mongod

In one terminal, run the aggregator:

    $ target/pack/bin/aggregator

In another terminal, run the ad server log generator:

    $ target/pack/bin/generator

You can also watch the messages being sent using the Kafka console consumer:

    $ kafka-console-consumer.sh --topic adnetwork-topic --zookeeper localhost:2181

After a few seconds, you should see the results in MongoDB:

    $ mongoexport -d adlogdb -c impsPerPubGeo --csv -f date,publisher,geo,imps,uniques,avgBids
    connected to: 127.0.0.1

    date,publisher,geo,imps,uniques,avgBids
    2014-07-01T03:24:39.679Z,"publisher_4","CA",3980,3248,0.50062253292876
    2014-07-01T03:24:39.681Z,"publisher_4","MI",3958,3229,0.505213545705667
    2014-07-01T03:24:39.681Z,"publisher_1","HI",3886,3218,0.4984981221446526
    2014-07-01T03:24:39.681Z,"publisher_3","CA",3937,3226,0.5038157362872939
    2014-07-01T03:24:39.679Z,"publisher_4","NY",3894,3200,0.5022389599376207
    2014-07-01T03:24:39.679Z,"publisher_2","HI",3906,3240,0.4988378174961185
    2014-07-01T03:24:39.679Z,"publisher_3","HI",3989,3309,0.4975347625823641
    2014-07-01T03:24:39.681Z,"publisher_3","FL",3957,3167,0.4993339490605483
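
You can also inspect the collection from the mongo shell (the database and collection names come from `LogAggregator.scala`), for example:

    $ mongo adlogdb
    > db.impsPerPubGeo.find().sort({date: -1}).limit(5).pretty()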
--------------------------------------------------------------------------------
/src/main/scala/com/chimpler/sparkstreaminglogaggregation/LogAggregator.scala:
--------------------------------------------------------------------------------
package com.chimpler.sparkstreaminglogaggregation

import com.github.nscala_time.time.Imports._
import com.twitter.algebird.HyperLogLogMonoid
import kafka.serializer.StringDecoder
import org.apache.commons.io.Charsets
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import reactivemongo.api._
import reactivemongo.api.collections.default.BSONCollection
import reactivemongo.bson._
import MongoConversions._

import scala.concurrent.ExecutionContext.Implicits.global

object LogAggregator extends App {
  val BatchDuration = Seconds(10)

  val driver = new MongoDriver
  val connection = driver.connection(List("localhost"))

  implicit val aggHandler = Macros.handler[AggregationResult]

  val db = connection("adlogdb")
  val collection = db[BSONCollection]("impsPerPubGeo")

  val sparkContext = new SparkContext("local[4]", "logAggregator")

  // discretize the stream into batches of BatchDuration seconds
  val streamingContext = new StreamingContext(sparkContext, BatchDuration)

  val kafkaParams = Map(
    "zookeeper.connect" -> "localhost:2181",
    "zookeeper.connection.timeout.ms" -> "10000",
    "group.id" -> "myGroup"
  )

  val topics = Map(
    Constants.KafkaTopic -> 1
  )

  // stream of (topic, ImpressionLog)
  val messages = KafkaUtils.createStream[String, ImpressionLog, StringDecoder, ImpressionLogDecoder](streamingContext, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK)

  // to count uniques
  lazy val hyperLogLog = new HyperLogLogMonoid(12)

  // filter out logs with an unresolved geo (unknown) and map each one to a (PublisherGeoKey, AggregationLog) pair to be reduced
  val logsByPubGeo = messages.map(_._2).filter(_.geo != Constants.UnknownGeo).map {
    log =>
      val key = PublisherGeoKey(log.publisher, log.geo)
      val agg = AggregationLog(
        timestamp = log.timestamp,
        sumBids = log.bid,
        imps = 1,
        uniquesHll = hyperLogLog(log.cookie.getBytes(Charsets.UTF_8))
      )
      (key, agg)
  }

  // reduce to compute imps, uniques and sumBids per publisher and geo for each interval of BatchDuration seconds
  import org.apache.spark.streaming.StreamingContext._
  val aggLogs = logsByPubGeo.reduceByKeyAndWindow(reduceAggregationLogs, BatchDuration)

  // Store in MongoDB
  aggLogs.foreachRDD(saveLogs(_))

  // start rolling!
  streamingContext.start()
  // block so the application keeps running until the streaming context is stopped
  streamingContext.awaitTermination()

  private def saveLogs(logRdd: RDD[(PublisherGeoKey, AggregationLog)]) {
    val logs = logRdd.map {
      case (PublisherGeoKey(pub, geo), AggregationLog(timestamp, sumBids, imps, uniquesHll)) =>
        AggregationResult(new DateTime(timestamp), pub, geo, imps, uniquesHll.estimatedSize.toInt, sumBids / imps)
    }.collect()

    // save in MongoDB
    logs.foreach(collection.save(_))
  }

  private def reduceAggregationLogs(aggLog1: AggregationLog, aggLog2: AggregationLog) = {
    aggLog1.copy(
      timestamp = math.min(aggLog1.timestamp, aggLog2.timestamp),
      sumBids = aggLog1.sumBids + aggLog2.sumBids,
      imps = aggLog1.imps + aggLog2.imps,
      uniquesHll = aggLog1.uniquesHll + aggLog2.uniquesHll
    )
  }
}
--------------------------------------------------------------------------------
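
A note on the "uniques" column: LogAggregator.scala estimates unique cookies with Algebird's HyperLogLog, built with new HyperLogLogMonoid(12), i.e. 2^12 = 4096 registers, which gives a standard error of about 1.04 / sqrt(4096) ≈ 1.6%. The sketch below is only an illustration (the cookie ranges are made up) and reuses the same Algebird calls as LogAggregator.scala to show how per-batch HLLs are merged and how close the estimate lands to the true count of 8000 distinct cookies:

    import com.twitter.algebird.HyperLogLogMonoid

    object HllSketch extends App {
      // 12 bits -> 4096 registers -> roughly 1.6% standard error
      val hll = new HyperLogLogMonoid(12)

      // build one HLL per batch of cookies, as the aggregator does per impression log
      val batch1 = (1 to 5000).map(i => hll(s"cookie_$i".getBytes("UTF-8"))).reduce(_ + _)
      val batch2 = (3001 to 8000).map(i => hll(s"cookie_$i".getBytes("UTF-8"))).reduce(_ + _)

      // merging HLLs with + is what reduceAggregationLogs does with uniquesHll
      val merged = batch1 + batch2
      println(merged.estimatedSize.toInt) // close to 8000, the true number of distinct cookies
    }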