├── .gitignore
├── README.md
├── build.sbt
├── docker-compose.yml
├── log4j.properties
├── project
│   └── assembly.sbt
└── src
    └── main
        └── scala
            └── com
                └── example
                    └── spark
                        └── DirectKafkaWordCount.scala
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Spark Streaming with Kafka Demo App
===================================

**Update:** This repository is outdated; for a more recent demo, please check out the
[`docker-spark-streaming-sql` repository](https://github.com/antlypls/spark-demos/tree/master/docker-spark-streaming-sql).

This is a demo project that shows how to run a Spark Streaming app with Kafka
using Docker and Docker Compose.

For more details see
http://blog.antlypls.com/blog/2015/10/05/getting-started-with-spark-streaming-using-docker/
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
name := "direct_kafka_word_count"

scalaVersion := "2.10.5"

val sparkVersion = "1.6.2"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
  ("org.apache.spark" %% "spark-streaming-kafka" % sparkVersion) exclude ("org.spark-project.spark", "unused")
)

assemblyJarName in assembly := name.value + ".jar"
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
kafka:
  image: antlypls/kafka:0.10.0.1
  environment:
    - KAFKA=localhost:9092
    - ZOOKEEPER=localhost:2181
  expose:
    - "2181"
    - "9092"

spark:
  image: antlypls/spark:1.6.2
  command: bash
  volumes:
    - ./target/scala-2.10:/app
  links:
    - kafka
--------------------------------------------------------------------------------
/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.threshold=ERROR
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0")
--------------------------------------------------------------------------------
/src/main/scala/com/example/spark/DirectKafkaWordCount.scala:
--------------------------------------------------------------------------------
package com.example.spark

import kafka.serializer.StringDecoder
import org.apache.spark.{TaskContext, SparkConf}
import org.apache.spark.streaming.kafka.{OffsetRange, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DirectKafkaWordCount {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more Kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    val Array(brokers, topics) = args

    // Create context with 10 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
--------------------------------------------------------------------------------
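
Note: OffsetRange, HasOffsetRanges, and TaskContext are imported in DirectKafkaWordCount.scala
but not used in main above. Below is a minimal sketch of how they are typically combined with
the direct stream to inspect the per-partition Kafka offsets backing each batch. It assumes the
messages DStream created in the app; the local name o and the println output format are only
illustrative, not part of the original code.

    // Sketch: reading the Kafka offset ranges behind each batch of the direct stream.
    messages.foreachRDD { rdd =>
      // Each RDD produced by the direct stream carries its Kafka offset ranges.
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      rdd.foreachPartition { _ =>
        // Partitions of the direct stream map 1:1 to Kafka topic-partitions,
        // so the current partition id indexes into offsetRanges.
        val o = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} offsets: ${o.fromOffset} -> ${o.untilOffset}")
      }
    }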