├── .gitignore
├── README.md
├── build.sbt
├── docker-compose.yml
├── log4j.properties
├── project
│   └── assembly.sbt
└── src
    └── main
        └── scala
            └── com
                └── example
                    └── spark
                        └── DirectKafkaWordCount.scala
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Spark Streaming with Kafka Demo App
===================================

**Update:** This repository is outdated; for a more recent demo, please check out the
[`docker-spark-streaming-sql` repository](https://github.com/antlypls/spark-demos/tree/master/docker-spark-streaming-sql).

This is a demo project that shows how to run a Spark Streaming app with Kafka
using Docker and Docker Compose.

For more details see
http://blog.antlypls.com/blog/2015/10/05/getting-started-with-spark-streaming-using-docker/
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
name := "direct_kafka_word_count"

scalaVersion := "2.10.5"

val sparkVersion = "1.6.2"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
  ("org.apache.spark" %% "spark-streaming-kafka" % sparkVersion) exclude ("org.spark-project.spark", "unused")
)

assemblyJarName in assembly := name.value + ".jar"
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
kafka:
  image: antlypls/kafka:0.10.0.1
  environment:
    - KAFKA=localhost:9092
    - ZOOKEEPER=localhost:2181
  expose:
    - "2181"
    - "9092"

spark:
  image: antlypls/spark:1.6.2
  command: bash
  volumes:
    - ./target/scala-2.10:/app
  links:
    - kafka
--------------------------------------------------------------------------------
/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.threshold=ERROR
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0")
--------------------------------------------------------------------------------
/src/main/scala/com/example/spark/DirectKafkaWordCount.scala:
--------------------------------------------------------------------------------
package com.example.spark

import kafka.serializer.StringDecoder
import org.apache.spark.{TaskContext, SparkConf}
import org.apache.spark.streaming.kafka.{OffsetRange, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DirectKafkaWordCount {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more Kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    val Array(brokers, topics) = args

    // Create context with 10 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
--------------------------------------------------------------------------------
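
Note: OffsetRange, HasOffsetRanges, and TaskContext are imported in DirectKafkaWordCount.scala
but not used in main above. Below is a minimal sketch of how they are typically combined with
the direct stream to inspect the per-partition Kafka offsets backing each batch. It assumes the
messages DStream created in the app; the local name o and the println output format are only
illustrative, not part of the original code.

    // Sketch: reading the Kafka offset ranges behind each batch of the direct stream.
    messages.foreachRDD { rdd =>
      // Each RDD produced by the direct stream carries its Kafka offset ranges.
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      rdd.foreachPartition { _ =>
        // Partitions of the direct stream map 1:1 to Kafka topic-partitions,
        // so the current partition id indexes into offsetRanges.
        val o = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} offsets: ${o.fromOffset} -> ${o.untilOffset}")
      }
    }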