├── .gitignore
├── src
│   └── main
│       └── scala
│           └── com
│               └── ippontech
│                   └── kafka
│                       ├── util
│                       │   └── Stopwatch.scala
│                       ├── stores
│                       │   ├── OffsetsStore.scala
│                       │   └── ZooKeeperOffsetsStore.scala
│                       ├── KafkaSourcePythonHelper.scala
│                       └── KafkaSource.scala
├── LICENSE
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/

.idea/
*.iml

--------------------------------------------------------------------------------
/src/main/scala/com/ippontech/kafka/util/Stopwatch.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka.util

// very simple stopwatch, used to avoid depending on Guava's
class Stopwatch {

  private val start = System.currentTimeMillis()

  override def toString() = (System.currentTimeMillis() - start) + " ms"

}

--------------------------------------------------------------------------------
/src/main/scala/com/ippontech/kafka/stores/OffsetsStore.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka.stores

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.RDD

trait OffsetsStore {

  def readOffsets(topic: String): Option[Map[TopicAndPartition, Long]]

  def saveOffsets(topic: String, rdd: RDD[_]): Unit

}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Ippon Technologies

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
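The OffsetsStore trait above is the only abstraction the rest of the code depends on: a store reads the offsets previously saved for a topic and saves the offsets carried by a batch's RDD. To make the contract concrete, here is a minimal in-memory implementation. This is a hypothetical sketch for local experiments, not a file from this repository, and it loses all offsets when the JVM stops:

package com.ippontech.kafka.stores

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges

// Hypothetical example, not part of this repository: keeps offsets in memory only.
class InMemoryOffsetsStore extends OffsetsStore {

  // last saved offsets, keyed by topic
  private var offsets = Map.empty[String, Map[TopicAndPartition, Long]]

  override def readOffsets(topic: String): Option[Map[TopicAndPartition, Long]] =
    offsets.get(topic)

  override def saveOffsets(topic: String, rdd: RDD[_]): Unit = {
    // RDDs produced by a direct Kafka stream carry their offset ranges;
    // like ZooKeeperOffsetsStore below, we keep the 'from' offsets, i.e. the
    // position the current batch started from
    val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    offsets += topic -> ranges.map(r => TopicAndPartition(topic, r.partition) -> r.fromOffset).toMap
  }

}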
/src/main/scala/com/ippontech/kafka/KafkaSourcePythonHelper.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka

import com.ippontech.kafka.stores.{OffsetsStore, ZooKeeperOffsetsStore}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.api.java.{JavaDStream, JavaStreamingContext}

object KafkaSourcePythonHelper {

  def kafkaStream(jssc: JavaStreamingContext, brokers: String, offsetsStore: OffsetsStore,
                  topic: String): JavaDStream[(String, String)] = {
    val dstream = KafkaSource.kafkaStream[String, String, StringDecoder, StringDecoder](jssc.ssc, brokers, offsetsStore, topic)
    val jdstream = new JavaDStream(dstream)
    jdstream
  }

  def kafkaStream(jssc: JavaStreamingContext, brokers: String, zkHosts: String, zkPath: String,
                  topic: String): JavaDStream[(String, String)] = {
    val offsetsStore = new ZooKeeperOffsetsStore(zkHosts, zkPath)
    val dstream = KafkaSource.kafkaStream[String, String, StringDecoder, StringDecoder](jssc.ssc, brokers, offsetsStore, topic)
    val jdstream = new JavaDStream(dstream)
    jdstream
  }

}

--------------------------------------------------------------------------------
/src/main/scala/com/ippontech/kafka/KafkaSource.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka

import com.ippontech.kafka.stores.OffsetsStore
import com.typesafe.scalalogging.slf4j.LazyLogging
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

import scala.reflect.ClassTag

object KafkaSource extends LazyLogging {

  def kafkaStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag]
    (ssc: StreamingContext, kafkaParams: Map[String, String], offsetsStore: OffsetsStore, topic: String): InputDStream[(K, V)] = {

    val topics = Set(topic)

    val storedOffsets = offsetsStore.readOffsets(topic)
    val kafkaStream = storedOffsets match {
      case None =>
        // start from the latest offsets
        KafkaUtils.createDirectStream[K, V, KD, VD](ssc, kafkaParams, topics)
      case Some(fromOffsets) =>
        // start from previously saved offsets
        val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
        KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](ssc, kafkaParams, fromOffsets, messageHandler)
    }

    // save the offsets
    kafkaStream.foreachRDD(rdd => offsetsStore.saveOffsets(topic, rdd))

    kafkaStream
  }

  // Kafka input stream
  def kafkaStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag]
    (ssc: StreamingContext, brokers: String, offsetsStore: OffsetsStore, topic: String): InputDStream[(K, V)] =
    kafkaStream(ssc, Map("metadata.broker.list" -> brokers), offsetsStore, topic)

}

--------------------------------------------------------------------------------
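To show how the pieces fit together, here is a hedged usage sketch. The broker list, ZooKeeper quorum, znode path, topic name and batch interval are placeholder values, not taken from the repository:

import com.ippontech.kafka.KafkaSource
import com.ippontech.kafka.stores.ZooKeeperOffsetsStore
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical usage sketch, not part of this repository.
object Example {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("kafka-source-example")
    val ssc = new StreamingContext(conf, Seconds(5))

    // offsets for this application are kept under a dedicated znode
    val offsetsStore = new ZooKeeperOffsetsStore("localhost:2181", "/my-app/offsets")

    // KafkaSource registers its own foreachRDD that persists offsets through
    // the store, so a restarted application resumes from the saved offsets
    val stream = KafkaSource.kafkaStream[String, String, StringDecoder, StringDecoder](
      ssc, "localhost:9092", offsetsStore, "my-topic")

    stream.foreachRDD(rdd => rdd.foreach(println))

    ssc.start()
    ssc.awaitTermination()
  }
}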
/src/main/scala/com/ippontech/kafka/stores/ZooKeeperOffsetsStore.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka.stores

import com.ippontech.kafka.util.Stopwatch
import com.typesafe.scalalogging.slf4j.LazyLogging
import kafka.common.TopicAndPartition
import kafka.utils.{ZKStringSerializer, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges

class ZooKeeperOffsetsStore(zkHosts: String, zkPath: String) extends OffsetsStore with LazyLogging {

  private val zkClient = new ZkClient(zkHosts, 10000, 10000, ZKStringSerializer)

  // Read the previously saved offsets from ZooKeeper
  override def readOffsets(topic: String): Option[Map[TopicAndPartition, Long]] = {

    logger.info("Reading offsets from ZooKeeper")
    val stopwatch = new Stopwatch()

    val (offsetsRangesStrOpt, _) = ZkUtils.readDataMaybeNull(zkClient, zkPath)

    offsetsRangesStrOpt match {
      case Some(offsetsRangesStr) =>
        logger.debug(s"Read offset ranges: ${offsetsRangesStr}")

        val offsets = offsetsRangesStr.split(",")
          .map(s => s.split(":"))
          .map { case Array(partitionStr, offsetStr) => (TopicAndPartition(topic, partitionStr.toInt) -> offsetStr.toLong) }
          .toMap

        logger.info("Done reading offsets from ZooKeeper. Took " + stopwatch)

        Some(offsets)
      case None =>
        logger.info("No offsets found in ZooKeeper. Took " + stopwatch)
        None
    }

  }

  // Save the offsets back to ZooKeeper
  //
  // IMPORTANT: We are not saving the offsets reached by the current batch but the offsets it started from, i.e.
  // those of the previous batch. The offsets have to be extracted at the beginning of the stream processing,
  // before the real logic is applied, whereas we only want to persist them once a batch has been processed
  // successfully, hence the workaround.
  override def saveOffsets(topic: String, rdd: RDD[_]): Unit = {

    logger.info("Saving offsets to ZooKeeper")
    val stopwatch = new Stopwatch()

    val offsetsRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    offsetsRanges.foreach(offsetRange => logger.debug(s"Using ${offsetRange}"))

    val offsetsRangesStr = offsetsRanges.map(offsetRange => s"${offsetRange.partition}:${offsetRange.fromOffset}")
      .mkString(",")
    logger.debug(s"Writing offsets to ZooKeeper: ${offsetsRangesStr}")
    ZkUtils.updatePersistentPath(zkClient, zkPath, offsetsRangesStr)

    logger.info("Done updating offsets in ZooKeeper. Took " + stopwatch)

  }

}

--------------------------------------------------------------------------------
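ZooKeeperOffsetsStore persists all partitions of a topic as a single comma-separated string of partition:offset pairs in one znode. The following standalone sketch shows the round-trip performed by saveOffsets and readOffsets without needing a ZooKeeper connection; the partition numbers and offsets are made-up sample values:

import kafka.common.TopicAndPartition

// Standalone sketch, not part of this repository; sample values are made up.
object OffsetsFormatDemo extends App {

  // what saveOffsets writes for a hypothetical 3-partition topic
  val written = Seq(0 -> 42L, 1 -> 17L, 2 -> 23L)
    .map { case (partition, fromOffset) => s"$partition:$fromOffset" }
    .mkString(",")
  println(written) // prints: 0:42,1:17,2:23

  // what readOffsets reconstructs from that string
  val read = written.split(",")
    .map(_.split(":"))
    .map { case Array(p, o) => TopicAndPartition("my-topic", p.toInt) -> o.toLong }
    .toMap
  println(read) // offsets for partitions 0, 1 and 2 of "my-topic"

}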
Took " + stopwatch) 62 | 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.ippontech.kafka 6 | spark-kafka-source 7 | jar 8 | Kafka stream for Spark with storage of the offsets in ZooKeeper 9 | 0.2.0-SNAPSHOT 10 | 11 | 12 | 1.8 13 | 2.10.6 14 | 2.10 15 | 1.5.2 16 | UTF-8 17 | 18 | 19 | 20 | src/main/scala 21 | 22 | 23 | 24 | 25 | 26 | org.apache.maven.plugins 27 | maven-compiler-plugin 28 | 3.5.1 29 | 30 | ${java.version} 31 | ${java.version} 32 | 33 | 34 | 35 | net.alchim31.maven 36 | scala-maven-plugin 37 | 3.2.0 38 | 39 | 40 | 41 | compile 42 | testCompile 43 | 44 | 45 | 46 | 47 | ${scala.version} 48 | ${scala.dep.version} 49 | incremental 50 | scalatest:test 51 | 52 | 53 | 54 | 55 | 56 | org.apache.maven.plugins 57 | maven-source-plugin 58 | 3.0.0 59 | 60 | 61 | attach-source 62 | 63 | jar 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.scala-lang 76 | scala-library 77 | ${scala.version} 78 | 79 | 80 | 81 | 82 | org.apache.spark 83 | spark-streaming_${scala.dep.version} 84 | ${spark.version} 85 | provided 86 | 87 | 88 | org.apache.spark 89 | spark-streaming-kafka_${scala.dep.version} 90 | ${spark.version} 91 | provided 92 | 93 | 94 | 95 | 96 | com.typesafe.scala-logging 97 | scala-logging-slf4j_${scala.dep.version} 98 | 2.1.2 99 | 100 | 101 | 102 | 103 | 104 | 105 | uber-jar 106 | 107 | 108 | 109 | 110 | 111 | org.apache.maven.plugins 112 | maven-shade-plugin 113 | 2.4.3 114 | 115 | 116 | package 117 | 118 | shade 119 | 120 | 121 | 122 | 123 | com.google 124 | shaded.guava 125 | 126 | com.google.** 127 | 128 | 129 | com.google.common.base.Optional 130 | com.google.common.base.Absent 131 | com.google.common.base.Present 132 | 133 | 134 | 135 | 136 | 137 | *:* 138 | 139 | META-INF/*.SF 140 | META-INF/*.DSA 141 | META-INF/*.RSA 142 | 143 | 144 | 145 | false 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | org.scala-lang 158 | scala-library 159 | ${scala.version} 160 | provided 161 | 162 | 163 | 164 | org.apache.spark 165 | spark-streaming-kafka_${scala.dep.version} 166 | ${spark.version} 167 | 168 | 169 | 170 | org.spark-project.spark 171 | unused 172 | 1.0.0 173 | provided 174 | 175 | 176 | 177 | log4j 178 | log4j 179 | 1.2.7 180 | provided 181 | 182 | 183 | 184 | 185 | 186 | 187 | --------------------------------------------------------------------------------