├── .gitignore
├── src
│   └── main
│       └── scala
│           └── com
│               └── ippontech
│                   └── kafka
│                       ├── util
│                       │   └── Stopwatch.scala
│                       ├── stores
│                       │   ├── OffsetsStore.scala
│                       │   └── ZooKeeperOffsetsStore.scala
│                       ├── KafkaSourcePythonHelper.scala
│                       └── KafkaSource.scala
├── LICENSE
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/

.idea/
*.iml

--------------------------------------------------------------------------------
/src/main/scala/com/ippontech/kafka/util/Stopwatch.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka.util

// very simple stopwatch, used to avoid depending on Guava's
class Stopwatch {

  private val start = System.currentTimeMillis()

  override def toString() = (System.currentTimeMillis() - start) + " ms"

}

--------------------------------------------------------------------------------
/src/main/scala/com/ippontech/kafka/stores/OffsetsStore.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka.stores

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.RDD

trait OffsetsStore {

  def readOffsets(topic: String): Option[Map[TopicAndPartition, Long]]

  def saveOffsets(topic: String, rdd: RDD[_]): Unit

}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Ippon Technologies

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
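The OffsetsStore trait above is the only abstraction the rest of the code depends on: a store reads the offsets previously saved for a topic and saves the offsets carried by a batch's RDD. To make the contract concrete, here is a minimal in-memory implementation. This is a hypothetical sketch for local experiments, not a file from this repository, and it loses all offsets when the JVM stops:

package com.ippontech.kafka.stores

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges

// Hypothetical example, not part of this repository: keeps offsets in memory only.
class InMemoryOffsetsStore extends OffsetsStore {

  // last saved offsets, keyed by topic
  private var offsets = Map.empty[String, Map[TopicAndPartition, Long]]

  override def readOffsets(topic: String): Option[Map[TopicAndPartition, Long]] =
    offsets.get(topic)

  override def saveOffsets(topic: String, rdd: RDD[_]): Unit = {
    // RDDs produced by a direct Kafka stream carry their offset ranges;
    // like ZooKeeperOffsetsStore below, we keep the 'from' offsets, i.e. the
    // position the current batch started from
    val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    offsets += topic -> ranges.map(r => TopicAndPartition(topic, r.partition) -> r.fromOffset).toMap
  }

}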
/src/main/scala/com/ippontech/kafka/KafkaSourcePythonHelper.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka

import com.ippontech.kafka.stores.{OffsetsStore, ZooKeeperOffsetsStore}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.api.java.{JavaDStream, JavaStreamingContext}

object KafkaSourcePythonHelper {

  def kafkaStream(jssc: JavaStreamingContext, brokers: String, offsetsStore: OffsetsStore,
                  topic: String): JavaDStream[(String, String)] = {
    val dstream = KafkaSource.kafkaStream[String, String, StringDecoder, StringDecoder](jssc.ssc, brokers, offsetsStore, topic)
    val jdstream = new JavaDStream(dstream)
    jdstream
  }

  def kafkaStream(jssc: JavaStreamingContext, brokers: String, zkHosts: String, zkPath: String,
                  topic: String): JavaDStream[(String, String)] = {
    val offsetsStore = new ZooKeeperOffsetsStore(zkHosts, zkPath)
    val dstream = KafkaSource.kafkaStream[String, String, StringDecoder, StringDecoder](jssc.ssc, brokers, offsetsStore, topic)
    val jdstream = new JavaDStream(dstream)
    jdstream
  }

}

--------------------------------------------------------------------------------
/src/main/scala/com/ippontech/kafka/KafkaSource.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka

import com.ippontech.kafka.stores.OffsetsStore
import com.typesafe.scalalogging.slf4j.LazyLogging
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

import scala.reflect.ClassTag

object KafkaSource extends LazyLogging {

  def kafkaStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag]
    (ssc: StreamingContext, kafkaParams: Map[String, String], offsetsStore: OffsetsStore, topic: String): InputDStream[(K, V)] = {

    val topics = Set(topic)

    val storedOffsets = offsetsStore.readOffsets(topic)
    val kafkaStream = storedOffsets match {
      case None =>
        // start from the latest offsets
        KafkaUtils.createDirectStream[K, V, KD, VD](ssc, kafkaParams, topics)
      case Some(fromOffsets) =>
        // start from previously saved offsets
        val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
        KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](ssc, kafkaParams, fromOffsets, messageHandler)
    }

    // save the offsets
    kafkaStream.foreachRDD(rdd => offsetsStore.saveOffsets(topic, rdd))

    kafkaStream
  }

  // Kafka input stream
  def kafkaStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag]
    (ssc: StreamingContext, brokers: String, offsetsStore: OffsetsStore, topic: String): InputDStream[(K, V)] =
    kafkaStream(ssc, Map("metadata.broker.list" -> brokers), offsetsStore, topic)

}

--------------------------------------------------------------------------------
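To show how the pieces fit together, here is a hedged usage sketch. The broker list, ZooKeeper quorum, znode path, topic name and batch interval are placeholder values, not taken from the repository:

import com.ippontech.kafka.KafkaSource
import com.ippontech.kafka.stores.ZooKeeperOffsetsStore
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical usage sketch, not part of this repository.
object Example {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("kafka-source-example")
    val ssc = new StreamingContext(conf, Seconds(5))

    // offsets for this application are kept under a dedicated znode
    val offsetsStore = new ZooKeeperOffsetsStore("localhost:2181", "/my-app/offsets")

    // KafkaSource registers its own foreachRDD that persists offsets through
    // the store, so a restarted application resumes from the saved offsets
    val stream = KafkaSource.kafkaStream[String, String, StringDecoder, StringDecoder](
      ssc, "localhost:9092", offsetsStore, "my-topic")

    stream.foreachRDD(rdd => rdd.foreach(println))

    ssc.start()
    ssc.awaitTermination()
  }
}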
/src/main/scala/com/ippontech/kafka/stores/ZooKeeperOffsetsStore.scala:
--------------------------------------------------------------------------------
package com.ippontech.kafka.stores

import com.ippontech.kafka.util.Stopwatch
import com.typesafe.scalalogging.slf4j.LazyLogging
import kafka.common.TopicAndPartition
import kafka.utils.{ZKStringSerializer, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges

class ZooKeeperOffsetsStore(zkHosts: String, zkPath: String) extends OffsetsStore with LazyLogging {

  private val zkClient = new ZkClient(zkHosts, 10000, 10000, ZKStringSerializer)

  // Read the previously saved offsets from ZooKeeper
  override def readOffsets(topic: String): Option[Map[TopicAndPartition, Long]] = {

    logger.info("Reading offsets from ZooKeeper")
    val stopwatch = new Stopwatch()

    val (offsetsRangesStrOpt, _) = ZkUtils.readDataMaybeNull(zkClient, zkPath)

    offsetsRangesStrOpt match {
      case Some(offsetsRangesStr) =>
        logger.debug(s"Read offset ranges: ${offsetsRangesStr}")

        val offsets = offsetsRangesStr.split(",")
          .map(s => s.split(":"))
          .map { case Array(partitionStr, offsetStr) => (TopicAndPartition(topic, partitionStr.toInt) -> offsetStr.toLong) }
          .toMap

        logger.info("Done reading offsets from ZooKeeper. Took " + stopwatch)

        Some(offsets)
      case None =>
        logger.info("No offsets found in ZooKeeper. Took " + stopwatch)
        None
    }

  }

  // Save the offsets back to ZooKeeper
  //
  // IMPORTANT: We are not saving the offsets reached by the current batch but the offsets it started from, i.e.
  // those of the previous batch. The offsets have to be extracted at the beginning of the stream processing,
  // before the real logic is applied, whereas we only want to persist them once a batch has been processed
  // successfully, hence the workaround.
  override def saveOffsets(topic: String, rdd: RDD[_]): Unit = {

    logger.info("Saving offsets to ZooKeeper")
    val stopwatch = new Stopwatch()

    val offsetsRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    offsetsRanges.foreach(offsetRange => logger.debug(s"Using ${offsetRange}"))

    val offsetsRangesStr = offsetsRanges.map(offsetRange => s"${offsetRange.partition}:${offsetRange.fromOffset}")
      .mkString(",")
    logger.debug(s"Writing offsets to ZooKeeper: ${offsetsRangesStr}")
    ZkUtils.updatePersistentPath(zkClient, zkPath, offsetsRangesStr)

    logger.info("Done updating offsets in ZooKeeper. Took " + stopwatch)

  }

}

--------------------------------------------------------------------------------
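ZooKeeperOffsetsStore persists all partitions of a topic as a single comma-separated string of partition:offset pairs in one znode. The following standalone sketch shows the round-trip performed by saveOffsets and readOffsets without needing a ZooKeeper connection; the partition numbers and offsets are made-up sample values:

import kafka.common.TopicAndPartition

// Standalone sketch, not part of this repository; sample values are made up.
object OffsetsFormatDemo extends App {

  // what saveOffsets writes for a hypothetical 3-partition topic
  val written = Seq(0 -> 42L, 1 -> 17L, 2 -> 23L)
    .map { case (partition, fromOffset) => s"$partition:$fromOffset" }
    .mkString(",")
  println(written) // prints: 0:42,1:17,2:23

  // what readOffsets reconstructs from that string
  val read = written.split(",")
    .map(_.split(":"))
    .map { case Array(p, o) => TopicAndPartition("my-topic", p.toInt) -> o.toLong }
    .toMap
  println(read) // offsets for partitions 0, 1 and 2 of "my-topic"

}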
Took " + stopwatch) 62 | 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.ippontech.kafka 6 | spark-kafka-source 7 | jar 8 | Kafka stream for Spark with storage of the offsets in ZooKeeper 9 | 0.2.0-SNAPSHOT 10 | 11 | 12 | 1.8 13 | 2.10.6 14 | 2.10 15 | 1.5.2 16 | UTF-8 17 | 18 | 19 | 20 | src/main/scala 21 | 22 | 23 | 24 | 25 | 26 | org.apache.maven.plugins 27 | maven-compiler-plugin 28 | 3.5.1 29 | 30 | ${java.version} 31 | ${java.version} 32 | 33 | 34 | 35 | net.alchim31.maven 36 | scala-maven-plugin 37 | 3.2.0 38 | 39 | 40 | 41 | compile 42 | testCompile 43 | 44 | 45 | 46 | 47 | ${scala.version} 48 | ${scala.dep.version} 49 | incremental 50 | scalatest:test 51 | 52 | 53 | 54 | 55 | 56 | org.apache.maven.plugins 57 | maven-source-plugin 58 | 3.0.0 59 | 60 | 61 | attach-source 62 | 63 | jar 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.scala-lang 76 | scala-library 77 | ${scala.version} 78 | 79 | 80 | 81 | 82 | org.apache.spark 83 | spark-streaming_${scala.dep.version} 84 | ${spark.version} 85 | provided 86 | 87 | 88 | org.apache.spark 89 | spark-streaming-kafka_${scala.dep.version} 90 | ${spark.version} 91 | provided 92 | 93 | 94 | 95 | 96 | com.typesafe.scala-logging 97 | scala-logging-slf4j_${scala.dep.version} 98 | 2.1.2 99 | 100 | 101 | 102 | 103 | 104 | 105 | uber-jar 106 | 107 | 108 | 109 | 110 | 111 | org.apache.maven.plugins 112 | maven-shade-plugin 113 | 2.4.3 114 | 115 | 116 | package 117 | 118 | shade 119 | 120 | 121 | 122 | 123 | com.google 124 | shaded.guava 125 | 126 | com.google.** 127 | 128 | 129 | com.google.common.base.Optional 130 | com.google.common.base.Absent 131 | com.google.common.base.Present 132 | 133 | 134 | 135 | 136 | 137 | *:* 138 | 139 | META-INF/*.SF 140 | META-INF/*.DSA 141 | META-INF/*.RSA 142 | 143 | 144 | 145 | false 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | org.scala-lang 158 | scala-library 159 | ${scala.version} 160 | provided 161 | 162 | 163 | 164 | org.apache.spark 165 | spark-streaming-kafka_${scala.dep.version} 166 | ${spark.version} 167 | 168 | 169 | 170 | org.spark-project.spark 171 | unused 172 | 1.0.0 173 | provided 174 | 175 | 176 | 177 | log4j 178 | log4j 179 | 1.2.7 180 | provided 181 | 182 | 183 | 184 | 185 | 186 | 187 | --------------------------------------------------------------------------------