├── .gitignore
├── README.md
├── build.sbt
├── project
    ├── assembly.sbt
    ├── build.properties
    └── plugins.sbt
└── src
    └── main
        ├── resources
            └── hbase-site.xml
        └── scala
            └── KafkaOffsetsBlogStreamingDriver.scala


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | *.log
 3 | 
 4 | # sbt specific
 5 | .cache
 6 | .history
 7 | .lib/
 8 | dist/*
 9 | target/
10 | lib_managed/
11 | src_managed/
12 | project/boot/
13 | project/plugins/project/
14 | 
15 | # Scala-IDE specific
16 | .scala_dependencies
17 | .worksheet
18 | 
19 | #Macbook specific stuff
20 | .DS_Store/
21 | 
22 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
23 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
24 | 
25 | # Entire .idea directory
26 | .idea/
27 | 
28 | # User-specific stuff:
29 | .idea/**/workspace.xml
30 | .idea/**/tasks.xml
31 | .idea/dictionaries
32 | 
33 | # Sensitive or high-churn files:
34 | .idea/**/dataSources/
35 | .idea/**/dataSources.ids
36 | .idea/**/dataSources.xml
37 | .idea/**/dataSources.local.xml
38 | .idea/**/sqlDataSources.xml
39 | .idea/**/dynamic.xml
40 | .idea/**/uiDesigner.xml
41 | 
42 | # Gradle:
43 | .idea/**/gradle.xml
44 | .idea/**/libraries
45 | 
46 | # CMake
47 | cmake-build-debug/
48 | 
49 | # Mongo Explorer plugin:
50 | .idea/**/mongoSettings.xml
51 | 
52 | ## File-based project format:
53 | *.iws
54 | 
55 | ## Plugin-specific files:
56 | 
57 | # IntelliJ
58 | /out/
59 | 
60 | # mpeltonen/sbt-idea plugin
61 | .idea_modules/
62 | 
63 | # JIRA plugin
64 | atlassian-ide-plugin.xml
65 | 
66 | # Cursive Clojure plugin
67 | .idea/replstate.xml
68 | 
69 | # Crashlytics plugin (for Android Studio and IntelliJ)
70 | com_crashlytics_export_strings.xml
71 | crashlytics.properties
72 | crashlytics-build.properties
73 | fabric.properties
74 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Offset Management For Apache Kafka With Apache Spark Streaming
2 | 
3 | Spark Streaming,Kafka and HBase code accompanying the blog 'Offset Management For Apache Kafka With Apache Spark Streaming'.
4 | 
5 | Link to the blog - http://blog.cloudera.com/blog/2017/06/offset-management-for-apache-kafka-with-apache-spark-streaming/
6 | 
7 | 


--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
 1 | import _root_.sbtassembly.AssemblyPlugin.autoImport._
 2 | import _root_.sbtassembly.PathList
 3 | 
 4 | name := "spark-streaming-kafka-cdh511-testing"
 5 | 
 6 | version := "1.0"
 7 | 
 8 | scalaVersion := "2.11.9"
 9 | 
10 | 
11 | libraryDependencies ++= Seq(
12 |   "org.apache.spark" %% "spark-streaming" % "2.1.0.cloudera1",
13 |   "org.apache.spark" %% "spark-core" % "2.1.0.cloudera1" excludeAll ExclusionRule(organization = "javax.servlet"),
14 |   "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.1.0.cloudera1",
15 |   "org.apache.hbase" % "hbase-client" %  "1.2.0-cdh5.11.0",
16 |   "org.apache.hbase" % "hbase-common" % "1.2.0-cdh5.11.0"
17 | )
18 | 
19 | 
20 | assemblyMergeStrategy in assembly := {
21 |   case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
22 |   case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
23 |   case PathList("org", "apache", xs @ _*) => MergeStrategy.last
24 |   case PathList("com", "google", xs @ _*) => MergeStrategy.last
25 |   case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
26 |   case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
27 |   case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
28 |   case "about.html" => MergeStrategy.rename
29 |   case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
30 |   case "META-INF/mailcap" => MergeStrategy.last
31 |   case "META-INF/mimetypes.default" => MergeStrategy.last
32 |   case "plugin.properties" => MergeStrategy.last
33 |   case "log4j.properties" => MergeStrategy.last
34 |   case x =>
35 |     val oldStrategy = (assemblyMergeStrategy in assembly).value
36 |     oldStrategy(x)
37 | }
38 | 
39 | resolvers ++= Seq(
40 |   "Akka Repository" at "http://repo.akka.io/releases/",
41 |   "Maven Central Server" at "http://repo1.maven.org/maven2",
42 |   "Cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos",
43 |   "Typesafe Repo" at "http://repo.typesafe.com/typesafe/releases/"
44 | )


--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3")


--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 0.13.8


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn


--------------------------------------------------------------------------------
/src/main/resources/hbase-site.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | 
  3 | <!--Autogenerated by Cloudera Manager-->
  4 | <configuration>
  5 |     <property>
  6 |         <name>hbase.rootdir</name>
  7 |         <value>hdfs://bottou02.sjc.cloudera.com:8020/hbase</value>
  8 |     </property>
  9 |     <property>
 10 |         <name>hbase.master.port</name>
 11 |         <value>60000</value>
 12 |     </property>
 13 |     <property>
 14 |         <name>hbase.master.ipc.address</name>
 15 |         <value>0.0.0.0</value>
 16 |     </property>
 17 |     <property>
 18 |         <name>hbase.cluster.distributed</name>
 19 |         <value>true</value>
 20 |     </property>
 21 |     <property>
 22 |         <name>hbase.master.info.port</name>
 23 |         <value>60010</value>
 24 |     </property>
 25 |     <property>
 26 |         <name>hbase.client.write.buffer</name>
 27 |         <value>2097152</value>
 28 |     </property>
 29 |     <property>
 30 |         <name>hbase.client.pause</name>
 31 |         <value>100</value>
 32 |     </property>
 33 |     <property>
 34 |         <name>hbase.client.retries.number</name>
 35 |         <value>35</value>
 36 |     </property>
 37 |     <property>
 38 |         <name>hbase.client.scanner.caching</name>
 39 |         <value>100</value>
 40 |     </property>
 41 |     <property>
 42 |         <name>hbase.client.keyvalue.maxsize</name>
 43 |         <value>10485760</value>
 44 |     </property>
 45 |     <property>
 46 |         <name>hbase.ipc.client.allowsInterrupt</name>
 47 |         <value>true</value>
 48 |     </property>
 49 |     <property>
 50 |         <name>hbase.client.primaryCallTimeout.get</name>
 51 |         <value>10</value>
 52 |     </property>
 53 |     <property>
 54 |         <name>hbase.client.primaryCallTimeout.multiget</name>
 55 |         <value>10</value>
 56 |     </property>
 57 |     <property>
 58 |         <name>hbase.master.handler.count</name>
 59 |         <value>25</value>
 60 |     </property>
 61 |     <property>
 62 |         <name>hbase.master.executor.openregion.threads</name>
 63 |         <value>5</value>
 64 |     </property>
 65 |     <property>
 66 |         <name>hbase.master.executor.closeregion.threads</name>
 67 |         <value>5</value>
 68 |     </property>
 69 |     <property>
 70 |         <name>hbase.master.executor.serverops.threads</name>
 71 |         <value>5</value>
 72 |     </property>
 73 |     <property>
 74 |         <name>hbase.splitlog.manager.timeout</name>
 75 |         <value>120000</value>
 76 |     </property>
 77 |     <property>
 78 |         <name>hbase.master.logcleaner.ttl</name>
 79 |         <value>60000</value>
 80 |     </property>
 81 |     <property>
 82 |         <name>hbase.regionserver.info.port</name>
 83 |         <value>60030</value>
 84 |     </property>
 85 |     <property>
 86 |         <name>hbase.regionserver.handler.count</name>
 87 |         <value>30</value>
 88 |     </property>
 89 |     <property>
 90 |         <name>hbase.regionserver.metahandler.count</name>
 91 |         <value>10</value>
 92 |     </property>
 93 |     <property>
 94 |         <name>hbase.server.thread.wakefrequency</name>
 95 |         <value>10000</value>
 96 |     </property>
 97 |     <property>
 98 |         <name>hbase.coprocessor.abortonerror</name>
 99 |         <value>false</value>
100 |     </property>
101 |     <property>
102 |         <name>hbase.superuser</name>
103 |         <value></value>
104 |     </property>
105 |     <property>
106 |         <name>hbase.rpc.timeout</name>
107 |         <value>60000</value>
108 |     </property>
109 |     <property>
110 |         <name>hbase.snapshot.enabled</name>
111 |         <value>true</value>
112 |     </property>
113 |     <property>
114 |         <name>hbase.snapshot.master.timeoutMillis</name>
115 |         <value>60000</value>
116 |     </property>
117 |     <property>
118 |         <name>hbase.snapshot.region.timeout</name>
119 |         <value>60000</value>
120 |     </property>
121 |     <property>
122 |         <name>hbase.snapshot.master.timeout.millis</name>
123 |         <value>60000</value>
124 |     </property>
125 |     <property>
126 |         <name>hbase.security.authentication</name>
127 |         <value>simple</value>
128 |     </property>
129 |     <property>
130 |         <name>hbase.security.authorization</name>
131 |         <value>false</value>
132 |     </property>
133 |     <property>
134 |         <name>hbase.row.level.authorization</name>
135 |         <value>false</value>
136 |     </property>
137 |     <property>
138 |         <name>hbase.rpc.protection</name>
139 |         <value>authentication</value>
140 |     </property>
141 |     <property>
142 |         <name>zookeeper.session.timeout</name>
143 |         <value>60000</value>
144 |     </property>
145 |     <property>
146 |         <name>zookeeper.znode.parent</name>
147 |         <value>/hbase</value>
148 |     </property>
149 |     <property>
150 |         <name>zookeeper.znode.rootserver</name>
151 |         <value>root-region-server</value>
152 |     </property>
153 |     <property>
154 |         <name>hbase.zookeeper.quorum</name>
155 |         <value>bottou03.sjc.cloudera.com,bottou04.sjc.cloudera.com,bottou05.sjc.cloudera.com</value>
156 |     </property>
157 |     <property>
158 |         <name>hbase.zookeeper.property.clientPort</name>
159 |         <value>2181</value>
160 |     </property>
161 | </configuration>


--------------------------------------------------------------------------------
/src/main/scala/KafkaOffsetsBlogStreamingDriver.scala:
--------------------------------------------------------------------------------
  1 | import kafka.utils.ZkUtils
  2 | import org.apache.hadoop.hbase.filter.PrefixFilter
  3 | import org.apache.hadoop.hbase.util.Bytes
  4 | import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
  5 | import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory}
  6 | import org.apache.kafka.clients.consumer.ConsumerRecord
  7 | import org.apache.kafka.common.TopicPartition
  8 | import org.apache.kafka.common.serialization.StringDeserializer
  9 | import org.apache.spark.streaming.kafka010.ConsumerStrategies._
 10 | import org.apache.spark.streaming.kafka010.{OffsetRange, HasOffsetRanges, KafkaUtils}
 11 | import org.apache.spark.streaming.kafka010.LocationStrategies._
 12 | import org.apache.spark.streaming.{Seconds, StreamingContext}
 13 | import org.apache.spark.{SparkContext, SparkConf}
 14 | 
 15 | 
 16 | 
 17 | /**
 18 |  * Created by gmedasani on 6/10/17.
 19 |  */
 20 | object KafkaOffsetsBlogStreamingDriver {
 21 | 
 22 |   def main(args: Array[String]) {
 23 | 
 24 |     if (args.length < 6) {
 25 |       System.err.println("Usage: KafkaDirectStreamTest <batch-duration-in-seconds> <kafka-bootstrap-servers> " +
 26 |         "<kafka-topics> <kafka-consumer-group-id> <hbase-table-name> <kafka-zookeeper-quorum>")
 27 |       System.exit(1)
 28 |     }
 29 | 
 30 |     val batchDuration = args(0)
 31 |     val bootstrapServers = args(1).toString
 32 |     val topicsSet = args(2).toString.split(",").toSet
 33 |     val consumerGroupID = args(3)
 34 |     val hbaseTableName = args(4)
 35 |     val zkQuorum = args(5)
 36 |     val zkKafkaRootDir = "kafka"
 37 |     val zkSessionTimeOut = 10000
 38 |     val zkConnectionTimeOut = 10000
 39 | 
 40 |     val sparkConf = new SparkConf().setAppName("Kafka-Offset-Management-Blog")
 41 |                                   .setMaster("local[4]")//Uncomment this line to test while developing on a workstation
 42 |     val sc = new SparkContext(sparkConf)
 43 |     val ssc = new StreamingContext(sc, Seconds(batchDuration.toLong))
 44 |     val topics = topicsSet.toArray
 45 |     val topic = topics(0)
 46 | 
 47 |     val kafkaParams = Map[String, Object](
 48 |       "bootstrap.servers" -> bootstrapServers,
 49 |       "key.deserializer" -> classOf[StringDeserializer],
 50 |       "value.deserializer" -> classOf[StringDeserializer],
 51 |       "group.id" -> consumerGroupID,
 52 |       "auto.offset.reset" -> "earliest",
 53 |       "enable.auto.commit" -> (false: java.lang.Boolean)
 54 |     )
 55 | 
 56 |     /*
 57 |     Create a dummy process that simply returns the message as is.
 58 |      */
 59 |     def processMessage(message:ConsumerRecord[String,String]):ConsumerRecord[String,String]={
 60 |       message
 61 |     }
 62 | 
 63 |     /*
 64 |     Save Offsets into HBase
 65 |      */
 66 |     def saveOffsets(TOPIC_NAME:String,GROUP_ID:String,offsetRanges:Array[OffsetRange],hbaseTableName:String,
 67 |                     batchTime: org.apache.spark.streaming.Time) ={
 68 |       val hbaseConf = HBaseConfiguration.create()
 69 |       hbaseConf.addResource("src/main/resources/hbase-site.xml")
 70 |       val conn = ConnectionFactory.createConnection(hbaseConf)
 71 |       val table = conn.getTable(TableName.valueOf(hbaseTableName))
 72 |       val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(batchTime.milliseconds)
 73 |       val put = new Put(rowKey.getBytes)
 74 |       for(offset <- offsetRanges){
 75 |         put.addColumn(Bytes.toBytes("offsets"),Bytes.toBytes(offset.partition.toString),
 76 |           Bytes.toBytes(offset.untilOffset.toString))
 77 |       }
 78 |       table.put(put)
 79 |       conn.close()
 80 |     }
 81 | 
 82 |     /*
 83 |     Returns last committed offsets for all the partitions of a given topic from HBase in following cases.
 84 |       - CASE 1: SparkStreaming job is started for the first time. This function gets the number of topic partitions from
 85 |         Zookeeper and for each partition returns the last committed offset as 0
 86 |       - CASE 2: SparkStreaming is restarted and there are no changes to the number of partitions in a topic. Last
 87 |         committed offsets for each topic-partition is returned as is from HBase.
 88 |       - CASE 3: SparkStreaming is restarted and the number of partitions in a topic increased. For old partitions, last
 89 |         committed offsets for each topic-partition is returned as is from HBase as is. For newly added partitions,
 90 |         function returns last committed offsets as 0
 91 |      */
 92 |     def getLastCommittedOffsets(TOPIC_NAME:String,GROUP_ID:String,hbaseTableName:String,zkQuorum:String,
 93 |                                 zkRootDir:String, sessionTimeout:Int,connectionTimeOut:Int):Map[TopicPartition,Long] ={
 94 | 
 95 |       val hbaseConf = HBaseConfiguration.create()
 96 |       hbaseConf.addResource("src/main/resources/hbase-site.xml")
 97 |       val zkUrl = zkQuorum+"/"+zkRootDir
 98 |       val zkClientAndConnection = ZkUtils.createZkClientAndConnection(zkUrl,sessionTimeout,connectionTimeOut)
 99 |       val zkUtils = new ZkUtils(zkClientAndConnection._1, zkClientAndConnection._2,false)
100 |       val zKNumberOfPartitionsForTopic = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME)).get(TOPIC_NAME).toList.head.size
101 | 
102 |       //Connect to HBase to retrieve last committed offsets
103 |       val conn = ConnectionFactory.createConnection(hbaseConf)
104 |       val table = conn.getTable(TableName.valueOf(hbaseTableName))
105 |       val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis())
106 |       val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0
107 |       val scan = new Scan()
108 |       val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes).setReversed(true))
109 |       val result = scanner.next()
110 | 
111 |       var hbaseNumberOfPartitionsForTopic = 0 //Set the number of partitions discovered for a topic in HBase to 0
112 |       if (result != null){
113 |         //If the result from hbase scanner is not null, set number of partitions from hbase to the number of cells
114 |         hbaseNumberOfPartitionsForTopic = result.listCells().size()
115 |       }
116 | 
117 |       val fromOffsets = collection.mutable.Map[TopicPartition,Long]()
118 | 
119 |       if(hbaseNumberOfPartitionsForTopic == 0){
120 |         // initialize fromOffsets to beginning
121 |           for (partition <- 0 to zKNumberOfPartitionsForTopic-1){
122 |             fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> 0)}
123 |       } else if(zKNumberOfPartitionsForTopic > hbaseNumberOfPartitionsForTopic){
124 |         // handle scenario where new partitions have been added to existing kafka topic
125 |           for (partition <- 0 to hbaseNumberOfPartitionsForTopic-1){
126 |             val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),Bytes.toBytes(partition.toString)))
127 |             fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> fromOffset.toLong)}
128 |           for (partition <- hbaseNumberOfPartitionsForTopic to zKNumberOfPartitionsForTopic-1){
129 |             fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> 0)}
130 |       } else {
131 |         //initialize fromOffsets from last run
132 |           for (partition <- 0 to hbaseNumberOfPartitionsForTopic-1 ){
133 |             val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),Bytes.toBytes(partition.toString)))
134 |             fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> fromOffset.toLong)}
135 |       }
136 |       scanner.close()
137 |       conn.close()
138 |       fromOffsets.toMap
139 |     }
140 | 
141 | 
142 |     val fromOffsets= getLastCommittedOffsets(topic,consumerGroupID,hbaseTableName,zkQuorum,zkKafkaRootDir,
143 |       zkSessionTimeOut,zkConnectionTimeOut)
144 |     val inputDStream = KafkaUtils.createDirectStream[String, String](ssc,PreferConsistent,Assign[String, String](
145 |       fromOffsets.keys,kafkaParams,fromOffsets))
146 | 
147 |     /*
148 |       For each RDD in a DStream apply a map transformation that processes the message.
149 |     */
150 |     inputDStream.foreachRDD((rdd,batchTime) => {
151 |       val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
152 |       offsetRanges.foreach(offset => println(offset.topic, offset.partition, offset.fromOffset,offset.untilOffset))
153 |       val newRDD = rdd.map(message => processMessage(message))
154 |       newRDD.count()
155 |       saveOffsets(topic,consumerGroupID,offsetRanges,hbaseTableName,batchTime) //save the offsets to HBase
156 |     })
157 | 
158 |     println("Number of messages processed " + inputDStream.count())
159 |     ssc.start()
160 |     ssc.awaitTermination()
161 |   }
162 | }
163 | 


--------------------------------------------------------------------------------