├── .gitignore
├── README.md
├── build.sbt
├── project
│   ├── assembly.sbt
│   ├── build.properties
│   └── plugins.sbt
└── src
    └── main
        ├── resources
        │   └── hbase-site.xml
        └── scala
            └── KafkaOffsetsBlogStreamingDriver.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # sbt specific
5 | .cache
6 | .history
7 | .lib/
8 | dist/*
9 | target/
10 | lib_managed/
11 | src_managed/
12 | project/boot/
13 | project/plugins/project/
14 |
15 | # Scala-IDE specific
16 | .scala_dependencies
17 | .worksheet
18 |
19 | # macOS specific stuff
20 | .DS_Store
21 |
22 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
23 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
24 |
25 | # Entire .idea directory
26 | .idea/
27 |
28 | # User-specific stuff:
29 | .idea/**/workspace.xml
30 | .idea/**/tasks.xml
31 | .idea/dictionaries
32 |
33 | # Sensitive or high-churn files:
34 | .idea/**/dataSources/
35 | .idea/**/dataSources.ids
36 | .idea/**/dataSources.xml
37 | .idea/**/dataSources.local.xml
38 | .idea/**/sqlDataSources.xml
39 | .idea/**/dynamic.xml
40 | .idea/**/uiDesigner.xml
41 |
42 | # Gradle:
43 | .idea/**/gradle.xml
44 | .idea/**/libraries
45 |
46 | # CMake
47 | cmake-build-debug/
48 |
49 | # Mongo Explorer plugin:
50 | .idea/**/mongoSettings.xml
51 |
52 | ## File-based project format:
53 | *.iws
54 |
55 | ## Plugin-specific files:
56 |
57 | # IntelliJ
58 | /out/
59 |
60 | # mpeltonen/sbt-idea plugin
61 | .idea_modules/
62 |
63 | # JIRA plugin
64 | atlassian-ide-plugin.xml
65 |
66 | # Cursive Clojure plugin
67 | .idea/replstate.xml
68 |
69 | # Crashlytics plugin (for Android Studio and IntelliJ)
70 | com_crashlytics_export_strings.xml
71 | crashlytics.properties
72 | crashlytics-build.properties
73 | fabric.properties
74 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Offset Management For Apache Kafka With Apache Spark Streaming
2 |
3 | Spark Streaming, Kafka, and HBase code accompanying the blog post 'Offset Management For Apache Kafka With Apache Spark Streaming'.
4 |
5 | Link to the blog - http://blog.cloudera.com/blog/2017/06/offset-management-for-apache-kafka-with-apache-spark-streaming/
6 |
7 |
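8 | ## Building and running
9 |
10 | A minimal sketch of one way to build and launch the job; the table name `stream_kafka_offsets`, the broker/ZooKeeper hosts, and the 60-second batch duration below are placeholder values, not values prescribed by the blog. The driver expects an HBase table with a column family named `offsets`, loads `src/main/resources/hbase-site.xml` by relative path, and hard-codes a `local[4]` master, so run it from the project root on a host where `spark2-submit` is available (or adjust those lines before submitting to a cluster).
11 |
12 | ```
13 | # Create the HBase table the driver writes offsets to (column family must be named "offsets")
14 | echo "create 'stream_kafka_offsets', 'offsets'" | hbase shell
15 |
16 | # Build a fat jar with sbt-assembly
17 | sbt assembly
18 |
19 | # Arguments: <batch-duration-in-seconds> <kafka-bootstrap-servers> <kafka-topics> <consumer-group-id> <hbase-table-name> <zookeeper-quorum>
20 | spark2-submit --class KafkaOffsetsBlogStreamingDriver \
21 |   target/scala-2.11/spark-streaming-kafka-cdh511-testing-assembly-1.0.jar \
22 |   60 broker01:9092 test-topic offsets-blog-group stream_kafka_offsets zk01:2181,zk02:2181,zk03:2181
23 | ```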
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | import _root_.sbtassembly.AssemblyPlugin.autoImport._
2 | import _root_.sbtassembly.PathList
3 |
4 | name := "spark-streaming-kafka-cdh511-testing"
5 |
6 | version := "1.0"
7 |
8 | scalaVersion := "2.11.8"
9 |
10 |
11 | libraryDependencies ++= Seq(
12 | "org.apache.spark" %% "spark-streaming" % "2.1.0.cloudera1",
13 | "org.apache.spark" %% "spark-core" % "2.1.0.cloudera1" excludeAll ExclusionRule(organization = "javax.servlet"),
14 | "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.1.0.cloudera1",
15 | "org.apache.hbase" % "hbase-client" % "1.2.0-cdh5.11.0",
16 | "org.apache.hbase" % "hbase-common" % "1.2.0-cdh5.11.0"
17 | )
18 |
19 |
20 | assemblyMergeStrategy in assembly := {
21 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
22 | case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
23 | case PathList("org", "apache", xs @ _*) => MergeStrategy.last
24 | case PathList("com", "google", xs @ _*) => MergeStrategy.last
25 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
26 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
27 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
28 | case "about.html" => MergeStrategy.rename
29 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
30 | case "META-INF/mailcap" => MergeStrategy.last
31 | case "META-INF/mimetypes.default" => MergeStrategy.last
32 | case "plugin.properties" => MergeStrategy.last
33 | case "log4j.properties" => MergeStrategy.last
34 | case x =>
35 | val oldStrategy = (assemblyMergeStrategy in assembly).value
36 | oldStrategy(x)
37 | }
38 |
39 | resolvers ++= Seq(
40 | "Akka Repository" at "http://repo.akka.io/releases/",
41 | "Maven Central Server" at "http://repo1.maven.org/maven2",
42 | "Cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos",
43 | "Typesafe Repo" at "http://repo.typesafe.com/typesafe/releases/"
44 | )
--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3")
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 0.13.8
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn
--------------------------------------------------------------------------------
/src/main/resources/hbase-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 | <property>
4 | <name>hbase.rootdir</name>
5 | <value>hdfs://bottou02.sjc.cloudera.com:8020/hbase</value>
6 | </property>
7 | <property>
8 | <name>hbase.master.port</name>
9 | <value>60000</value>
10 | </property>
11 | <property>
12 | <name>hbase.master.ipc.address</name>
13 | <value>0.0.0.0</value>
14 | </property>
15 | <property>
16 | <name>hbase.cluster.distributed</name>
17 | <value>true</value>
18 | </property>
19 | <property>
20 | <name>hbase.master.info.port</name>
21 | <value>60010</value>
22 | </property>
23 | <property>
24 | <name>hbase.client.write.buffer</name>
25 | <value>2097152</value>
26 | </property>
27 | <property>
28 | <name>hbase.client.pause</name>
29 | <value>100</value>
30 | </property>
31 | <property>
32 | <name>hbase.client.retries.number</name>
33 | <value>35</value>
34 | </property>
35 | <property>
36 | <name>hbase.client.scanner.caching</name>
37 | <value>100</value>
38 | </property>
39 | <property>
40 | <name>hbase.client.keyvalue.maxsize</name>
41 | <value>10485760</value>
42 | </property>
43 | <property>
44 | <name>hbase.ipc.client.allowsInterrupt</name>
45 | <value>true</value>
46 | </property>
47 | <property>
48 | <name>hbase.client.primaryCallTimeout.get</name>
49 | <value>10</value>
50 | </property>
51 | <property>
52 | <name>hbase.client.primaryCallTimeout.multiget</name>
53 | <value>10</value>
54 | </property>
55 | <property>
56 | <name>hbase.master.handler.count</name>
57 | <value>25</value>
58 | </property>
59 | <property>
60 | <name>hbase.master.executor.openregion.threads</name>
61 | <value>5</value>
62 | </property>
63 | <property>
64 | <name>hbase.master.executor.closeregion.threads</name>
65 | <value>5</value>
66 | </property>
67 | <property>
68 | <name>hbase.master.executor.serverops.threads</name>
69 | <value>5</value>
70 | </property>
71 | <property>
72 | <name>hbase.splitlog.manager.timeout</name>
73 | <value>120000</value>
74 | </property>
75 | <property>
76 | <name>hbase.master.logcleaner.ttl</name>
77 | <value>60000</value>
78 | </property>
79 | <property>
80 | <name>hbase.regionserver.info.port</name>
81 | <value>60030</value>
82 | </property>
83 | <property>
84 | <name>hbase.regionserver.handler.count</name>
85 | <value>30</value>
86 | </property>
87 | <property>
88 | <name>hbase.regionserver.metahandler.count</name>
89 | <value>10</value>
90 | </property>
91 | <property>
92 | <name>hbase.server.thread.wakefrequency</name>
93 | <value>10000</value>
94 | </property>
95 | <property>
96 | <name>hbase.coprocessor.abortonerror</name>
97 | <value>false</value>
98 | </property>
99 | <property>
100 | <name>hbase.superuser</name>
101 | <value></value>
102 | </property>
103 | <property>
104 | <name>hbase.rpc.timeout</name>
105 | <value>60000</value>
106 | </property>
107 | <property>
108 | <name>hbase.snapshot.enabled</name>
109 | <value>true</value>
110 | </property>
111 | <property>
112 | <name>hbase.snapshot.master.timeoutMillis</name>
113 | <value>60000</value>
114 | </property>
115 | <property>
116 | <name>hbase.snapshot.region.timeout</name>
117 | <value>60000</value>
118 | </property>
119 | <property>
120 | <name>hbase.snapshot.master.timeout.millis</name>
121 | <value>60000</value>
122 | </property>
123 | <property>
124 | <name>hbase.security.authentication</name>
125 | <value>simple</value>
126 | </property>
127 | <property>
128 | <name>hbase.security.authorization</name>
129 | <value>false</value>
130 | </property>
131 | <property>
132 | <name>hbase.row.level.authorization</name>
133 | <value>false</value>
134 | </property>
135 | <property>
136 | <name>hbase.rpc.protection</name>
137 | <value>authentication</value>
138 | </property>
139 | <property>
140 | <name>zookeeper.session.timeout</name>
141 | <value>60000</value>
142 | </property>
143 | <property>
144 | <name>zookeeper.znode.parent</name>
145 | <value>/hbase</value>
146 | </property>
147 | <property>
148 | <name>zookeeper.znode.rootserver</name>
149 | <value>root-region-server</value>
150 | </property>
151 | <property>
152 | <name>hbase.zookeeper.quorum</name>
153 | <value>bottou03.sjc.cloudera.com,bottou04.sjc.cloudera.com,bottou05.sjc.cloudera.com</value>
154 | </property>
155 | <property>
156 | <name>hbase.zookeeper.property.clientPort</name>
157 | <value>2181</value>
158 | </property>
159 | </configuration>
--------------------------------------------------------------------------------
/src/main/scala/KafkaOffsetsBlogStreamingDriver.scala:
--------------------------------------------------------------------------------
1 | import kafka.utils.ZkUtils
2 | import org.apache.hadoop.hbase.filter.PrefixFilter
3 | import org.apache.hadoop.hbase.util.Bytes
4 | import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
5 | import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory}
6 | import org.apache.kafka.clients.consumer.ConsumerRecord
7 | import org.apache.kafka.common.TopicPartition
8 | import org.apache.kafka.common.serialization.StringDeserializer
9 | import org.apache.spark.streaming.kafka010.ConsumerStrategies._
10 | import org.apache.spark.streaming.kafka010.{OffsetRange, HasOffsetRanges, KafkaUtils}
11 | import org.apache.spark.streaming.kafka010.LocationStrategies._
12 | import org.apache.spark.streaming.{Seconds, StreamingContext}
13 | import org.apache.spark.{SparkContext, SparkConf}
14 |
15 |
16 |
17 | /**
18 | * Created by gmedasani on 6/10/17.
19 | */
20 | object KafkaOffsetsBlogStreamingDriver {
21 |
22 | def main(args: Array[String]) {
23 |
24 | if (args.length < 6) {
25 | System.err.println("Usage: KafkaOffsetsBlogStreamingDriver <batch-duration-in-seconds> <kafka-bootstrap-servers> " +
26 | "<kafka-topics> <kafka-consumer-group-id> <hbase-table-name> <zookeeper-quorum>")
27 | System.exit(1)
28 | }
29 |
30 | val batchDuration = args(0)
31 | val bootstrapServers = args(1).toString
32 | val topicsSet = args(2).toString.split(",").toSet
33 | val consumerGroupID = args(3)
34 | val hbaseTableName = args(4)
35 | val zkQuorum = args(5)
36 | val zkKafkaRootDir = "kafka"
37 | val zkSessionTimeOut = 10000
38 | val zkConnectionTimeOut = 10000
39 |
40 | val sparkConf = new SparkConf().setAppName("Kafka-Offset-Management-Blog")
41 | .setMaster("local[4]") // local[4] is only for testing on a workstation; remove this line when submitting to a cluster
42 | val sc = new SparkContext(sparkConf)
43 | val ssc = new StreamingContext(sc, Seconds(batchDuration.toLong))
44 | val topics = topicsSet.toArray
45 | val topic = topics(0)
46 |
47 | val kafkaParams = Map[String, Object](
48 | "bootstrap.servers" -> bootstrapServers,
49 | "key.deserializer" -> classOf[StringDeserializer],
50 | "value.deserializer" -> classOf[StringDeserializer],
51 | "group.id" -> consumerGroupID,
52 | "auto.offset.reset" -> "earliest",
53 | "enable.auto.commit" -> (false: java.lang.Boolean)
54 | )
55 |
56 | /*
57 | Create a dummy process that simply returns the message as is.
58 | */
59 | def processMessage(message:ConsumerRecord[String,String]):ConsumerRecord[String,String]={
60 | message
61 | }
62 |
63 | /*
64 | Save the until-offset of each partition for this batch into HBase: the row key is "topic:consumer-group:batch-time", and each partition's offset is written to the "offsets" column family, one column per partition.
65 | */
66 | def saveOffsets(TOPIC_NAME:String,GROUP_ID:String,offsetRanges:Array[OffsetRange],hbaseTableName:String,
67 | batchTime: org.apache.spark.streaming.Time) ={
68 | val hbaseConf = HBaseConfiguration.create()
69 | hbaseConf.addResource("src/main/resources/hbase-site.xml")
70 | val conn = ConnectionFactory.createConnection(hbaseConf)
71 | val table = conn.getTable(TableName.valueOf(hbaseTableName))
72 | val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(batchTime.milliseconds)
73 | val put = new Put(rowKey.getBytes)
74 | for(offset <- offsetRanges){
75 | put.addColumn(Bytes.toBytes("offsets"),Bytes.toBytes(offset.partition.toString),
76 | Bytes.toBytes(offset.untilOffset.toString))
77 | }
78 | table.put(put)
79 | conn.close()
80 | }
81 |
82 | /*
83 | Returns the last committed offsets for all partitions of a given topic from HBase. Three cases are handled:
84 | - CASE 1: The Spark Streaming job is started for the first time. The function gets the number of topic partitions from
85 | Zookeeper and, for each partition, returns the last committed offset as 0.
86 | - CASE 2: The Spark Streaming job is restarted and the number of partitions in the topic has not changed. The last
87 | committed offset for each topic-partition is returned from HBase as-is.
88 | - CASE 3: The Spark Streaming job is restarted and the number of partitions in the topic has increased. For the old
89 | partitions, the last committed offsets are returned from HBase as-is; for the newly added partitions, the
90 | function returns the last committed offset as 0.
91 | */
92 | def getLastCommittedOffsets(TOPIC_NAME:String,GROUP_ID:String,hbaseTableName:String,zkQuorum:String,
93 | zkRootDir:String, sessionTimeout:Int,connectionTimeOut:Int):Map[TopicPartition,Long] ={
94 |
95 | val hbaseConf = HBaseConfiguration.create()
96 | hbaseConf.addResource("src/main/resources/hbase-site.xml")
97 | val zkUrl = zkQuorum+"/"+zkRootDir
98 | val zkClientAndConnection = ZkUtils.createZkClientAndConnection(zkUrl,sessionTimeout,connectionTimeOut)
99 | val zkUtils = new ZkUtils(zkClientAndConnection._1, zkClientAndConnection._2,false)
100 | val zKNumberOfPartitionsForTopic = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME)).get(TOPIC_NAME).toList.head.size
101 |
102 | //Connect to HBase to retrieve last committed offsets
103 | val conn = ConnectionFactory.createConnection(hbaseConf)
104 | val table = conn.getTable(TableName.valueOf(hbaseTableName))
105 | val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis())
106 | val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0
107 | val scan = new Scan()
108 | val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes).setReversed(true))
109 | val result = scanner.next()
110 |
111 | var hbaseNumberOfPartitionsForTopic = 0 //Set the number of partitions discovered for a topic in HBase to 0
112 | if (result != null){
113 | //If the result from hbase scanner is not null, set number of partitions from hbase to the number of cells
114 | hbaseNumberOfPartitionsForTopic = result.listCells().size()
115 | }
116 |
117 | val fromOffsets = collection.mutable.Map[TopicPartition,Long]()
118 |
119 | if(hbaseNumberOfPartitionsForTopic == 0){
120 | // initialize fromOffsets to beginning
121 | for (partition <- 0 to zKNumberOfPartitionsForTopic-1){
122 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> 0)}
123 | } else if(zKNumberOfPartitionsForTopic > hbaseNumberOfPartitionsForTopic){
124 | // handle scenario where new partitions have been added to existing kafka topic
125 | for (partition <- 0 to hbaseNumberOfPartitionsForTopic-1){
126 | val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),Bytes.toBytes(partition.toString)))
127 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> fromOffset.toLong)}
128 | for (partition <- hbaseNumberOfPartitionsForTopic to zKNumberOfPartitionsForTopic-1){
129 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> 0)}
130 | } else {
131 | //initialize fromOffsets from last run
132 | for (partition <- 0 to hbaseNumberOfPartitionsForTopic-1 ){
133 | val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),Bytes.toBytes(partition.toString)))
134 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> fromOffset.toLong)}
135 | }
136 | scanner.close()
137 | conn.close()
138 | fromOffsets.toMap
139 | }
140 |
141 |
142 | val fromOffsets= getLastCommittedOffsets(topic,consumerGroupID,hbaseTableName,zkQuorum,zkKafkaRootDir,
143 | zkSessionTimeOut,zkConnectionTimeOut)
144 | val inputDStream = KafkaUtils.createDirectStream[String, String](ssc,PreferConsistent,Assign[String, String](
145 | fromOffsets.keys,kafkaParams,fromOffsets))
146 |
147 | /*
148 | For each RDD in a DStream apply a map transformation that processes the message.
149 | */
150 | inputDStream.foreachRDD((rdd,batchTime) => {
151 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
152 | offsetRanges.foreach(offset => println(offset.topic, offset.partition, offset.fromOffset,offset.untilOffset))
153 | val newRDD = rdd.map(message => processMessage(message))
154 | newRDD.count()
155 | saveOffsets(topic,consumerGroupID,offsetRanges,hbaseTableName,batchTime) //save the offsets to HBase
156 | })
157 |
158 | inputDStream.count().print() // print the number of messages processed in each batch
159 | ssc.start()
160 | ssc.awaitTermination()
161 | }
162 | }
163 |
--------------------------------------------------------------------------------