├── .gitignore ├── README.md ├── build.sbt ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt └── src └── main ├── resources └── hbase-site.xml └── scala └── KafkaOffsetsBlogStreamingDriver.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | 19 | #Macbook specific stuff 20 | .DS_Store/ 21 | 22 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 23 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 24 | 25 | # Entire .idea directory 26 | .idea/ 27 | 28 | # User-specific stuff: 29 | .idea/**/workspace.xml 30 | .idea/**/tasks.xml 31 | .idea/dictionaries 32 | 33 | # Sensitive or high-churn files: 34 | .idea/**/dataSources/ 35 | .idea/**/dataSources.ids 36 | .idea/**/dataSources.xml 37 | .idea/**/dataSources.local.xml 38 | .idea/**/sqlDataSources.xml 39 | .idea/**/dynamic.xml 40 | .idea/**/uiDesigner.xml 41 | 42 | # Gradle: 43 | .idea/**/gradle.xml 44 | .idea/**/libraries 45 | 46 | # CMake 47 | cmake-build-debug/ 48 | 49 | # Mongo Explorer plugin: 50 | .idea/**/mongoSettings.xml 51 | 52 | ## File-based project format: 53 | *.iws 54 | 55 | ## Plugin-specific files: 56 | 57 | # IntelliJ 58 | /out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Offset Management For Apache Kafka With Apache Spark Streaming 2 | 3 | Spark Streaming,Kafka and HBase code accompanying the blog 'Offset Management For Apache Kafka With Apache Spark Streaming'. 4 | 5 | Link to the blog - http://blog.cloudera.com/blog/2017/06/offset-management-for-apache-kafka-with-apache-spark-streaming/ 6 | 7 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import _root_.sbtassembly.AssemblyPlugin.autoImport._ 2 | import _root_.sbtassembly.PathList 3 | 4 | name := "spark-streaming-kafka-cdh511-testing" 5 | 6 | version := "1.0" 7 | 8 | scalaVersion := "2.11.9" 9 | 10 | 11 | libraryDependencies ++= Seq( 12 | "org.apache.spark" %% "spark-streaming" % "2.1.0.cloudera1", 13 | "org.apache.spark" %% "spark-core" % "2.1.0.cloudera1" excludeAll ExclusionRule(organization = "javax.servlet"), 14 | "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.1.0.cloudera1", 15 | "org.apache.hbase" % "hbase-client" % "1.2.0-cdh5.11.0", 16 | "org.apache.hbase" % "hbase-common" % "1.2.0-cdh5.11.0" 17 | ) 18 | 19 | 20 | assemblyMergeStrategy in assembly := { 21 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last 22 | case PathList("javax", "activation", xs @ _*) => MergeStrategy.last 23 | case PathList("org", "apache", xs @ _*) => MergeStrategy.last 24 | case PathList("com", "google", xs @ _*) => MergeStrategy.last 25 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last 26 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last 27 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last 28 | case "about.html" => MergeStrategy.rename 29 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last 30 | case "META-INF/mailcap" => MergeStrategy.last 31 | case "META-INF/mimetypes.default" => MergeStrategy.last 32 | case "plugin.properties" => MergeStrategy.last 33 | case "log4j.properties" => MergeStrategy.last 34 | case x => 35 | val oldStrategy = (assemblyMergeStrategy in assembly).value 36 | oldStrategy(x) 37 | } 38 | 39 | resolvers ++= Seq( 40 | "Akka Repository" at "http://repo.akka.io/releases/", 41 | "Maven Central Server" at "http://repo1.maven.org/maven2", 42 | "Cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos", 43 | "Typesafe Repo" at "http://repo.typesafe.com/typesafe/releases/" 44 | ) -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /src/main/resources/hbase-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | hbase.rootdir 7 | hdfs://bottou02.sjc.cloudera.com:8020/hbase 8 | 9 | 10 | hbase.master.port 11 | 60000 12 | 13 | 14 | hbase.master.ipc.address 15 | 0.0.0.0 16 | 17 | 18 | hbase.cluster.distributed 19 | true 20 | 21 | 22 | hbase.master.info.port 23 | 60010 24 | 25 | 26 | hbase.client.write.buffer 27 | 2097152 28 | 29 | 30 | hbase.client.pause 31 | 100 32 | 33 | 34 | hbase.client.retries.number 35 | 35 36 | 37 | 38 | hbase.client.scanner.caching 39 | 100 40 | 41 | 42 | hbase.client.keyvalue.maxsize 43 | 10485760 44 | 45 | 46 | hbase.ipc.client.allowsInterrupt 47 | true 48 | 49 | 50 | hbase.client.primaryCallTimeout.get 51 | 10 52 | 53 | 54 | hbase.client.primaryCallTimeout.multiget 55 | 10 56 | 57 | 58 | hbase.master.handler.count 59 | 25 60 | 61 | 62 | hbase.master.executor.openregion.threads 63 | 5 64 | 65 | 66 | hbase.master.executor.closeregion.threads 67 | 5 68 | 69 | 70 | hbase.master.executor.serverops.threads 71 | 5 72 | 73 | 74 | hbase.splitlog.manager.timeout 75 | 120000 76 | 77 | 78 | hbase.master.logcleaner.ttl 79 | 60000 80 | 81 | 82 | hbase.regionserver.info.port 83 | 60030 84 | 85 | 86 | hbase.regionserver.handler.count 87 | 30 88 | 89 | 90 | hbase.regionserver.metahandler.count 91 | 10 92 | 93 | 94 | hbase.server.thread.wakefrequency 95 | 10000 96 | 97 | 98 | hbase.coprocessor.abortonerror 99 | false 100 | 101 | 102 | hbase.superuser 103 | 104 | 105 | 106 | hbase.rpc.timeout 107 | 60000 108 | 109 | 110 | hbase.snapshot.enabled 111 | true 112 | 113 | 114 | hbase.snapshot.master.timeoutMillis 115 | 60000 116 | 117 | 118 | hbase.snapshot.region.timeout 119 | 60000 120 | 121 | 122 | hbase.snapshot.master.timeout.millis 123 | 60000 124 | 125 | 126 | hbase.security.authentication 127 | simple 128 | 129 | 130 | hbase.security.authorization 131 | false 132 | 133 | 134 | hbase.row.level.authorization 135 | false 136 | 137 | 138 | hbase.rpc.protection 139 | authentication 140 | 141 | 142 | zookeeper.session.timeout 143 | 60000 144 | 145 | 146 | zookeeper.znode.parent 147 | /hbase 148 | 149 | 150 | zookeeper.znode.rootserver 151 | root-region-server 152 | 153 | 154 | hbase.zookeeper.quorum 155 | bottou03.sjc.cloudera.com,bottou04.sjc.cloudera.com,bottou05.sjc.cloudera.com 156 | 157 | 158 | hbase.zookeeper.property.clientPort 159 | 2181 160 | 161 | -------------------------------------------------------------------------------- /src/main/scala/KafkaOffsetsBlogStreamingDriver.scala: -------------------------------------------------------------------------------- 1 | import kafka.utils.ZkUtils 2 | import org.apache.hadoop.hbase.filter.PrefixFilter 3 | import org.apache.hadoop.hbase.util.Bytes 4 | import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} 5 | import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory} 6 | import org.apache.kafka.clients.consumer.ConsumerRecord 7 | import org.apache.kafka.common.TopicPartition 8 | import org.apache.kafka.common.serialization.StringDeserializer 9 | import org.apache.spark.streaming.kafka010.ConsumerStrategies._ 10 | import org.apache.spark.streaming.kafka010.{OffsetRange, HasOffsetRanges, KafkaUtils} 11 | import org.apache.spark.streaming.kafka010.LocationStrategies._ 12 | import org.apache.spark.streaming.{Seconds, StreamingContext} 13 | import org.apache.spark.{SparkContext, SparkConf} 14 | 15 | 16 | 17 | /** 18 | * Created by gmedasani on 6/10/17. 19 | */ 20 | object KafkaOffsetsBlogStreamingDriver { 21 | 22 | def main(args: Array[String]) { 23 | 24 | if (args.length < 6) { 25 | System.err.println("Usage: KafkaDirectStreamTest " + 26 | " ") 27 | System.exit(1) 28 | } 29 | 30 | val batchDuration = args(0) 31 | val bootstrapServers = args(1).toString 32 | val topicsSet = args(2).toString.split(",").toSet 33 | val consumerGroupID = args(3) 34 | val hbaseTableName = args(4) 35 | val zkQuorum = args(5) 36 | val zkKafkaRootDir = "kafka" 37 | val zkSessionTimeOut = 10000 38 | val zkConnectionTimeOut = 10000 39 | 40 | val sparkConf = new SparkConf().setAppName("Kafka-Offset-Management-Blog") 41 | .setMaster("local[4]")//Uncomment this line to test while developing on a workstation 42 | val sc = new SparkContext(sparkConf) 43 | val ssc = new StreamingContext(sc, Seconds(batchDuration.toLong)) 44 | val topics = topicsSet.toArray 45 | val topic = topics(0) 46 | 47 | val kafkaParams = Map[String, Object]( 48 | "bootstrap.servers" -> bootstrapServers, 49 | "key.deserializer" -> classOf[StringDeserializer], 50 | "value.deserializer" -> classOf[StringDeserializer], 51 | "group.id" -> consumerGroupID, 52 | "auto.offset.reset" -> "earliest", 53 | "enable.auto.commit" -> (false: java.lang.Boolean) 54 | ) 55 | 56 | /* 57 | Create a dummy process that simply returns the message as is. 58 | */ 59 | def processMessage(message:ConsumerRecord[String,String]):ConsumerRecord[String,String]={ 60 | message 61 | } 62 | 63 | /* 64 | Save Offsets into HBase 65 | */ 66 | def saveOffsets(TOPIC_NAME:String,GROUP_ID:String,offsetRanges:Array[OffsetRange],hbaseTableName:String, 67 | batchTime: org.apache.spark.streaming.Time) ={ 68 | val hbaseConf = HBaseConfiguration.create() 69 | hbaseConf.addResource("src/main/resources/hbase-site.xml") 70 | val conn = ConnectionFactory.createConnection(hbaseConf) 71 | val table = conn.getTable(TableName.valueOf(hbaseTableName)) 72 | val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(batchTime.milliseconds) 73 | val put = new Put(rowKey.getBytes) 74 | for(offset <- offsetRanges){ 75 | put.addColumn(Bytes.toBytes("offsets"),Bytes.toBytes(offset.partition.toString), 76 | Bytes.toBytes(offset.untilOffset.toString)) 77 | } 78 | table.put(put) 79 | conn.close() 80 | } 81 | 82 | /* 83 | Returns last committed offsets for all the partitions of a given topic from HBase in following cases. 84 | - CASE 1: SparkStreaming job is started for the first time. This function gets the number of topic partitions from 85 | Zookeeper and for each partition returns the last committed offset as 0 86 | - CASE 2: SparkStreaming is restarted and there are no changes to the number of partitions in a topic. Last 87 | committed offsets for each topic-partition is returned as is from HBase. 88 | - CASE 3: SparkStreaming is restarted and the number of partitions in a topic increased. For old partitions, last 89 | committed offsets for each topic-partition is returned as is from HBase as is. For newly added partitions, 90 | function returns last committed offsets as 0 91 | */ 92 | def getLastCommittedOffsets(TOPIC_NAME:String,GROUP_ID:String,hbaseTableName:String,zkQuorum:String, 93 | zkRootDir:String, sessionTimeout:Int,connectionTimeOut:Int):Map[TopicPartition,Long] ={ 94 | 95 | val hbaseConf = HBaseConfiguration.create() 96 | hbaseConf.addResource("src/main/resources/hbase-site.xml") 97 | val zkUrl = zkQuorum+"/"+zkRootDir 98 | val zkClientAndConnection = ZkUtils.createZkClientAndConnection(zkUrl,sessionTimeout,connectionTimeOut) 99 | val zkUtils = new ZkUtils(zkClientAndConnection._1, zkClientAndConnection._2,false) 100 | val zKNumberOfPartitionsForTopic = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME)).get(TOPIC_NAME).toList.head.size 101 | 102 | //Connect to HBase to retrieve last committed offsets 103 | val conn = ConnectionFactory.createConnection(hbaseConf) 104 | val table = conn.getTable(TableName.valueOf(hbaseTableName)) 105 | val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis()) 106 | val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0 107 | val scan = new Scan() 108 | val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes).setReversed(true)) 109 | val result = scanner.next() 110 | 111 | var hbaseNumberOfPartitionsForTopic = 0 //Set the number of partitions discovered for a topic in HBase to 0 112 | if (result != null){ 113 | //If the result from hbase scanner is not null, set number of partitions from hbase to the number of cells 114 | hbaseNumberOfPartitionsForTopic = result.listCells().size() 115 | } 116 | 117 | val fromOffsets = collection.mutable.Map[TopicPartition,Long]() 118 | 119 | if(hbaseNumberOfPartitionsForTopic == 0){ 120 | // initialize fromOffsets to beginning 121 | for (partition <- 0 to zKNumberOfPartitionsForTopic-1){ 122 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> 0)} 123 | } else if(zKNumberOfPartitionsForTopic > hbaseNumberOfPartitionsForTopic){ 124 | // handle scenario where new partitions have been added to existing kafka topic 125 | for (partition <- 0 to hbaseNumberOfPartitionsForTopic-1){ 126 | val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),Bytes.toBytes(partition.toString))) 127 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> fromOffset.toLong)} 128 | for (partition <- hbaseNumberOfPartitionsForTopic to zKNumberOfPartitionsForTopic-1){ 129 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> 0)} 130 | } else { 131 | //initialize fromOffsets from last run 132 | for (partition <- 0 to hbaseNumberOfPartitionsForTopic-1 ){ 133 | val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),Bytes.toBytes(partition.toString))) 134 | fromOffsets += (new TopicPartition(TOPIC_NAME,partition) -> fromOffset.toLong)} 135 | } 136 | scanner.close() 137 | conn.close() 138 | fromOffsets.toMap 139 | } 140 | 141 | 142 | val fromOffsets= getLastCommittedOffsets(topic,consumerGroupID,hbaseTableName,zkQuorum,zkKafkaRootDir, 143 | zkSessionTimeOut,zkConnectionTimeOut) 144 | val inputDStream = KafkaUtils.createDirectStream[String, String](ssc,PreferConsistent,Assign[String, String]( 145 | fromOffsets.keys,kafkaParams,fromOffsets)) 146 | 147 | /* 148 | For each RDD in a DStream apply a map transformation that processes the message. 149 | */ 150 | inputDStream.foreachRDD((rdd,batchTime) => { 151 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 152 | offsetRanges.foreach(offset => println(offset.topic, offset.partition, offset.fromOffset,offset.untilOffset)) 153 | val newRDD = rdd.map(message => processMessage(message)) 154 | newRDD.count() 155 | saveOffsets(topic,consumerGroupID,offsetRanges,hbaseTableName,batchTime) //save the offsets to HBase 156 | }) 157 | 158 | println("Number of messages processed " + inputDStream.count()) 159 | ssc.start() 160 | ssc.awaitTermination() 161 | } 162 | } 163 | --------------------------------------------------------------------------------