├── .gitignore
├── README.md
├── pom.xml
└── src
    └── main
        ├── resources
        │   └── conf
        │       ├── dev
        │       │   ├── application.properties
        │       │   └── log4j.properties
        │       ├── prod
        │       │   └── application.properties
        │       └── test
        │           ├── application.properties
        │           └── log4j.properties
        └── scala
            └── com
                ├── server
                │   ├── AppA.java
                │   └── StreamingOffsetLauncher.scala
                └── utils
                    ├── CommonUtils.scala
                    ├── GraceCloseUtils.scala
                    └── KafkaOffsetManager.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | .idea
3 | *.iml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Project Background
2 | ---------
3 | Our company's core real-time business is built with Spark Streaming 2.3.0 + Kafka 1.3 streaming technology. I have turned that work into a skeleton project and open-sourced it here, in the hope that others can consult it as a reference and avoid some of the detours we took.
4 | 
5 | Below are some notes and blog posts written while using it; interested readers can take a look:
6 | 
7 | - [How to manage Kafka offsets consumed by Spark Streaming (Part 1)](https://www.jianshu.com/p/9fc343879bbc)
8 | - [How to manage Kafka offsets consumed by Spark Streaming (Part 2)](https://www.jianshu.com/p/9bb983f86415)
9 | - [How to manage Kafka offsets consumed by Spark Streaming (Part 3)](https://www.jianshu.com/p/bf422de60e8b)
10 | - [Tuning Kafka data backlog after a Spark Streaming restart](https://www.jianshu.com/p/63f52743ae77)
11 | - [Handling backlogged Kafka data on a Spark Streaming cold start](https://www.jianshu.com/p/8f13735d40bd)
12 | - [How to stop a Spark Streaming service gracefully](https://www.jianshu.com/p/e92bd93fa1bc)
13 | - [Optimizing the graceful shutdown strategy for Spark Streaming](https://www.jianshu.com/p/2a7ec7e57130)
14 | 
15 | # Project Overview
16 | -----------
17 | This project shows how to store offsets manually in ZooKeeper when integrating Spark Streaming 2.3 with Kafka 1.3, because the built-in checkpoint mechanism has too many drawbacks and gets in the way of upgrading and releasing the project. It also fixes several bugs encountered along the way. The example code is already running in our production environment, so it can safely be used as a reference.
18 | 
19 | # Main Features
20 | ------------
21 | 1. Provides a skeleton for quickly developing streaming programs with Spark Streaming + Kafka; most of the example code carries detailed comments.
22 | 2. Provides a way to manually manage Kafka offsets stored in ZooKeeper, and fixes several bugs, such as newly added partitions not being recognized by the restarted streaming job after Kafka partition expansion.
23 | 3. Provides a fairly simple and graceful way to shut down the Spark Streaming program.
24 | 
25 | # Personal Blog
26 | --------
27 | Jianshu: https://www.jianshu.com/u/41307d187d27
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com</groupId>
  <artifactId>sparkStreaming-offset-to-zk</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>

  <profiles>
    <profile>
      <id>dev</id>
      <properties>
        <env>conf/dev</env>
      </properties>
    </profile>
    <profile>
      <id>test</id>
      <properties>
        <env>conf/test</env>
      </properties>
      <activation>
        <activeByDefault>true</activeByDefault>
      </activation>
    </profile>
    <profile>
      <id>prod</id>
      <properties>
        <env>conf/prod</env>
      </properties>
    </profile>
  </profiles>

  <properties>
    <spark.version>2.3.0</spark.version>
    <scala.version>2.11.12</scala.version>
    <fast.version>1.2.15</fast.version>
    <kafka.version>0.10.2.1</kafka.version>
    <config.version>1.2.1</config.version>
    <zkclient.version>0.10</zkclient.version>
    <guava.version>18.0</guava.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>${kafka.version}</version>
    </dependency>
    <dependency>
      <groupId>com.101tec</groupId>
      <artifactId>zkclient</artifactId>
      <version>${zkclient.version}</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>${fast.version}</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>${guava.version}</version>
    </dependency>
    <dependency>
      <groupId>com.typesafe</groupId>
      <artifactId>config</artifactId>
      <version>${config.version}</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <resources>
      <resource>
        <filtering>true</filtering>
        <directory>${basedir}/src/main/resources/${env}</directory>
        <includes>
          <include>*.properties</include>
        </includes>
      </resource>
    </resources>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
--------------------------------------------------------------------------------
/src/main/resources/conf/dev/application.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangfengchao/sparkStreaming-offset-to-zk/1779910387ffc27f9e100dfa579ba89d230aef81/src/main/resources/conf/dev/application.properties
--------------------------------------------------------------------------------
/src/main/resources/conf/dev/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO,console,kafka
2 | 
3 | #log4j.logger.com.demo.kafka=DEBUG,kafka
4 | # appender kafka
5 | log4j.appender.kafka=kafka.producer.KafkaLog4jAppender
6 | log4j.appender.kafka.topic=elk_log_topic
7 | # multiple brokers are separated by comma ",".
8 | log4j.appender.kafka.brokerList=10.108.4.203:9092,10.108.4.204:9092,10.108.4.205:9092 9 | log4j.appender.kafka.compressionType=none 10 | log4j.appender.kafka.syncSend=false 11 | log4j.appender.kafka.layout=org.apache.log4j.PatternLayout 12 | #log4j.appender.kafka.layout.ConversionPattern=%d [%-5p] [%t] - [%l] %m%n 13 | log4j.appender.kafka.layout.ConversionPattern=[%d] [%p] [%t] %m%n 14 | 15 | # appender console 16 | log4j.appender.console=org.apache.log4j.ConsoleAppender 17 | log4j.appender.console.target=System.out 18 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.console.layout.ConversionPattern=[%d] [%p] [%t] %m%n 20 | #log4j.appender.console.layout.ConversionPattern=%d [%-5p] [%t] - [%l] %m%n -------------------------------------------------------------------------------- /src/main/resources/conf/prod/application.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangfengchao/sparkStreaming-offset-to-zk/1779910387ffc27f9e100dfa579ba89d230aef81/src/main/resources/conf/prod/application.properties -------------------------------------------------------------------------------- /src/main/resources/conf/test/application.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangfengchao/sparkStreaming-offset-to-zk/1779910387ffc27f9e100dfa579ba89d230aef81/src/main/resources/conf/test/application.properties -------------------------------------------------------------------------------- /src/main/resources/conf/test/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO,console,kafka 2 | 3 | #log4j.logger.com.demo.kafka=DEBUG,kafka 4 | # appender kafka 5 | log4j.appender.kafka=kafka.producer.KafkaLog4jAppender 6 | log4j.appender.kafka.topic=elk_log_topic 7 | # multiple brokers are separated by comma ",". 
8 | log4j.appender.kafka.brokerList=10.108.4.203:9092,10.108.4.204:9092,10.108.4.205:9092 9 | log4j.appender.kafka.compressionType=none 10 | log4j.appender.kafka.syncSend=false 11 | log4j.appender.kafka.layout=org.apache.log4j.PatternLayout 12 | #log4j.appender.kafka.layout.ConversionPattern=%d [%-5p] [%t] - [%l] %m%n 13 | log4j.appender.kafka.layout.ConversionPattern=[%d] [%p] [%t] %m%n 14 | 15 | # appender console 16 | log4j.appender.console=org.apache.log4j.ConsoleAppender 17 | log4j.appender.console.target=System.out 18 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.console.layout.ConversionPattern=[%d] [%p] [%t] %m%n 20 | #log4j.appender.console.layout.ConversionPattern=%d [%-5p] [%t] - [%l] %m%n -------------------------------------------------------------------------------- /src/main/scala/com/server/AppA.java: -------------------------------------------------------------------------------- 1 | package com.server; 2 | 3 | import org.apache.log4j.Logger; 4 | 5 | /** 6 | * Created by fc.w on 2018/05/31 7 | */ 8 | public class AppA { 9 | 10 | private static final Logger LOGGER = Logger.getLogger(AppA.class); 11 | public static void main(String[] args) throws InterruptedException { 12 | for (int i = 0; i < 20; i++) { 13 | LOGGER.info("Info [" + i + "]"); 14 | Thread.sleep(1000); 15 | } 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/server/StreamingOffsetLauncher.scala: -------------------------------------------------------------------------------- 1 | package com.server 2 | 3 | 4 | import com.utils.{CommonUtils, GraceCloseUtils, KafkaOffsetManager} 5 | import kafka.api.OffsetRequest 6 | import kafka.message.MessageAndMetadata 7 | import kafka.serializer.StringDecoder 8 | import kafka.utils.ZKStringSerializer 9 | import org.I0Itec.zkclient.ZkClient 10 | import org.apache.log4j.LogManager 11 | import org.apache.spark.SparkConf 12 | import org.apache.spark.streaming.dstream.InputDStream 13 | import org.apache.spark.streaming.kafka.KafkaUtils 14 | import org.apache.spark.streaming.{Seconds, StreamingContext} 15 | 16 | /** 17 | * Spark Streaming优雅关闭服务策略 和 冷启动时Kafka数据堆积优化 18 | * Created by fc.w on 2018/05/30 19 | */ 20 | object StreamingOffsetLauncher { 21 | 22 | lazy val log = LogManager.getLogger(StreamingOffsetLauncher.getClass) 23 | 24 | /** 25 | * 程序入口 26 | * @param args 27 | */ 28 | def main(args: Array[String]): Unit = { 29 | val ssc = StreamingContext.getOrCreate(CommonUtils.checkpoint, functionToCreateContext) 30 | 31 | ssc.start() 32 | 33 | // 方式一: 通过Http方式优雅的关闭策略 34 | GraceCloseUtils.daemonHttpServer(8012,ssc) 35 | // 方式二: 通过扫描HDFS文件来优雅的关闭 36 | // GraceCloseUtils.stopByMarkFile(ssc) 37 | //等待任务终止 38 | ssc.awaitTermination() 39 | 40 | // 系统提供的优雅关闭 41 | // sys.ShutdownHookThread 42 | // { 43 | // ssc.stop(true, true) 44 | // } 45 | // Runtime.getRuntime().addShutdownHook(new Thread() { 46 | // override def run() { 47 | // ssc.stop(true, true) 48 | // } 49 | // }) 50 | 51 | } 52 | 53 | /** 54 | * 主逻辑 55 | * @return 56 | */ 57 | def functionToCreateContext(): StreamingContext = { 58 | val conf = new SparkConf().setAppName("streaming_offset_to_zk_app") 59 | // if (CommonUtils.isLocal) conf.setMaster("local[1]") // local模式 60 | 61 | /* 启动优雅关闭服务 */ 62 | conf.set("spark.streaming.stopGracefullyOnShutdown", "true") 63 | /* Spark Streaming 重启后Kafka数据堆积调优 */ 64 | conf.set("spark.streaming.backpressure.enabled", "true") // 激活反压功能 65 | 
conf.set("spark.streaming.backpressure.initialRate", "5000") // 启动反压功能后,读取的最大数据量 66 | conf.set("spark.streaming.kafka.maxRatePerPartition", "2000") // 设置每秒每个分区最大获取日志数,控制处理数据量,保证数据均匀处理。 67 | 68 | var kafkaParams = Map[String, String]("bootstrap.servers" -> CommonUtils.kafkaServers) // 创建一个kafkaParams 69 | if (CommonUtils.firstReadLastest) kafkaParams += ("auto.offset.reset" -> OffsetRequest.LargestTimeString) // 从最新的开始消费 70 | // 创建zkClient注意最后一个参数最好是ZKStringSerializer类型的,不然写进去zk里面的偏移量是乱码 71 | val zkClient = new ZkClient(CommonUtils.zkServer, 30000, 30000, ZKStringSerializer) 72 | val zkOffsetPath = CommonUtils.zkOffsetPath 73 | val topicsSet = CommonUtils.topicSet 74 | 75 | val ssc = new StreamingContext(conf, Seconds(10)) 76 | val rdds: InputDStream[(String, String)] = createKafkaStream(ssc, kafkaParams,zkClient,zkOffsetPath, topicsSet) 77 | rdds.foreachRDD(rdd => { 78 | // 只处理有数据的rdd,没有数据的直接跳过 79 | if(!rdd.isEmpty()){ 80 | 81 | // 迭代分区,里面的代码是运行在executor上面 82 | rdd.foreachPartition(partitions => { 83 | 84 | //如果没有使用广播变量,连接资源就在这个地方初始化 85 | //比如数据库连接,hbase,elasticsearch,solr,等等 86 | partitions.foreach(msg => { 87 | log.warn("数据读取成功。。。") 88 | log.info("读取的数据:" + msg) 89 | }) 90 | }) 91 | } 92 | 93 | // 更新每个批次的偏移量到zk中,注意这段代码是在driver上执行的 94 | KafkaOffsetManager.saveOffsets(zkClient,zkOffsetPath,rdd) 95 | }) 96 | 97 | ssc 98 | } 99 | 100 | /** 101 | * 获取Kafka数据偏移量 102 | * @param ssc StreamingContext 103 | * @param kafkaParams 配置kafka的参数 104 | * @param zkClient zk连接的client 105 | * @param zkOffsetPath zk里面偏移量的路径 106 | * @param topics 需要处理的topic 107 | * @return InputDStream[(String, String)] 返回输入流 108 | */ 109 | def createKafkaStream(ssc: StreamingContext, 110 | kafkaParams: Map[String, String], 111 | zkClient: ZkClient, 112 | zkOffsetPath: String, 113 | topics: Set[String]): InputDStream[(String, String)] = { 114 | // 目前仅支持一个topic的偏移量处理,读取zk里面偏移量字符串 115 | val zkOffsetData = KafkaOffsetManager.readOffsets(zkClient, zkOffsetPath, topics.last) 116 | val kafkaStream = zkOffsetData match { 117 | case None => //如果从zk里面没有读到偏移量,就说明是系统第一次启动 118 | log.info("系统第一次启动,没有读取到偏移量,默认就最新的offset开始消费") 119 | // 使用最新的偏移量创建DirectStream 120 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics) 121 | 122 | case Some(lastStopOffset) => 123 | log.info("从zk中读取到偏移量,从上次的偏移量开始消费数据......") 124 | val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message) 125 | // 使用上次停止时候的偏移量创建DirectStream 126 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, lastStopOffset, messageHandler) 127 | 128 | } 129 | 130 | kafkaStream // 返回创建的kafkaStream 131 | } 132 | 133 | } 134 | -------------------------------------------------------------------------------- /src/main/scala/com/utils/CommonUtils.scala: -------------------------------------------------------------------------------- 1 | package com.utils 2 | 3 | import com.typesafe.config.ConfigFactory 4 | 5 | /** 6 | * 读取配置文件 7 | * Created by fc.w on 2018/05/29 8 | */ 9 | object CommonUtils { 10 | 11 | val config = ConfigFactory.load() 12 | 13 | val env = config.getString("env") 14 | val isLocal = if (env == "dev") true else false // 是否使用本地模式 15 | val checkpoint = config.getString("checkpoint") 16 | val firstReadLastest = true // 第一次启动是否从最新位置开始消费 17 | val topicSet = config.getString("kafka_topics").split(",").toSet 18 | val zkOffsetPath =config.getString("zkOffsetPath") // zk的路径 19 | val kafkaServers = config.getString("kafkaServers") // Kafka服务地址 20 | val 
zkServer = config.getString("zkServer") // ZK服务地址 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/utils/GraceCloseUtils.scala: -------------------------------------------------------------------------------- 1 | package com.utils 2 | 3 | import javax.servlet.http.{HttpServletRequest, HttpServletResponse} 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.hadoop.fs.Path 6 | import org.apache.log4j.LogManager 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.spark_project.jetty.server.handler.{AbstractHandler, ContextHandler} 9 | import org.spark_project.jetty.server.{Request, Server} 10 | 11 | /** 12 | * Streaming 两种优雅的停止策略: 13 | * 1. 通过http服务 14 | * 2. 通过扫描hdfs文件 15 | * Created by fc.w on 2018/05/30 16 | */ 17 | object GraceCloseUtils { 18 | 19 | lazy val log = LogManager.getLogger("GraceCloseUtils") 20 | 21 | /** 22 | * 1. HTTP方式 23 | * 负责启动守护的jetty服务 24 | * @param port 对外暴露的端口号 25 | * @param ssc Stream上下文 26 | */ 27 | def daemonHttpServer(port:Int, ssc: StreamingContext) = { 28 | val server = new Server(port) 29 | val context = new ContextHandler() 30 | context.setContextPath("/close") 31 | context.setHandler(new CloseStreamHandler(ssc)) 32 | server.setHandler(context) 33 | server.start() 34 | } 35 | 36 | /** 37 | * 负责接受http请求来优雅的关闭流 38 | * @param ssc Stream上下文 39 | */ 40 | class CloseStreamHandler(ssc:StreamingContext) extends AbstractHandler { 41 | override def handle(s: String, baseRequest: Request, req: HttpServletRequest, response: HttpServletResponse): Unit = { 42 | log.warn("开始关闭......") 43 | // 优雅的关闭 44 | ssc.stop(true, true) 45 | response.setContentType("text/html; charset=utf-8") 46 | response.setStatus(HttpServletResponse.SC_OK) 47 | val out = response.getWriter 48 | out.println("Close Success") 49 | baseRequest.setHandled(true) 50 | log.warn("关闭成功.....") 51 | } 52 | 53 | } 54 | 55 | 56 | /** 57 | * 2. HDFS文件检测方式 58 | * 通过一个消息文件来定时触发是否需要关闭流程序 59 | * @param ssc StreamingContext 60 | */ 61 | def stopByMarkFile(ssc:StreamingContext): Unit = { 62 | val intervalMills = 10 * 1000 // 每隔10秒扫描一次消息是否存在 63 | var isStop = false 64 | val hdfsFilePath = "/spark/streaming/stop" // 判断消息文件是否存在 65 | while (!isStop) { 66 | isStop = ssc.awaitTerminationOrTimeout(intervalMills) 67 | if (! 
isStop && isExistsMarkFile(hdfsFilePath)) { 68 | log.warn("2秒后开始关闭sparstreaming程序.....") 69 | Thread.sleep(2000) 70 | ssc.stop(true, true) 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * 判断是否存在mark file 77 | * @param hdfsFilePath mark文件的路径 78 | * @return 79 | */ 80 | def isExistsMarkFile(hdfsFilePath: String): Boolean = { 81 | val conf = new Configuration() 82 | val path = new Path(hdfsFilePath) 83 | val fs = path.getFileSystem(conf) 84 | fs.exists(path) 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/com/utils/KafkaOffsetManager.scala: -------------------------------------------------------------------------------- 1 | package com.utils 2 | 3 | import kafka.common.TopicAndPartition 4 | import kafka.utils.ZkUtils 5 | import org.I0Itec.zkclient.ZkClient 6 | import org.apache.log4j.LogManager 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.streaming.kafka.HasOffsetRanges 9 | 10 | /** 11 | * 负责kafka偏移量的读取和保存 12 | * Created by fc.w on 2018/05/30 13 | */ 14 | object KafkaOffsetManager { 15 | 16 | lazy val log = LogManager.getLogger(KafkaOffsetManager.getClass) 17 | 18 | /** 19 | * 读取zk里面的偏移量,如果有就返回对应的分区和偏移量 20 | * 如果没有就返回None 21 | * @param zkClient zk连接的client 22 | * @param zkOffsetPath 偏移量路径 23 | * @param topic topic名字 24 | * @return 偏移量Map or None 25 | */ 26 | def readOffsets(zkClient: ZkClient, zkOffsetPath: String, topic: String): Option[Map[TopicAndPartition, Long]] = { 27 | //(偏移量字符串,zk元数据) 28 | val (offsetsRangesStrOpt, _) = ZkUtils.readDataMaybeNull(zkClient, zkOffsetPath) 29 | offsetsRangesStrOpt match { 30 | case Some(offsetsRangesStr) => 31 | // 获取这个topic在ZK里面最新的分区数量 32 | val lastest_partitions = ZkUtils.getPartitionsForTopics(zkClient,Seq(topic)).get(topic).get 33 | var offsets = offsetsRangesStr.split(",") // 按逗号split成数组 34 | .map(s => s.split(":")) // 按冒号拆分每个分区和偏移量 35 | .map{case Array(partitionStr, offsetStr) => (TopicAndPartition(topic, partitionStr.toInt) -> offsetStr.toLong)} 36 | .toMap 37 | 38 | // 说明有分区扩展了 39 | if (offsets.size < lastest_partitions.size) { 40 | // 得到旧的所有分区序号 41 | val oldPartitions = offsets.keys.map(p => p.partition).toArray 42 | // 通过做差集得出来多的分区数量数组 43 | val addPartitions=lastest_partitions.diff(oldPartitions) 44 | if(addPartitions.size > 0){ 45 | log.warn("发现kafka新增分区:"+addPartitions.mkString(",")) 46 | addPartitions.foreach(partitionId => { 47 | offsets += (TopicAndPartition(topic,partitionId) -> 0) 48 | log.warn("新增分区id:"+partitionId+"添加完毕....") 49 | }) 50 | } 51 | } else { 52 | log.warn("没有发现新增的kafka分区:"+lastest_partitions.mkString(",")) 53 | } 54 | 55 | Some(offsets)// 将Map返回 56 | case None => 57 | None // 如果是null,就返回None 58 | 59 | } 60 | } 61 | 62 | /** 63 | * 保存每个批次的rdd的offset到zk中 64 | * @param zkClient zk连接的client 65 | * @param zkOffsetPath 偏移量路径 66 | * @param rdd 每个批次的rdd 67 | */ 68 | def saveOffsets(zkClient: ZkClient, zkOffsetPath: String, rdd: RDD[_]): Unit = { 69 | // 转换rdd为Array[OffsetRange] 70 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 71 | // 转换每个OffsetRange为存储到zk时的字符串格式 : 分区序号1:偏移量1, 分区序号2:偏移量2,...... 
72 | val offsetsRangesStr = offsetRanges.map(offsetRange => s"${offsetRange.partition}:${offsetRange.untilOffset}").mkString(",") 73 | log.debug(" 保存的偏移量: " + offsetsRangesStr) 74 | // 将最终的字符串结果保存到zk里面 75 | ZkUtils.updatePersistentPath(zkClient, zkOffsetPath, offsetsRangesStr) 76 | } 77 | 78 | 79 | class Stopwatch { 80 | private val start = System.currentTimeMillis() 81 | def get():Long = (System.currentTimeMillis() - start) 82 | } 83 | 84 | } 85 | --------------------------------------------------------------------------------
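The application.properties files are referenced above only by their raw GitHub URLs, so their contents do not appear in this listing. As a rough orientation, a dev configuration could look like the sketch below, derived from the keys that CommonUtils.scala reads (env, checkpoint, kafka_topics, zkOffsetPath, kafkaServers, zkServer); every value here is an illustrative placeholder (the broker list simply reuses the hosts from the log4j examples), not the project's real settings:

env=dev
checkpoint=/spark/streaming/checkpoint/offset-to-zk
kafka_topics=test_topic
zkOffsetPath=/spark/streaming/offsets/test_topic
kafkaServers=10.108.4.203:9092,10.108.4.204:9092,10.108.4.205:9092
zkServer=10.108.4.203:2181,10.108.4.204:2181,10.108.4.205:2181

Note that kafka_topics is split on "," into a set, but createKafkaStream only reads offsets for topics.last, so listing a single topic is the safest choice. Once the job is running, the graceful HTTP shutdown started by GraceCloseUtils.daemonHttpServer(8012, ssc) is triggered by sending an HTTP request to the /close context on port 8012 of the driver host.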