├── README.md ├── pom.xml └── src └── main ├── resources ├── advUrlCount.log ├── bs_log │ ├── 19735E1C66.log │ ├── DDE7970F68.log │ └── E549D940E0.log └── ip.txt └── scala ├── com └── zxl │ ├── spark1_6 │ ├── dataframe │ │ └── SQLDemo.scala │ ├── elastic │ │ └── ElasticSpark.scala │ ├── flume │ │ └── FlumePushWordCount.scala │ ├── jedis │ │ └── JedisConnectionPool.scala │ ├── kafka │ │ ├── DirectKafkaWordCount.scala │ │ ├── KafkaWordCount.scala │ │ └── LoggerLevels.scala │ ├── my_partitioner │ │ └── UrlCountPartition.scala │ ├── my_sort │ │ └── CustomSort.scala │ ├── mysql │ │ └── JdbcRDDDemo.scala │ ├── simple │ │ ├── AdvUrlCount.scala │ │ ├── IpDemo.scala │ │ ├── UserLocation.scala │ │ └── WordCount.scala │ └── streaming │ │ ├── LoggerLevels.scala │ │ ├── StateFulWordCount.scala │ │ ├── StreamingWordCount.scala │ │ └── WindowOpts.scala │ └── spark2_2 │ ├── dataset │ ├── actions.scala │ ├── basicAction.scala │ └── createDataSet.scala │ ├── kafka │ ├── StreamingKafka10.scala │ └── StreamingKafka8.scala │ ├── streaming │ └── StreamingToMysql.scala │ └── structured │ ├── JDBCSink.scala │ ├── MySqlPool.scala │ └── StructuredStreamingKafka.scala └── org └── apache └── spark └── streaming └── kafka └── KafkaManager.scala /README.md: -------------------------------------------------------------------------------- 1 | # Spark-Example 2 | com.zxl.spark2_2.kafka 3 | 4 | StreamingKafka8: 5 | 6 | SparkStreaming从kafka中读取数据 7 | 8 | kafka版本0.8 9 | 10 | 采取直连方式 11 | 12 | StreamingKafka10: 13 | 14 | SparkStreaming从kafka中读取数据 15 | 16 | kafka版本0.10 17 | 18 | 采取直连方式 19 | 20 | com.zxl.spark2_2.streaming 21 | 22 | StreamingToMysql: 23 | 24 | SparkStreaming读取数据,存储到Mysql中 25 | 26 | com.zxl.spark2_2.structured 27 | 28 | JDBCSink: 29 | 30 | 处理从StructuredStreaming中向mysql中写入数据 31 | 32 | MySqlPool: 33 | 34 | 从mysql连接池中获取连接 35 | 36 | StructuredStreamingKafka: 37 | 38 | 结构化流从kafka中读取数据存储到关系型数据库mysql 39 | 40 | 目前结构化流对kafka的要求版本0.10及以上 41 | 42 | com.zxl.spark2_2.dataset 43 | 44 | createDataSet: 45 | 46 | DataSet创建的多种方式 47 | 48 | basicAction: 49 | 50 | DataSet的基本操作 51 | 52 | actions: 53 | 54 | DataSet的Action操作 55 | 1.map操作,flatMap操作 56 | 2.filter操作,where操作 57 | 3.去重操作 58 | 4.加法/减法操作 59 | 5.select操作 60 | 6.排序操作 61 | 7.分割抽样操作 62 | 8.列操作 63 | 9.join操作 64 | 10.分组聚合操作 65 | 66 | com.zxl.spark1_6.dataframe 67 | 68 | SQLDemo: 69 | 70 | 从hdfs中读取数据,转化为DataFrame,执行简单操作 71 | 72 | com.zxl.spark1_6.elastic 73 | 74 | ElasticSpark: 75 | 76 | Elasticsearch是一个基于Lucene的实时地分布式搜索和分析引擎。 77 | 78 | 设计用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便。 79 | 80 | com.zxl.spark1_6.flume 81 | 82 | FlumePushWordCount: 83 | 84 | flume向spark发送数据 85 | 86 | 添加三个jar包 87 | 88 | - commons-lang3-3.3.2.jar 89 | 90 | - scala-library-2.10.5.jar 91 | 92 | - spark-streaming-flume-sink_2.10-1.6.1.jar 93 | 94 | 打成jar包上传到集群中运行 95 | 96 | 集群命令如下: 97 | 98 | bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.flume.FlumePushWordCount /jar/____.jar 192.168.13.131 8888 99 | 100 | com.zxl.spark1_6.jedis 101 | 102 | JedisConnectionPool: 103 | 104 | 获得Jedis连接,进行简单操作 105 | 106 | com.zxl.spark1_6.kafka 107 | 108 | DirectKafkaWordCount: 109 | 110 | Spark Streaming维护偏移量相关的信息,实现零数据丢失,保证不重复消费 111 | 112 | 采用直连的方式有一个缺点,就是不再向zookeeper中更新offset信息。 113 | 114 | 因此,在采用直连的方式消费kafka中的数据的时候,大体思路是首先获取保存在zookeeper中的偏移量信息, 115 | 116 | 根据偏移量信息去创建stream,消费数据后再把当前的偏移量写入zookeeper中 117 | 118 | 在2.0以前的版本中KafkaManager这个类是private权限,需要把它拷贝到项目里使用。 119 | org.apache.spark.streaming.kafka 120 | 121 | KafkaWordCount: 122 | 123 | 从集群中的kafka读取数据操作 124 | 125 | 运行时参数: 126 | 127 | node1:2181,node2:2181,node3:2181 
g1 test 2 128 | 129 | 其中g1为组名,此处随意写,test为topic名,kafka中的topic名要一致 130 | 131 | 集群命令(需先启动完成): 132 | 133 | 1.启动kafak 134 | 135 | bin/kafka-server-start.sh config/server.properties > /dev/null 2>&1 & 136 | 137 | 2.创建topic 138 | 139 | bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 3 --partitions 3 --topic test 140 | 141 | 3.向topic中添加数据 142 | 143 | bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test 144 | 145 | com.zxl.spark1_6.my_partitioner 146 | 147 | UrlCountPartition: 148 | 149 | 自定义分区 150 | 151 | 数据格式(时间点 url地址),例如: 152 | 20160321101954 http://net.zxl.cn/net/video.shtml 153 | 154 | 处理成数据(k, v) 155 | 156 | 对于数据(k, v) 157 | 158 | 重写自己的 partitioner 159 | 160 | com.zxl.spark1_6.my_sort 161 | 162 | CustomSort:自定义排序 163 | 164 | com.zxl.spark1_6.mysql 165 | 166 | JdbcRDDDemo:简单连接数据库操作 167 | 168 | com.zxl.spark1_6.simple 169 | 170 | AdvUrlCount: 171 | 172 | 读取文本内容,根据指定的学科, 取出点击量前三的 173 | 174 | 文本内容为某广告链接点击量,格式为:(时间点 某学科url链接) 175 | 176 | 举例:(20160321101957 http://net.zxl.cn/net/course.shtml) 177 | 178 | IpDemo: 179 | 180 | 数据格式如下: 181 | (1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302) 182 | 183 | 根据ip地址转换为数字,从数据集中找出详细信息. 184 | 185 | 为了简化查找速率,采用二分查找. 186 | 187 | UserLocation: 188 | 189 | 根据日志统计出每个用户在站点所呆时间最长的前2个的信息 190 | 191 | 日志内容格式为(手机号,时间点,基站站点,事件类型),事件类型为1时是进入基站,0是出基站。 192 | 193 | 1, 先根据"手机号_站点"为唯一标识, 算一次进站出站的时间, 返回(手机号_站点, 时间间隔) 194 | 195 | 2, 以"手机号_站点"为key, 统计每个站点的时间总和, ("手机号_站点", 时间总和) 196 | 197 | 3, ("手机号_站点", 时间总和) --> (手机号, 站点, 时间总和) 198 | 199 | 4, (手机号, 站点, 时间总和) --> groupBy().mapValues(以时间排序,取出前2个) --> (手机->((m,s,t)(m,s,t))) 200 | 201 | WordCount: 202 | 203 | 简单WordCount实现 204 | 205 | 集群上执行示例,指定相关配置 206 | 207 | bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.simple.WordCount --executor-memory 512m --total-executor-cores 2 /opt/soft/jar/hello-spark-1.0.jar hdfs://node1:9000/wc hdfs://node1:9000/out 208 | 209 | com.zxl.spark1_6.streaming 210 | 211 | LoggerLevels: 212 | 213 | 设置打印的log的级别 214 | 215 | StateFulWordCount: 216 | 217 | Spark Streaming累加器操作(updateStateByKey) 218 | 219 | StreamingWordCount: 220 | 221 | 通过SparkStreaming简单实现WordCount 222 | 223 | WindowOpts: 224 | 225 | SparkStreaming窗口函数的实现 226 | 227 | org.apache.spark.streaming.kafka 228 | 229 | KafkaManager: 230 | 231 | SparkStreaming直连kafka获取数据,自己编写偏移量offset,用于spark2.0以前 232 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.zxl 8 | spark-example 9 | 1.0 10 | 11 | 12 | 1.8 13 | 1.8 14 | UTF-8 15 | 2.11.8 16 | 2.2.0 17 | 2.6.4 18 | 19 | 20 | 21 | 22 | org.scala-lang 23 | scala-library 24 | ${scala.version} 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-core_2.11 30 | ${spark.version} 31 | 32 | 33 | 34 | org.apache.hadoop 35 | hadoop-client 36 | ${hadoop.version} 37 | 38 | 39 | 40 | mysql 41 | mysql-connector-java 42 | 5.1.32 43 | 44 | 45 | 46 | org.apache.spark 47 | spark-sql_2.11 48 | ${spark.version} 49 | 50 | 51 | 52 | org.apache.spark 53 | spark-hive_2.11 54 | ${spark.version} 55 | 56 | 57 | org.apache.hive 58 | hive-jdbc 59 | ${spark.version} 60 | 61 | 62 | 63 | org.apache.spark 64 | spark-streaming_2.11 65 | ${spark.version} 66 | 67 | 68 | 69 | org.apache.spark 70 | spark-streaming-flume_2.11 71 | ${spark.version} 72 | 73 | 74 | 75 | org.apache.spark 76 | spark-streaming-kafka-0-8_2.11 77 | ${spark.version} 78 | 79 | 80 | 81 | org.apache.spark 82 | 
spark-streaming-kafka-0-10_2.11 83 | ${spark.version} 84 | 85 | 86 | 87 | org.apache.spark 88 | spark-sql-kafka-0-10_2.11 89 | ${spark.version} 90 | 91 | 92 | 93 | org.apache.spark 94 | spark-graphx_2.11 95 | ${spark.version} 96 | 97 | 98 | 99 | org.apache.spark 100 | spark-mllib_2.11 101 | ${spark.version} 102 | 103 | 104 | 105 | org.scalanlp 106 | breeze_2.11 107 | 0.12 108 | 109 | 110 | 111 | redis.clients 112 | jedis 113 | 2.8.1 114 | 115 | 116 | 117 | org.elasticsearch 118 | elasticsearch 119 | 2.3.1 120 | 121 | 122 | 123 | org.elasticsearch 124 | elasticsearch-spark_2.11 125 | 2.3.0 126 | 127 | 128 | 129 | mysql 130 | mysql-connector-java 131 | 5.1.35 132 | 133 | 134 | 135 | 136 | src/main/scala 137 | 138 | 139 | net.alchim31.maven 140 | scala-maven-plugin 141 | 3.2.2 142 | 143 | 144 | 145 | compile 146 | testCompile 147 | 148 | 149 | 150 | -make:transitive 151 | -dependencyfile 152 | ${project.build.directory}/.scala_dependencies 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | org.apache.maven.plugins 161 | maven-shade-plugin 162 | 2.4.3 163 | 164 | 165 | package 166 | 167 | shade 168 | 169 | 170 | 171 | 172 | *:* 173 | 174 | META-INF/*.SF 175 | META-INF/*.DSA 176 | META-INF/*.RSA 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /src/main/resources/bs_log/19735E1C66.log: -------------------------------------------------------------------------------- 1 | 18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1 2 | 18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1 3 | 18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0 4 | 18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0 5 | -------------------------------------------------------------------------------- /src/main/resources/bs_log/DDE7970F68.log: -------------------------------------------------------------------------------- 1 | 18611132889,20160327075000,9F36407EAD0629FC166F14DDE7970F68,1 2 | 18688888888,20160327075100,9F36407EAD0629FC166F14DDE7970F68,1 3 | 18611132889,20160327081000,9F36407EAD0629FC166F14DDE7970F68,0 4 | 18688888888,20160327081300,9F36407EAD0629FC166F14DDE7970F68,0 5 | 18688888888,20160327175000,9F36407EAD0629FC166F14DDE7970F68,1 6 | 18611132889,20160327182000,9F36407EAD0629FC166F14DDE7970F68,1 7 | 18688888888,20160327220000,9F36407EAD0629FC166F14DDE7970F68,0 8 | 18611132889,20160327230000,9F36407EAD0629FC166F14DDE7970F68,0 9 | -------------------------------------------------------------------------------- /src/main/resources/bs_log/E549D940E0.log: -------------------------------------------------------------------------------- 1 | 18611132889,20160327081100,CC0710CC94ECC657A8561DE549D940E0,1 2 | 18688888888,20160327081200,CC0710CC94ECC657A8561DE549D940E0,1 3 | 18688888888,20160327081900,CC0710CC94ECC657A8561DE549D940E0,0 4 | 18611132889,20160327082000,CC0710CC94ECC657A8561DE549D940E0,0 5 | 18688888888,20160327171000,CC0710CC94ECC657A8561DE549D940E0,1 6 | 18688888888,20160327171600,CC0710CC94ECC657A8561DE549D940E0,0 7 | 18611132889,20160327180500,CC0710CC94ECC657A8561DE549D940E0,1 8 | 18611132889,20160327181500,CC0710CC94ECC657A8561DE549D940E0,0 9 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/dataframe/SQLDemo.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.dataframe 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import 
org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * 从hdfs中读取数据,转化为DataFrame,执行简单操作 8 | * Created by ZXL on 2017/10/23. 9 | */ 10 | object SQLDemo { 11 | 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setAppName("SQLDemo")//.setMaster("local") 14 | val sc = new SparkContext(conf) 15 | val sqlContext = new SQLContext(sc) 16 | // 设置可以读取集群中的hdfs中文件 17 | System.setProperty("user.name", "root") 18 | 19 | val personRdd = sc.textFile("hdfs://node1:9000/person.txt").map(line =>{ 20 | val fields = line.split(",") 21 | Person(fields(0).toLong, fields(1), fields(2).toInt) 22 | }) 23 | 24 | import sqlContext.implicits._ 25 | // 转为DataFrame 26 | val personDf = personRdd.toDF 27 | 28 | personDf.show() 29 | 30 | personDf.registerTempTable("person") 31 | 32 | sqlContext.sql("select * from person where age >= 20 order by age desc limit 2").show() 33 | 34 | sc.stop() 35 | 36 | } 37 | 38 | case class Person(id: Long, name: String, age: Int) 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/elastic/ElasticSpark.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.elastic 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.elasticsearch.spark._ 5 | 6 | /** 7 | * Elasticsearch是一个基于Lucene的实时地分布式搜索和分析引擎。 8 | * 设计用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便。 9 | * 10 | * Created by ZXL on 2017/10/23. 11 | */ 12 | object ElasticSpark { 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setAppName("ElasticSpark").setMaster("local") 16 | conf.set("es.nodes", "192.168.13.131,192.168.13.132,192.168.13.133") 17 | conf.set("es.port", "9200") 18 | conf.set("es.index.auto.create", "true") 19 | val sc = new SparkContext(conf) 20 | //val query: String = "{\"query\":{\"match_all\":{}}}" 21 | val start = 1463998397 22 | val end = 1463998399 23 | // val query: String = 24 | // s"""{ 25 | // "query": {"match_all": {}}, 26 | // "filter": { 27 | // "bool": { 28 | // "must": { 29 | // "range": { 30 | // "access.time": { 31 | // "gte": "$start", 32 | // "lte": "$end" 33 | // } 34 | // } 35 | // } 36 | // } 37 | // } 38 | // }""" 39 | 40 | val tp = "1" 41 | val query: String = s"""{ 42 | "query": {"match_all": {}}, 43 | "filter" : { 44 | "bool": { 45 | "must": [ 46 | {"term" : {"access.type" : $tp}}, 47 | { 48 | "range": { 49 | "access.time": { 50 | "gte": "$start", 51 | "lte": "$end" 52 | } 53 | } 54 | } 55 | ] 56 | } 57 | } 58 | }""" 59 | val rdd1 = sc.esRDD("accesslogs", query) 60 | 61 | println(rdd1.collect().toBuffer) 62 | println(rdd1.collect().size) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/flume/FlumePushWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.flume 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.flume.FlumeUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * flume向spark发送数据 9 | * 10 | * 添加三个jar包 11 | * - commons-lang3-3.3.2.jar 12 | * - scala-library-2.10.5.jar 13 | * - spark-streaming-flume-sink_2.10-1.6.1.jar 14 | * 15 | * 打成jar包上传到集群中运行 16 | * 集群命令如下: 17 | * bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.flume.FlumePushWordCount 18 | * /jar/____.jar 192.168.13.131 8888 19 | * 20 | * Created by ZXL on 2017/10/23. 
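 *
 * Note on the Flume side (an assumption about the agent setup, not shown in this repo):
 * because this job uses the push-based receiver (FlumeUtils.createStream), the Flume agent
 * needs an avro sink whose hostname and port match the two program arguments
 * (192.168.13.131 and 8888 in the spark-submit example above), and the Spark job must be
 * running before the agent starts so the receiver is listening when Flume connects.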
21 | */ 22 | object FlumePushWordCount { 23 | 24 | def main(args: Array[String]) { 25 | val host = args(0) 26 | val port = args(1).toInt 27 | val conf = new SparkConf().setAppName("FlumeWordCount")//.setMaster("local[2]") 28 | val ssc = new StreamingContext(conf, Seconds(5)) 29 | //推送方式: flume向spark发送数据 30 | val flumeStream = FlumeUtils.createStream(ssc, host, port) 31 | //flume中的数据通过event.getBody()才能拿到真正的内容 32 | val words = flumeStream.flatMap(x => new String(x.event.getBody().array()).split(" ")).map((_, 1)) 33 | 34 | val results = words.reduceByKey(_ + _) 35 | results.print() 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/jedis/JedisConnectionPool.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.jedis 2 | 3 | import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig} 4 | 5 | /** 6 | * 获得Jedis连接,进行简单操作 7 | * Created by ZXL on 2016/5/24. 8 | */ 9 | object JedisConnectionPool{ 10 | 11 | val config = new JedisPoolConfig() 12 | //最大连接数, 13 | config.setMaxTotal(10) 14 | //最大空闲连接数, 15 | config.setMaxIdle(5) 16 | //当调用borrow Object方法时,是否进行有效性检查 --> 17 | config.setTestOnBorrow(true) 18 | val pool = new JedisPool(config, "172.16.0.101", 6379) 19 | 20 | def getConnection(): Jedis = { 21 | pool.getResource 22 | 23 | } 24 | 25 | def main(args: Array[String]) { 26 | val conn = JedisConnectionPool.getConnection() 27 | val r = conn.keys("*") 28 | println(r) 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/kafka/DirectKafkaWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.kafka 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.streaming.kafka.KafkaManager 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | 10 | /** 11 | * Spark Streaming维护偏移量相关的信息,实现零数据丢失,保证不重复消费 12 | * 采用直连的方式有一个缺点,就是不再向zookeeper中更新offset信息。 13 | * 因此,在采用直连的方式消费kafka中的数据的时候,大体思路是首先获取保存在zookeeper中的偏移量信息, 14 | * 根据偏移量信息去创建stream,消费数据后再把当前的偏移量写入zookeeper中 15 | * 16 | * 在2.0以前的版本中KafkaManager这个类是private权限的,需要把它拷贝到项目里使用。 17 | * org.apache.spark.streaming.kafka 18 | * 19 | * Created by ZXL on 2017/11/1. 
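 *
 * Example invocation arguments (hypothetical hosts; topic and group names follow the README):
 * {{{
 *   // brokers = "node1:9092,node2:9092"   topics = "test"   groupId = "g1"
 * }}}
 * Since offsets are written to zookeeper only after a batch has been processed, a failure
 * between processing and the offset update can replay that batch, so downstream writes
 * should ideally be idempotent.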
20 | */ 21 | object DirectKafkaWordCount { 22 | 23 | /* def dealLine(line: String): String = { 24 | val list = line.split(',').toList 25 | // val list = AnalysisUtil.dealString(line, ',', '"')// 把dealString函数当做split即可 26 | list.get(0).substring(0, 10) + "-" + list.get(26) 27 | }*/ 28 | 29 | def processRdd(rdd: RDD[(String, String)]): Unit = { 30 | val lines = rdd.map(_._2) 31 | val words = lines.map(_.split(" ")) 32 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 33 | wordCounts.foreach(println) 34 | } 35 | 36 | def main(args: Array[String]) { 37 | if (args.length < 3) { 38 | System.err.println( 39 | s""" 40 | |Usage: DirectKafkaWordCount 41 | | is a list of one or more Kafka brokers 42 | | is a list of one or more kafka topics to consume from 43 | | is a consume group 44 | | 45 | """.stripMargin) 46 | System.exit(1) 47 | } 48 | 49 | Logger.getLogger("org").setLevel(Level.WARN) 50 | 51 | val Array(brokers, topics, groupId) = args 52 | 53 | // Create context with 2 second batch interval 54 | val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount") 55 | sparkConf.setMaster("local[*]") 56 | sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "5") 57 | sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 58 | 59 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 60 | 61 | // Create direct kafka stream with brokers and topics 62 | val topicsSet = topics.split(",").toSet 63 | val kafkaParams = Map[String, String]( 64 | "metadata.broker.list" -> brokers, 65 | "group.id" -> groupId, 66 | "auto.offset.reset" -> "smallest" 67 | ) 68 | 69 | val km = new KafkaManager(kafkaParams) 70 | 71 | val messages = km.createDirectStream[String, String, StringDecoder, StringDecoder]( 72 | ssc, kafkaParams, topicsSet) 73 | 74 | messages.foreachRDD(rdd => { 75 | if (!rdd.isEmpty()) { 76 | // 先处理消息 77 | processRdd(rdd) 78 | // 再更新offsets 79 | km.updateZKOffsets(rdd) 80 | } 81 | }) 82 | 83 | ssc.start() 84 | ssc.awaitTermination() 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/kafka/KafkaWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.kafka 2 | 3 | import org.apache.spark.storage.StorageLevel 4 | import org.apache.spark.streaming.kafka.KafkaUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | import org.apache.spark.{HashPartitioner, SparkConf} 7 | 8 | /** 9 | * 从集群中的kafka读取数据操作 10 | * 11 | * 运行时参数: 12 | * node1:2181,node2:2181,node3:2181 g1 test 2 13 | * 其中g1为组名,此处随意写,test为topic名,kafka中的topic名要一致 14 | * 15 | * 集群命令(需先启动完成): 16 | * 1.启动kafak 17 | * bin/kafka-server-start.sh config/server.properties > /dev/null 2>&1 & 18 | * 2.创建topic 19 | * bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 3 --partitions 3 --topic test 20 | * 3.向topic中添加数据 21 | * bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test 22 | * 23 | * Created by ZXL on 2017/11/1. 
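 *
 * Unlike DirectKafkaWordCount, this job uses the receiver-based API (KafkaUtils.createStream),
 * so the consumer group's offsets are tracked in zookeeper. For end-to-end durability the
 * receiver would additionally need the write-ahead log enabled (a sketch, not set in the
 * code below):
 * {{{
 *   sparkConf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
 * }}}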
24 | */ 25 | object KafkaWordCount { 26 | 27 | val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => { 28 | //iter.flatMap(it => Some(it._2.sum + it._3.getOrElse(0)).map(x => (it._1, x))) 29 | iter.flatMap{case(x, y, z) => Some(y.sum + z.getOrElse(0)).map(i => (x, i))} 30 | } 31 | 32 | def main(args: Array[String]) { 33 | 34 | LoggerLevels.setStreamingLogLevels() 35 | val Array(zkQuorum, group, topics, numThreads) = args 36 | val sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]") 37 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 38 | ssc.checkpoint("D:\\test\\spark\\checkpoint2") 39 | // 线程执行个数 40 | val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap 41 | val data = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_AND_DISK_SER) 42 | // 返回(K, V),_._2返回的是值,值得输入是按空格分开 43 | val words = data.map(_._2).flatMap(_.split(" ")) 44 | val wordCounts = words.map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true) 45 | wordCounts.print() 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/kafka/LoggerLevels.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.kafka 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.internal.Logging 5 | 6 | object LoggerLevels extends Logging { 7 | 8 | def setStreamingLogLevels() { 9 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 10 | if (!log4jInitialized) { 11 | logInfo("Setting log level to [WARN] for streaming example." + 12 | " To override add a custom log4j.properties to the classpath.") 13 | Logger.getRootLogger.setLevel(Level.WARN) 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/my_partitioner/UrlCountPartition.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.my_partitioner 2 | 3 | import java.net.URL 4 | 5 | import org.apache.spark.{Partitioner, SparkContext, SparkConf} 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 自定义分区 10 | * 数据格式(时间点 url地址),例如: 11 | * 20160321101954 http://net.zxl.cn/net/video.shtml 12 | * 处理成数据(k, v) 13 | * 对于数据(k, v) 14 | * 重写自己的 partitioner 15 | * Created by ZXL on 2017/10/20. 
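 *
 * Worked example of the keying step for one input line (fields separated by a tab):
 * {{{
 *   // input:  20160321101954 <tab> http://net.zxl.cn/net/video.shtml
 *   // after split + reduceByKey + host extraction:
 *   //   ("net.zxl.cn", ("http://net.zxl.cn/net/video.shtml", <count>))
 * }}}
 * HostPartitioner then routes all records of one host to one partition, so saveAsTextFile
 * produces one part file per host, each holding that host's top-2 urls.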
16 | */ 17 | object UrlCountPartition { 18 | 19 | def main(args: Array[String]) { 20 | 21 | val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]") 22 | val sc = new SparkContext(conf) 23 | 24 | // rdd1将数据切分,元组中放的是(URL, 1) 25 | val rdd1 = sc.textFile("D://test//spark//adv_url_count.log").map(line => { 26 | val f = line.split("\t") 27 | (f(1), 1) 28 | }) 29 | 30 | val rdd2 = rdd1.reduceByKey(_ + _) 31 | 32 | // (URL, n) 33 | val rdd3 = rdd2.map(t => { 34 | val url = t._1 35 | val host = new URL(url).getHost 36 | // host返回的是如 php.zxl.cn 37 | (host, (url, t._2)) 38 | }) 39 | 40 | // 得到结果为 ArrayBuffer(net.zxl.cn, java.zxl.cn, php.zxl.cn) 41 | val ints = rdd3.map(_._1).distinct().collect() 42 | // rdd3.repartition(3).saveAsTextFile("D://test//spark//out//out1") 43 | // println(ints.toBuffer) 44 | 45 | val hostPartitioner = new HostPartitioner(ints) 46 | // 取出每个 partitioner 中的信息 47 | val rdd4 = rdd3.partitionBy(hostPartitioner).mapPartitions(it => { 48 | it.toList.sortBy(_._2._2).reverse.take(2).iterator 49 | }) 50 | 51 | rdd4.saveAsTextFile("D://test//spark//out//out3") 52 | 53 | sc.stop() 54 | } 55 | } 56 | 57 | class HostPartitioner(ins: Array[String]) extends Partitioner { 58 | 59 | val parMap = new mutable.HashMap[String, Int]() 60 | var count = 0 61 | for(i <- ins) { 62 | parMap += (i -> count) 63 | count += 1 64 | } 65 | 66 | override def numPartitions: Int = ins.length 67 | 68 | override def getPartition(key: Any): Int = { 69 | // 根据 key 值获得分区 70 | parMap.getOrElse(key.toString, 0) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/my_sort/CustomSort.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.my_sort 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | // 第二种方式 6 | object OrderContext { 7 | 8 | /** 9 | * 第一种形式 10 | 11 | implicit object GirlOrdering extends Ordering[Girl] { 12 | override def compare(x: Girl, y: Girl): Int = { 13 | if(x.faceValue > y.faceValue) 1 14 | else if(x.faceValue == y.faceValue) { 15 | if(x.age > y.age) -1 else 1 16 | } else -1 17 | } 18 | } 19 | */ 20 | 21 | /** 22 | * 第二种形式 23 | */ 24 | implicit val girlOrdering = new Ordering[Girl] { 25 | override def compare(x: Girl, y: Girl): Int = { 26 | if(x.faceValue > y.faceValue) 1 27 | else if(x.faceValue == y.faceValue) { 28 | if(x.age > y.age) -1 else 1 29 | } else -1 30 | } 31 | } 32 | } 33 | 34 | /** 35 | * Created by ZXL on 2017/10/21. 
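 *
 * Expected result for the sample data in main: sortBy(..., false) with the implicit
 * Ordering above sorts by face value descending and, for equal face values, puts the
 * smaller age first, giving:
 * {{{
 *   ArrayBuffer((lll,95,22,3), (xxx,90,27,2), (zzz,90,28,1))
 * }}}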
36 | * 自定义排序 37 | */ 38 | object CustomSort { 39 | 40 | def main(args: Array[String]) { 41 | val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]") 42 | val sc = new SparkContext(conf) 43 | val rdd1 = sc.parallelize(List(("zzz", 90, 28, 1), ("xxx", 90, 27, 2), ("lll", 95, 22, 3))) 44 | import OrderContext._ 45 | val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), false) 46 | println(rdd2.collect().toBuffer) 47 | sc.stop() 48 | } 49 | } 50 | 51 | /** 52 | * 第一种方式 53 | * @param faceValue 54 | * @param age 55 | 56 | case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable { 57 | override def compare(that: Girl): Int = { 58 | if(this.faceValue == that.faceValue) { 59 | that.age - this.age 60 | } else { 61 | this.faceValue - that.faceValue 62 | } 63 | } 64 | } 65 | */ 66 | 67 | // 第二种方式 68 | case class Girl(faceValue: Int, age: Int) extends Serializable 69 | 70 | 71 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/mysql/JdbcRDDDemo.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.mysql 2 | 3 | import java.sql.DriverManager 4 | 5 | import org.apache.spark.rdd.JdbcRDD 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | /** 9 | * 简单连接数据库操作 10 | * Created by ZXL on 2017/10/22. 11 | */ 12 | object JdbcRDDDemo { 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setAppName("JdbcRDDDemo").setMaster("local[2]") 16 | val sc = new SparkContext(conf) 17 | val connection = () => { 18 | Class.forName("com.mysql.jdbc.Driver").newInstance() 19 | DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?useUnicode=true&characterEncoding=utf-8", "root", "1234") 20 | } 21 | val jdbcRDD = new JdbcRDD( 22 | sc, 23 | connection, 24 | "SELECT * FROM ta where id >= ? AND id <= ?", 25 | // 1,4分别为两个占位符赋值,2表示两个任务一起读取数据 26 | 1, 4, 2, 27 | // 返回的内容 28 | r => { 29 | val id = r.getInt(1) 30 | val code = r.getString(2) 31 | (id, code) 32 | } 33 | ) 34 | val data = jdbcRDD.collect() 35 | println(data.toBuffer) 36 | sc.stop() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/AdvUrlCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.simple 2 | 3 | import java.net.URL 4 | 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * 读取文本内容,根据指定的学科, 取出点击量前三的 9 | * 文本内容为某广告链接点击量,格式为: 10 | * (时间点 某学科url链接) 11 | * 举例:(20160321101957 http://net.zxl.cn/net/course.shtml) 12 | * Created by ZXL on 2017/10/16. 
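 *
 * For each subject host in arr (java.zxl.cn, php.zxl.cn, net.zxl.cn) the job prints the
 * top-3 (host, url, clicks) triples sorted by clicks descending; shape of one printed line
 * (the counts depend on the log file):
 * {{{
 *   ArrayBuffer((net.zxl.cn,http://net.zxl.cn/net/course.shtml,<n1>), (net.zxl.cn,...,<n2>), (net.zxl.cn,...,<n3>))
 * }}}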
13 | */ 14 | object AdvUrlCount { 15 | 16 | def main(args: Array[String]) { 17 | 18 | // 从数据库中加载规则 19 | val arr = Array("java.zxl.cn", "php.zxl.cn", "net.zxl.cn") 20 | 21 | val conf = new SparkConf().setAppName("AdvUrlCount").setMaster("local[2]") 22 | val sc = new SparkContext(conf) 23 | 24 | // rdd1将数据切分,元组中放的是(URL, 1) 25 | val rdd1 = sc.textFile("D://test//spark//advUrlCount.log").map(line => { 26 | val f = line.split("\t") 27 | (f(1), 1) 28 | }) 29 | val rdd2 = rdd1.reduceByKey(_ + _) 30 | 31 | val rdd3 = rdd2.map(t => { 32 | val url = t._1 33 | val host = new URL(url).getHost 34 | (host, url, t._2) 35 | }) 36 | 37 | // println(rdd3.collect().toBuffer) 38 | 39 | // val rddjava = rdd3.filter(_._1 == "java.zxl.cn") 40 | // val sortdjava = rddjava.sortBy(_._3, false).take(3) 41 | // val rddphp = rdd3.filter(_._1 == "php.zxl.cn") 42 | 43 | for (ins <- arr) { 44 | val rdd = rdd3.filter(_._1 == ins) 45 | val result= rdd.sortBy(_._3, false).take(3) 46 | //通过JDBC向数据库中存储数据 47 | //id,学院,URL,次数, 访问日期 48 | println(result.toBuffer) 49 | } 50 | 51 | //println(sortdjava.toBuffer) 52 | sc.stop() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/IpDemo.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.simple 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.io.Source 5 | 6 | /** 7 | * 数据格式如下: 8 | * (1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302) 9 | * 根据ip地址转换为数字,从数据集中找出详细信息. 10 | * 为了简化查找速率,采用二分查找. 11 | * Created by ZXL on 2017/10/22. 12 | */ 13 | object IpDemo { 14 | 15 | // ip地址转换为数字 16 | // 如 100.101.102.103,从100开始向左移动8位 17 | def ip2Long(ip: String): Long = { 18 | val fragments = ip.split("[.]") 19 | var ipNum = 0L 20 | for (i <- 0 until fragments.length) { 21 | // | 二进制OR运算符 22 | // ipNum向左移动8位,相当于乘以256(即2^8) 23 | ipNum = fragments(i).toLong | ipNum << 8L 24 | } 25 | ipNum 26 | } 27 | 28 | // 从文件中读取数据 29 | def readData(path: String) = { 30 | 31 | val lines = new ArrayBuffer[String]() 32 | 33 | /** 34 | * java读取文件方式 35 | * val br = new BufferedReader(new InputStreamReader(new FileInputStream(path))) 36 | * var s: String = null 37 | * var flag = true 38 | * while (flag) { 39 | * s = br.readLine() 40 | * if (s != null) 41 | * lines += s 42 | * else 43 | * flag = false 44 | * } 45 | * lines 46 | */ 47 | 48 | val content = Source.fromFile(path) 49 | for (line <- content.getLines()) { 50 | lines += line 51 | } 52 | lines 53 | } 54 | 55 | // 二分查找ip的下标地址,ip地址已经转为十进制 56 | def binarySearch(lines: ArrayBuffer[String], ip: Long): Int = { 57 | var low = 0 58 | var high = lines.length - 1 59 | while (low <= high) { 60 | val middle = (low + high) / 2 61 | if ((ip >= lines(middle).split("\\|")(2).toLong) && (ip <= lines(middle).split("\\|")(3).toLong)) 62 | return middle 63 | if (ip < lines(middle).split("\\|")(2).toLong) 64 | high = middle - 1 65 | else 66 | low = middle + 1 67 | } 68 | -1 69 | } 70 | 71 | def main(args: Array[String]) { 72 | val ip = "120.55.185.61" 73 | val ipNum = ip2Long(ip) 74 | println(ipNum) 75 | val lines = readData("d://test//spark//ip//ip.txt") 76 | val index = binarySearch(lines, ipNum) 77 | print(lines(index)) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/UserLocation.scala: -------------------------------------------------------------------------------- 1 | package 
com.zxl.spark1_6.simple 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * 根据日志统计出每个用户在站点所呆时间最长的前2个的信息 7 | * 日志内容格式为(手机号,时间点,基站站点,事件类型),事件类型为1时是进入基站,0是出基站。 8 | * 1, 先根据"手机号_站点"为唯一标识, 算一次进站出站的时间, 返回(手机号_站点, 时间间隔) 9 | * 2, 以"手机号_站点"为key, 统计每个站点的时间总和, ("手机号_站点", 时间总和) 10 | * 3, ("手机号_站点", 时间总和) --> (手机号, 站点, 时间总和) 11 | * 4, (手机号, 站点, 时间总和) --> groupBy().mapValues(以时间排序,取出前2个) --> (手机->((m,s,t)(m,s,t))) 12 | * Created by ZXL on 2017/10/15. 13 | */ 14 | object UserLocation { 15 | 16 | def main(args: Array[String]) { 17 | val conf = new SparkConf().setAppName("UserLocation").setMaster("local[2]") 18 | val sc = new SparkContext(conf) 19 | //sc.textFile("D://test//spark//bs_log").map(_.split(",")).map(x => (x(0), x(1), x(2), x(3))) 20 | val mbt = sc.textFile("D://test//spark//bs_log").map( line => { 21 | val fields = line.split(",") 22 | val eventType = fields(3) 23 | val time = fields(1) 24 | val timeLong = if(eventType == "1") -time.toLong else time.toLong 25 | (fields(0) + "_" + fields(2), timeLong) 26 | }) 27 | //println(mbt.collect().toBuffer) 28 | //(18611132889_9F36407EAD0629FC166F14DDE7970F68,54000) 29 | val rdd1 = mbt.groupBy(_._1).mapValues(_.foldLeft(0L)(_ + _._2)) 30 | val rdd2 = rdd1.map( t => { 31 | val mobile_bs = t._1 32 | val mobile = mobile_bs.split("_")(0) 33 | val lac = mobile_bs.split("_")(1) 34 | val time = t._2 35 | (mobile, lac, time) 36 | }) 37 | val rdd3 = rdd2.groupBy(_._1) 38 | //ArrayBuffer((18688888888,List((18688888888,16030401EAFB68F1E3CDF819735E1C66,87600), (18688888888,9F36407EAD0629FC166F14DDE7970F68,51200))), (18611132889,List((18611132889,16030401EAFB68F1E3CDF819735E1C66,97500), (18611132889,9F36407EAD0629FC166F14DDE7970F68,54000)))) 39 | val rdd4 = rdd3.mapValues(it => { 40 | it.toList.sortBy(_._3).reverse.take(2) 41 | }) 42 | println(rdd4.collect().toBuffer) 43 | sc.stop() 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.simple 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * 简单WordCount实现 7 | * Created by ZXL on 2017/10/12. 
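 *
 * A minimal local-mode variant for quick testing (an assumption, not part of this job:
 * add a local master and pass local file paths instead of HDFS URIs):
 * {{{
 *   val conf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
 * }}}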
8 | * 9 | * 集群上执行示例,指定相关配置 10 | * bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.simple.WordCount --executor-memory 512m 11 | * --total-executor-cores 2 /opt/soft/jar/hello-spark-1.0.jar hdfs://node1:9000/wc hdfs://node1:9000/out 12 | */ 13 | object WordCount { 14 | 15 | def main(args: Array[String]) { 16 | // 非常重要,是通向Spark集群的入口 17 | val conf = new SparkConf().setAppName("WordCount") 18 | val sc = new SparkContext(conf) 19 | 20 | // reduceByKey(_+_, 1)指定partition的个数为1,即生成一个输出文件 21 | sc.textFile(args(0)).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_+_).sortBy(_._2, false).saveAsTextFile(args(1)) 22 | sc.stop() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/LoggerLevels.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.internal.Logging 5 | 6 | /** 7 | * 设置打印的log的级别 8 | */ 9 | object LoggerLevels extends Logging { 10 | 11 | def setStreamingLogLevels() { 12 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 13 | if (!log4jInitialized) { 14 | logInfo("Setting log level to [WARN] for streaming example." + 15 | " To override add a custom log4j.properties to the classpath.") 16 | Logger.getRootLogger.setLevel(Level.WARN) 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/StateFulWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.spark.streaming.{Seconds, StreamingContext} 4 | import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} 5 | 6 | /** 7 | * Spark Streaming累加器操作(updateStateByKey) 8 | * Created by ZXL on 2017/11/1. 
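 *
 * Worked example of updateFunc for one key: if the current batch contributes Seq(1, 1) for
 * "hello" and the previous state is Some(3), the new state is 1 + 1 + 3 = 5.
 * To feed the socket source, start a sender on the configured host first, e.g. nc -lk 8888
 * on 192.168.13.131.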
9 | */ 10 | object StateFulWordCount { 11 | 12 | // Seq这个批次某个单词的次数 13 | // Option[Int]:以前的结果 14 | // 分好组的数据 15 | // updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)] 16 | val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => { 17 | // 下面几种操作结果一致 18 | //iter.flatMap(it => Some(it._2.sum + it._3.getOrElse(0)).map(x => (it._1, x))) 19 | //iter.map(t => (t._1, t._2.sum + t._3.getOrElse(0))) 20 | //iter.map{case(x, y, z) => Some(y.sum + z.getOrElse(0)).map(m => (x, m))} 21 | iter.map{case(word, current_count, history_count) => (word, current_count.sum + history_count.getOrElse(0))} 22 | } 23 | 24 | def main(args: Array[String]) { 25 | LoggerLevels.setStreamingLogLevels() 26 | // StreamingContext 27 | val conf = new SparkConf().setAppName("StateFulWordCount").setMaster("local[2]") 28 | val sc = new SparkContext(conf) 29 | // updateStateByKey必须设置setCheckpointDir 30 | sc.setCheckpointDir("D:\\test\\spark\\checkpoint") 31 | val ssc = new StreamingContext(sc, Seconds(5)) 32 | 33 | val ds = ssc.socketTextStream("192.168.13.131", 8888) 34 | 35 | // DStream是一个特殊的RDD 36 | // hello tom hello jerry 37 | val result = ds.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(sc.defaultParallelism), true) 38 | result.print() 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/StreamingWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.spark.streaming.{Seconds, StreamingContext} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * 通过SparkStreaming简单实现WordCount 8 | * Created by ZXL on 2017/10/31. 9 | */ 10 | object StreamingWordCount { 11 | 12 | def main(args: Array[String]) { 13 | // 设置log level 14 | LoggerLevels.setStreamingLogLevels() 15 | 16 | // StreamingContext 17 | val conf = new SparkConf().setAppName("StreamingWordCount").setMaster("local[2]") 18 | val sc = new SparkContext(conf) 19 | val ssc = new StreamingContext(sc, Seconds(5)) 20 | 21 | // 接收数据,使用nc绑定ip和端口发送数据 22 | val ds = ssc.socketTextStream("192.168.13.131", 8888) 23 | 24 | // DStream是一个特殊的RDD 25 | // hello tom hello jerry 26 | val result = ds.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_+_) 27 | 28 | // 打印结果 29 | result.print() 30 | ssc.start() 31 | ssc.awaitTermination() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/WindowOpts.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext} 5 | 6 | /** 7 | * SparkStreaming窗口函数的实现 8 | * Created by ZXL on 2017/11/2. 
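 *
 * Note: the batch interval is Milliseconds(5000), and both the window length (Seconds(15))
 * and the slide interval (Seconds(10)) must be integer multiples of it, otherwise the window
 * operation is rejected when the StreamingContext starts. Each output therefore covers the
 * last 15 seconds of data and is emitted every 10 seconds.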
9 | */ 10 | object WindowOpts { 11 | 12 | def main(args: Array[String]) { 13 | LoggerLevels.setStreamingLogLevels() 14 | val conf = new SparkConf().setAppName("WindowOpts").setMaster("local[2]") 15 | val ssc = new StreamingContext(conf, Milliseconds(5000)) 16 | val lines = ssc.socketTextStream("192.168.13.131", 9999) 17 | val pairs = lines.flatMap(_.split(" ")).map((_, 1)) 18 | // Seconds(15):窗口的宽度,Seconds(10):移动窗口的间隔 19 | val windowedWordCounts = pairs.reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(15), Seconds(10)) 20 | windowedWordCounts.print() 21 | // Map((hello, 5), (jerry, 2), (kitty, 3)) 22 | val a = windowedWordCounts.map(_._2).reduce(_+_) 23 | a.foreachRDD(rdd => { 24 | println(rdd.take(0)) 25 | }) 26 | a.print() 27 | 28 | // windowedWordCounts.map(t => (t._1, t._2.toDouble / a.toD)) 29 | ssc.start() 30 | ssc.awaitTermination() 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/dataset/actions.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.dataset 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | 6 | /** 7 | * DataSet的操作 8 | * Created by ZXL on 2018/1/28. 9 | */ 10 | object actions { 11 | 12 | // 构建Spark对象 13 | val spark = SparkSession.builder() 14 | .master("local[2]") 15 | .appName("createDataSet") 16 | .enableHiveSupport() 17 | .getOrCreate() 18 | 19 | // 导入操作需要的隐式函数 20 | import spark.implicits._ 21 | 22 | // 1.map操作,flatMap操作 23 | val seq1 = Seq(Peoples(21, "zxl,wr,hy"), Peoples(20, "cc,hw,lwq")) 24 | val ds1 = spark.createDataset(seq1) 25 | val ds2 = ds1.map{ x => (x.age + 1, x.names)}.show() 26 | val ds3 = ds1.flatMap{ x => 27 | val a = x.age 28 | val s = x.names.split(",").map{ x => (a, x)} 29 | s 30 | }.show() 31 | 32 | // 2.filter操作,where操作 33 | val seq2 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165)) 34 | val ds4 = spark.createDataset(seq2) 35 | ds4.filter("age >= 20 and height >= 170").show() 36 | ds4.filter($"age" >= 20 && $"height" >= 170).show() 37 | ds4.filter{x => x.age > 20 && x.height >= 170}.show() 38 | ds4.where("age >= 20 and height >= 170").show() 39 | ds4.where($"age" >= 20 && $"height" >= 170).show() 40 | 41 | // 3.去重操作 42 | ds4.distinct().show() 43 | ds4.dropDuplicates("age").show() 44 | ds4.dropDuplicates("age", "height").show() 45 | ds4.dropDuplicates(Seq("age", "height")).show() 46 | ds4.dropDuplicates(Array("age", "height")).show() 47 | 48 | // 4.加法/减法操作 49 | val seq3 = Seq(Person("zxl2", 29, 170), Person("wx2", 30, 165), Person("cc2", 30, 165)) 50 | val ds5 = spark.createDataset(seq3) 51 | ds4.except(ds5).show() 52 | ds4.union(ds5).show() 53 | ds4.intersect(ds5).show() 54 | 55 | // 5.select操作 56 | ds5.select("name", "age").show() 57 | ds5.select(expr("height + 1").as[Int]).show() 58 | 59 | // 6.排序操作 60 | ds5.sort("age").show() 61 | ds5.sort($"age".desc, $"height".desc).show() 62 | ds5.orderBy("age").show() 63 | ds5.orderBy($"age".desc, $"height".desc).show() 64 | 65 | // 7.分割抽样操作 66 | val ds6 = ds4.union(ds5) 67 | val rands = ds6.randomSplit(Array(0.3, 0.7)) 68 | rands(0).count() 69 | rands(1).count() 70 | rands(0).show() 71 | rands(1).show() 72 | val ds7 = ds6.sample(false, 0.5) 73 | ds7.count() 74 | ds7.show() 75 | 76 | // 8.列操作 77 | val ds8 = ds6.drop("height") 78 | ds8.columns 79 | ds8.show() 80 | val ds9 = ds6.withColumn("add2", $"age" + 2) // 对数据集增加列 81 | ds9.columns 82 | ds9.show() 83 | val ds10 = 
ds9.withColumnRenamed("add2", "age_new") 84 | ds10.columns 85 | ds10.show() 86 | ds6.withColumn("add_col", lit(1)).show() 87 | 88 | // 9.join操作 89 | val seq4 = Seq(Score("zxl", 85), Score("wr", 90), Score("hy", 95)) 90 | val ds11 = spark.createDataset(seq4) 91 | val ds12 = ds5.join(ds11, Seq("name"), "inner") 92 | ds12.show() 93 | val ds13 = ds5.join(ds11, Seq("name"), "left") 94 | ds13.show() 95 | 96 | // 10.分组聚合操作 97 | val ds14 = ds4.union(ds5).groupBy("height").agg(avg("age")).as("avg_agg") 98 | ds14.show() 99 | } 100 | 101 | case class Score(name: String, score: Int) -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/dataset/basicAction.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.dataset 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.storage.StorageLevel._ 6 | 7 | /** 8 | * DataSet的基本操作 9 | * Created by ZXL on 2018/1/28. 10 | */ 11 | object basicAction { 12 | 13 | // 构建Spark对象 14 | val spark = SparkSession.builder() 15 | .master("local[2]") 16 | .appName("createDataSet") 17 | .enableHiveSupport() 18 | .getOrCreate() 19 | 20 | // 导入操作需要的隐式函数 21 | import spark.implicits._ 22 | 23 | // 1.DataSet存储类型 24 | val seq1 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165)) 25 | val ds1 = spark.createDataset(seq1) 26 | ds1.show() 27 | ds1.checkpoint() 28 | ds1.cache() 29 | ds1.persist(MEMORY_ONLY) 30 | ds1.count() 31 | ds1.show() 32 | ds1.unpersist(true) // 将DataSet删除 33 | 34 | // 2.获取数据集 35 | val c1 = ds1.collect() 36 | val c2 = ds1.collectAsList() 37 | val h1 = ds1.head() 38 | val h2 = ds1.head(3) 39 | val f1 = ds1.first() 40 | val t1 = ds1.take(2) 41 | val t2 = ds1.takeAsList(2) 42 | 43 | // 3.统计数据集 44 | ds1.count() 45 | ds1.describe().show() 46 | ds1.describe("age").show() 47 | ds1.describe("age", "height").show() 48 | 49 | // 4.聚集 50 | ds1.reduce((f1, f2) => Person("sum", (f1.age + f2.age), (f1.height + f2.height))) 51 | 52 | // 5.DataSet结构属性 53 | ds1.columns 54 | ds1.dtypes 55 | ds1.explain() // 返回执行物理计划 56 | 57 | // 6.DataSet rdd数据互转 58 | val rdd1 = ds1.rdd 59 | val ds2 = rdd1.toDS() 60 | ds2.show() 61 | val df2 = rdd1.toDF() 62 | df2.show() 63 | 64 | // 7.DataSet 保存文件 65 | ds1.select("name", "age", "height").write.format("csv").save("hdfs://node1:9000/test2.csv") 66 | // 读取保存的文件 67 | val schema2 = StructType( 68 | StructField("name", StringType, false) :: 69 | StructField("age", IntegerType, false) :: 70 | StructField("name", IntegerType, true) :: Nil) 71 | val out = spark.read. 72 | options(Map(("delimiter", ","), ("header", "false"))). 73 | schema(schema2).csv("hdf2://node:9000/test2.csv") 74 | out.show(10) 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/dataset/createDataSet.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.dataset 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | 7 | /** 8 | * DataSet创建的多种方式 9 | * Created by ZXL on 2018/1/28. 
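 *
 * A small sketch for the Row-RDD route (step 4 below) with a schema naming all three columns
 * of the sample tuples; the height field here is an assumption added for illustration:
 * {{{
 *   val fullSchema = StructType(
 *     StructField("name", StringType, false) ::
 *     StructField("age", IntegerType, false) ::
 *     StructField("height", IntegerType, true) :: Nil)
 *   val df = spark.createDataFrame(rdd1, fullSchema)
 * }}}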
10 | */ 11 | object createDataSet { 12 | 13 | // 构建Spark对象 14 | val spark = SparkSession.builder() 15 | .master("local[2]") 16 | .appName("createDataSet") 17 | .enableHiveSupport() 18 | .getOrCreate() 19 | 20 | // 导入操作需要的隐式函数 21 | import spark.implicits._ 22 | 23 | // 设置检查点 24 | spark.sparkContext.setCheckpointDir("hdfs://node1:9000/user/spark_checkpoint") 25 | 26 | // 1.产生序列dataset 27 | val numDS = spark.range(5, 100, 5) 28 | numDS.orderBy(desc("id")).show(5) 29 | numDS.describe().show() 30 | 31 | // 2.集合转成DataSet 32 | val seq1 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165)) 33 | val ds1 = spark.createDataset(seq1) 34 | ds1.show() 35 | 36 | // 3.集合转成DataFrame 37 | val df1 = spark.createDataFrame(seq1).withColumnRenamed("_1", "name").withColumnRenamed("_2", "age") 38 | df1.orderBy(desc("age")).show(10) 39 | 40 | // 4.rdd转成DataFrame 41 | val array1 = Array(("zxl", 29, 170), ("wx", 30, 165), ("cc", 30, 165)) 42 | val rdd1 = spark.sparkContext.parallelize(array1, 3).map(f => Row(f._1, f._2, f._3)) 43 | val schema = StructType( 44 | StructField("name", StringType, false) :: 45 | StructField("age", IntegerType, true) :: Nil) 46 | val rddToDataFrame = spark.createDataFrame(rdd1, schema) 47 | rddToDataFrame.orderBy(desc("name")).show(false) 48 | 49 | // 5.rdd转成DataSet/DataFrame 50 | val rdd2 = spark.sparkContext.parallelize(array1, 3).map(f => Person(f._1, f._2, f._3)) 51 | val ds2 = rdd2.toDS() 52 | val df2 = rdd2.toDF() 53 | ds2.orderBy(desc("name")).show(10) 54 | df2.orderBy(desc("name")).show(10) 55 | 56 | // 6.rdd转成DataSet 57 | val ds3 = spark.createDataset(rdd2) 58 | ds3.show(10) 59 | 60 | // 7.读取文件 61 | val df4 = spark.read.csv("hdf2://node:9000/test.csv") 62 | df4.show() 63 | 64 | // 8.读取文件,详细参数 65 | val schema2 = StructType( 66 | StructField("name", StringType, false) :: 67 | StructField("age", IntegerType, false) :: 68 | StructField("name", IntegerType, true) :: Nil) 69 | val df7 = spark.read. 70 | options(Map(("delimiter", ","), ("header", "false"))). 71 | schema(schema2).csv("hdf2://node:9000/test.csv") 72 | } 73 | 74 | case class Person(name: String, age: Int, height: Int) 75 | case class Peoples(age: Int, names: String) -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/kafka/StreamingKafka10.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.kafka 2 | 3 | import org.apache.kafka.common.serialization.StringDeserializer 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 6 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 7 | import org.apache.spark.streaming.kafka010._ 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | 10 | /** 11 | * SparkStreaming从kafka中读取数据 12 | * kafka版本0.10 13 | * 采取直连方式 14 | * 15 | * Created by ZXL on 2017/10/15. 
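 *
 * With "enable.auto.commit" set to false the job never commits offsets back to Kafka; a
 * common follow-up (a sketch, not part of the code below) is to commit them after each batch:
 * {{{
 *   stream.foreachRDD { rdd =>
 *     val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
 *     // ... process rdd ...
 *     stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
 *   }
 * }}}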
16 | */ 17 | object StreamingKafka10 { 18 | 19 | def main(args: Array[String]): Unit = { 20 | 21 | val spark = SparkSession.builder() 22 | .master("local[2]") 23 | .appName("streaming").getOrCreate() 24 | 25 | val sc =spark.sparkContext 26 | val ssc = new StreamingContext(sc, Seconds(5)) 27 | val kafkaParams = Map[String, Object]( 28 | "bootstrap.servers" -> "node2:9092", 29 | "key.deserializer" -> classOf[StringDeserializer], 30 | "value.deserializer" -> classOf[StringDeserializer], 31 | "group.id" -> "0001", 32 | "auto.offset.reset" -> "latest", 33 | "enable.auto.commit" -> (false: java.lang.Boolean) 34 | ) 35 | val topics = Array("weblogs") 36 | val stream = KafkaUtils.createDirectStream[String, String]( 37 | ssc, 38 | PreferConsistent, 39 | Subscribe[String, String](topics, kafkaParams) 40 | ) 41 | 42 | val lines = stream.map(x => x.value()) 43 | val words = lines.flatMap(_.split(" ")) 44 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 45 | wordCounts.print() 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/kafka/StreamingKafka8.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.kafka 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.streaming.kafka.KafkaUtils 6 | import org.apache.spark.streaming.{Seconds, StreamingContext} 7 | 8 | /** 9 | * SparkStreaming从kafka中读取数据 10 | * kafka版本0.8 11 | * 采取直连方式 12 | * 13 | * Created by ZXL on 2017/10/15. 14 | */ 15 | object StreamingKafka8 { 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | val spark = SparkSession.builder() 20 | .master("local[2]") 21 | .appName("streaming").getOrCreate() 22 | 23 | val sc =spark.sparkContext 24 | val ssc = new StreamingContext(sc, Seconds(5)) 25 | 26 | // Create direct kafka stream with brokers and topics 27 | val topicsSet =Set("weblogs") 28 | val kafkaParams = Map[String, String]("metadata.broker.list" -> "node1:9092") 29 | val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 30 | ssc, kafkaParams, topicsSet) 31 | 32 | val lines = kafkaStream.map(x => x._2) 33 | val words = lines.flatMap(_.split(" ")) 34 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 35 | wordCounts.print() 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/streaming/StreamingToMysql.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.streaming 2 | 3 | import java.sql.DriverManager 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.streaming.{Seconds, StreamingContext} 7 | 8 | /** 9 | * SparkStreaming读取数据,存储到Mysql中 10 | * 11 | * Created by ZXL on 2017/10/23. 
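 *
 * The insert below builds SQL by string concatenation; a parameterized sketch of the same
 * write to the webCount table, which avoids quoting problems:
 * {{{
 *   val ps = conn.prepareStatement("insert into webCount(titleName, count) values (?, ?)")
 *   ps.setString(1, row._1)
 *   ps.setInt(2, row._2)
 *   ps.executeUpdate()
 * }}}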
12 | */ 13 | object StreamingToMysql { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val spark = SparkSession.builder() 18 | .master("local[2]") 19 | .appName("streaming").getOrCreate() 20 | 21 | val sc =spark.sparkContext 22 | val ssc = new StreamingContext(sc, Seconds(5)) 23 | val lines = ssc.socketTextStream("node2", 9999) 24 | val words = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _) 25 | 26 | words.foreachRDD(rdd => rdd.foreachPartition(line => { 27 | Class.forName("com.mysql.jdbc.Driver") 28 | val conn = DriverManager 29 | .getConnection("jdbc:mysql://node3:3306/test","root","1234") 30 | try{ 31 | for(row <- line){ 32 | val sql = "insert into webCount(titleName,count)values('"+row._1+"',"+row._2+")" 33 | conn.prepareStatement(sql).executeUpdate() 34 | } 35 | }finally { 36 | conn.close() 37 | } 38 | })) 39 | 40 | //words.print() 41 | ssc.start() 42 | ssc.awaitTermination() 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/structured/JDBCSink.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.structured 2 | 3 | import java.sql._ 4 | 5 | import org.apache.spark.sql.{ForeachWriter, Row} 6 | 7 | /** 8 | * 处理从StructuredStreaming中向mysql中写入数据 9 | * 10 | * Created by ZXL on 2017/10/15. 11 | */ 12 | class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] { 13 | 14 | var statement: Statement = _ 15 | var resultSet: ResultSet = _ 16 | var connection: Connection = _ 17 | 18 | override def open(partitionId: Long, version: Long): Boolean = { 19 | connection = new MySqlPool(url, username, password).getJdbcConn() 20 | statement = connection.createStatement() 21 | return true 22 | } 23 | 24 | override def process(value: Row): Unit = { 25 | 26 | val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "") 27 | val count = value.getAs[Long]("count") 28 | 29 | val querySql = "select 1 from webCount " + 30 | "where titleName = '" + titleName + "'" 31 | 32 | val updateSql = "update webCount set " + 33 | "count = " + count + " where titleName = '" + titleName + "'" 34 | 35 | val insertSql = "insert into webCount(titleName,count)" + 36 | "values('" + titleName + "'," + count + ")" 37 | 38 | try { 39 | 40 | //查看连接是否成功 41 | var resultSet = statement.executeQuery(querySql) 42 | if (resultSet.next()) { 43 | statement.executeUpdate(updateSql) 44 | } else { 45 | statement.execute(insertSql) 46 | } 47 | } catch { 48 | case ex: SQLException => { 49 | println("SQLException") 50 | } 51 | case ex: Exception => { 52 | println("Exception") 53 | } 54 | case ex: RuntimeException => { 55 | println("RuntimeException") 56 | } 57 | case ex: Throwable => { 58 | println("Throwable") 59 | } 60 | } 61 | } 62 | 63 | override def close(errorOrNull: Throwable): Unit = { 64 | // if(resultSet.wasNull()){ 65 | // resultSet.close() 66 | // } 67 | if (statement == null) { 68 | statement.close() 69 | } 70 | if (connection == null) { 71 | connection.close() 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/structured/MySqlPool.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.structured 2 | 3 | import java.sql.{Connection, DriverManager} 4 | import java.util 5 | 6 | /** 7 | * 从mysql连接池中获取连接 8 | * 9 | * Created by ZXL on 2017/10/15. 
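 *
 * Typical usage (the URL and credentials mirror those used in StructuredStreamingKafka):
 * {{{
 *   val pool = new MySqlPool("jdbc:mysql://node3:3306/test", "root", "1234")
 *   val conn = pool.getJdbcConn()
 *   try {
 *     // ... use conn ...
 *   } finally {
 *     pool.releaseConn(conn)
 *   }
 * }}}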
10 | */ 11 | class MySqlPool(url: String, user: String, pwd: String) extends Serializable { 12 | //连接池连接总数 13 | private val max = 3 14 | 15 | //每次产生连接数 16 | private val connectionNum = 1 17 | 18 | //当前连接池已产生的连接数 19 | private var conNum = 0 20 | 21 | private val pool = new util.LinkedList[Connection]() //连接池 22 | 23 | //获取连接 24 | def getJdbcConn(): Connection = { 25 | //同步代码块,AnyRef为所有引用类型的基类,AnyVal为所有值类型的基类 26 | AnyRef.synchronized({ 27 | if (pool.isEmpty) { 28 | //加载驱动 29 | preGetConn() 30 | for (i <- 1 to connectionNum) { 31 | val conn = DriverManager.getConnection(url, user, pwd) 32 | pool.push(conn) 33 | conNum += 1 34 | } 35 | } 36 | pool.poll() 37 | }) 38 | } 39 | 40 | //释放连接 41 | def releaseConn(conn: Connection): Unit = { 42 | pool.push(conn) 43 | } 44 | 45 | //加载驱动 46 | private def preGetConn(): Unit = { 47 | //控制加载 48 | if (conNum < max && !pool.isEmpty) { 49 | println("Jdbc Pool has no connection now, please wait a moments!") 50 | Thread.sleep(2000) 51 | preGetConn() 52 | } else { 53 | Class.forName("com.mysql.jdbc.Driver") 54 | } 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/structured/StructuredStreamingKafka.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.structured 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.ProcessingTime 5 | 6 | /** 7 | * 结构化流从kafka中读取数据存储到关系型数据库mysql 8 | * 目前结构化流对kafka的要求版本0.10及以上 9 | * 10 | * Created by ZXL on 2017/10/15. 11 | */ 12 | object StructuredStreamingKafka { 13 | 14 | case class Weblog(datatime:String, 15 | userid:String, 16 | searchname:String, 17 | retorder:String, 18 | cliorder:String, 19 | cliurl:String) 20 | 21 | def main(args: Array[String]): Unit = { 22 | 23 | val spark = SparkSession.builder() 24 | .master("local[2]") 25 | .appName("streaming").getOrCreate() 26 | 27 | val df = spark 28 | .readStream 29 | .format("kafka") 30 | .option("kafka.bootstrap.servers", "node1:9092") 31 | .option("subscribe", "weblogs") 32 | .load() 33 | 34 | import spark.implicits._ 35 | val lines = df.selectExpr("CAST(value AS STRING)").as[String] 36 | val weblog = lines.map(_.split(",")) 37 | .map(x => Weblog(x(0), x(1), x(2),x(3),x(4),x(5))) 38 | val titleCount = weblog 39 | .groupBy("searchname").count().toDF("titleName","count") 40 | 41 | val url ="jdbc:mysql://node3:3306/test" 42 | val username="root" 43 | val password="1234" 44 | 45 | val writer = new JDBCSink(url,username,password) 46 | val query = titleCount.writeStream 47 | .foreach(writer) 48 | .outputMode("update") 49 | .trigger(ProcessingTime("5 seconds")) 50 | .start() 51 | query.awaitTermination() 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/streaming/kafka/KafkaManager.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streaming.kafka 2 | 3 | import kafka.common.TopicAndPartition 4 | import kafka.message.MessageAndMetadata 5 | import kafka.serializer.Decoder 6 | import org.apache.spark.SparkException 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.InputDStream 10 | import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset 11 | 12 | import scala.reflect.ClassTag 13 | 14 | /** 15 | * 自己管理offset 16 | */ 17 | class KafkaManager(val kafkaParams: 
Map[String, String]) extends Serializable { 18 | 19 | private val kc = new KafkaCluster(kafkaParams) 20 | 21 | /** 22 | * 创建数据流 23 | */ 24 | def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag]( 25 | ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(K, V)] = { 26 | val groupId = kafkaParams.get("group.id").get 27 | // 在zookeeper上读取offsets前先根据实际情况更新offsets 28 | setOrUpdateOffsets(topics, groupId) 29 | 30 | //从zookeeper上读取offset开始消费message 31 | val messages = { 32 | val partitionsE = kc.getPartitions(topics) 33 | if (partitionsE.isLeft) 34 | throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}") 35 | val partitions = partitionsE.right.get 36 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions) 37 | if (consumerOffsetsE.isLeft) 38 | throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}") 39 | val consumerOffsets = consumerOffsetsE.right.get 40 | KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)]( 41 | ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)) 42 | } 43 | messages 44 | } 45 | 46 | /** 47 | * 创建数据流前,根据实际消费情况更新消费offsets 48 | * @param topics 49 | * @param groupId 50 | */ 51 | private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = { 52 | topics.foreach(topic => { 53 | var hasConsumed = true 54 | val partitionsE = kc.getPartitions(Set(topic)) 55 | if (partitionsE.isLeft) 56 | throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}") 57 | val partitions = partitionsE.right.get 58 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions) 59 | if (consumerOffsetsE.isLeft) hasConsumed = false 60 | if (hasConsumed) {// 消费过 61 | /** 62 | * 如果streaming程序执行的时候出现kafka.common.OffsetOutOfRangeException, 63 | * 说明zk上保存的offsets已经过时了,即kafka的定时清理策略已经将包含该offsets的文件删除。 64 | * 针对这种情况,只要判断一下zk上的consumerOffsets和earliestLeaderOffsets的大小, 65 | * 如果consumerOffsets比earliestLeaderOffsets还小的话,说明consumerOffsets已过时, 66 | * 这时把consumerOffsets更新为earliestLeaderOffsets 67 | */ 68 | val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions) 69 | if (earliestLeaderOffsetsE.isLeft) 70 | throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}") 71 | val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get 72 | val consumerOffsets = consumerOffsetsE.right.get 73 | 74 | // 可能只是存在部分分区consumerOffsets过时,所以只更新过时分区的consumerOffsets为earliestLeaderOffsets 75 | var offsets: Map[TopicAndPartition, Long] = Map() 76 | consumerOffsets.foreach({ case(tp, n) => 77 | val earliestLeaderOffset = earliestLeaderOffsets(tp).offset 78 | if (n < earliestLeaderOffset) { 79 | println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition + 80 | " offsets已经过时,更新为" + earliestLeaderOffset) 81 | offsets += (tp -> earliestLeaderOffset) 82 | } 83 | }) 84 | if (!offsets.isEmpty) { 85 | kc.setConsumerOffsets(groupId, offsets) 86 | } 87 | } else {// 没有消费过 88 | val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) 89 | var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null 90 | if (reset == Some("smallest")) { 91 | val leaderOffsetsE = kc.getEarliestLeaderOffsets(partitions) 92 | if (leaderOffsetsE.isLeft) 93 | throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsE.left.get}") 94 | leaderOffsets = leaderOffsetsE.right.get 95 | } else { 96 | val leaderOffsetsE = 
kc.getLatestLeaderOffsets(partitions) 97 | if (leaderOffsetsE.isLeft) 98 | throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsE.left.get}") 99 | leaderOffsets = leaderOffsetsE.right.get 100 | } 101 | val offsets = leaderOffsets.map { 102 | case (tp, offset) => (tp, offset.offset) 103 | } 104 | kc.setConsumerOffsets(groupId, offsets) 105 | } 106 | }) 107 | } 108 | 109 | /** 110 | * 更新zookeeper上的消费offsets 111 | * @param rdd 112 | */ 113 | def updateZKOffsets(rdd: RDD[(String, String)]) : Unit = { 114 | val groupId = kafkaParams.get("group.id").get 115 | val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 116 | 117 | for (offsets <- offsetsList) { 118 | val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition) 119 | val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset))) 120 | if (o.isLeft) { 121 | println(s"Error updating the offset to Kafka cluster: ${o.left.get}") 122 | } 123 | } 124 | } 125 | } 126 | 127 | --------------------------------------------------------------------------------