├── README.md
├── pom.xml
└── src
    └── main
        ├── resources
        │   ├── advUrlCount.log
        │   ├── bs_log
        │   │   ├── 19735E1C66.log
        │   │   ├── DDE7970F68.log
        │   │   └── E549D940E0.log
        │   └── ip.txt
        └── scala
            ├── com
            │   └── zxl
            │       ├── spark1_6
            │       │   ├── dataframe
            │       │   │   └── SQLDemo.scala
            │       │   ├── elastic
            │       │   │   └── ElasticSpark.scala
            │       │   ├── flume
            │       │   │   └── FlumePushWordCount.scala
            │       │   ├── jedis
            │       │   │   └── JedisConnectionPool.scala
            │       │   ├── kafka
            │       │   │   ├── DirectKafkaWordCount.scala
            │       │   │   ├── KafkaWordCount.scala
            │       │   │   └── LoggerLevels.scala
            │       │   ├── my_partitioner
            │       │   │   └── UrlCountPartition.scala
            │       │   ├── my_sort
            │       │   │   └── CustomSort.scala
            │       │   ├── mysql
            │       │   │   └── JdbcRDDDemo.scala
            │       │   ├── simple
            │       │   │   ├── AdvUrlCount.scala
            │       │   │   ├── IpDemo.scala
            │       │   │   ├── UserLocation.scala
            │       │   │   └── WordCount.scala
            │       │   └── streaming
            │       │       ├── LoggerLevels.scala
            │       │       ├── StateFulWordCount.scala
            │       │       ├── StreamingWordCount.scala
            │       │       └── WindowOpts.scala
            │       └── spark2_2
            │           ├── dataset
            │           │   ├── actions.scala
            │           │   ├── basicAction.scala
            │           │   └── createDataSet.scala
            │           ├── kafka
            │           │   ├── StreamingKafka10.scala
            │           │   └── StreamingKafka8.scala
            │           ├── streaming
            │           │   └── StreamingToMysql.scala
            │           └── structured
            │               ├── JDBCSink.scala
            │               ├── MySqlPool.scala
            │               └── StructuredStreamingKafka.scala
            └── org
                └── apache
                    └── spark
                        └── streaming
                            └── kafka
                                └── KafkaManager.scala
/README.md:
--------------------------------------------------------------------------------
1 | # Spark-Example
2 | com.zxl.spark2_2.kafka
3 |
4 | StreamingKafka8:
5 |
6 | Spark Streaming reads data from Kafka
7 |
8 | Kafka version 0.8
9 |
10 | Uses the direct connection (receiver-less) approach
11 |
12 | StreamingKafka10:
13 |
14 | Spark Streaming reads data from Kafka
15 |
16 | Kafka version 0.10
17 |
18 | Uses the direct connection (receiver-less) approach
19 |
20 | com.zxl.spark2_2.streaming
21 |
22 | StreamingToMysql:
23 |
24 | Spark Streaming reads data and stores it in MySQL
25 |
26 | com.zxl.spark2_2.structured
27 |
28 | JDBCSink:
29 |
30 | Handles writing data from Structured Streaming into MySQL
31 |
32 | MySqlPool:
33 |
34 | Obtains a connection from the MySQL connection pool
35 |
36 | StructuredStreamingKafka:
37 |
38 | Structured Streaming reads data from Kafka and stores it in the relational database MySQL (a condensed sketch follows)
39 |
40 | Structured Streaming currently requires Kafka version 0.10 or above
41 |
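A condensed sketch of this flow, trimmed from StructuredStreamingKafka.scala and JDBCSink.scala in this repository (the broker, topic and JDBC settings are the example values used there; the aggregation is simplified here to a word count):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime

val spark = SparkSession.builder().master("local[2]").appName("streaming").getOrCreate()
import spark.implicits._

// Read the Kafka topic as a streaming Dataset[String] (requires the 0.10+ Kafka source).
val lines = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "node1:9092")
  .option("subscribe", "weblogs")
  .load()
  .selectExpr("CAST(value AS STRING)").as[String]

// Aggregate and push every updated row to MySQL through the custom ForeachWriter.
val query = lines.flatMap(_.split(" "))
  .groupBy("value").count()
  .toDF("titleName", "count")
  .writeStream
  .foreach(new JDBCSink("jdbc:mysql://node3:3306/test", "root", "1234"))
  .outputMode("update")
  .trigger(ProcessingTime("5 seconds"))
  .start()

query.awaitTermination()
```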
42 | com.zxl.spark2_2.dataset
43 |
44 | createDataSet:
45 |
46 | Several ways to create a DataSet
47 |
48 | basicAction:
49 |
50 | Basic DataSet operations
51 |
52 | actions:
53 |
54 | DataSet action operations (see the sketch below)
55 | 1. map and flatMap
56 | 2. filter and where
57 | 3. deduplication
58 | 4. set addition/subtraction (union, except)
59 | 5. select
60 | 6. sorting
61 | 7. random split and sampling
62 | 8. column operations
63 | 9. join
64 | 10. group-by and aggregation
65 |
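A minimal sketch of a few of these operations, condensed from actions.scala below (Person is the case class defined in createDataSet.scala):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[2]").appName("datasetOps").getOrCreate()
import spark.implicits._

val ds = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165)).toDS()

ds.filter($"age" >= 30 && $"height" >= 165).show() // 2. filter / where
ds.dropDuplicates("age").show()                    // 3. deduplication
ds.orderBy($"age".desc).show()                     // 6. sorting
ds.groupBy("height").agg(avg("age")).show()        // 10. group-by and aggregation
```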
66 | com.zxl.spark1_6.dataframe
67 |
68 | SQLDemo:
69 |
70 | Reads data from HDFS, converts it to a DataFrame, and performs simple operations
71 |
72 | com.zxl.spark1_6.elastic
73 |
74 | ElasticSpark:
75 |
76 | Elasticsearch is a real-time distributed search and analytics engine based on Lucene.
77 |
78 | Designed for cloud environments, it provides real-time search and is stable, reliable, fast, and easy to install and use.
79 |
80 | com.zxl.spark1_6.flume
81 |
82 | FlumePushWordCount:
83 |
84 | Flume pushes data to Spark
85 |
86 | Add three jar packages:
87 |
88 | - commons-lang3-3.3.2.jar
89 |
90 | - scala-library-2.10.5.jar
91 |
92 | - spark-streaming-flume-sink_2.10-1.6.1.jar
93 |
94 | Package it as a jar and upload it to the cluster to run
95 |
96 | Cluster command:
97 |
98 | bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.flume.FlumePushWordCount /jar/____.jar 192.168.13.131 8888
99 |
100 | com.zxl.spark1_6.jedis
101 |
102 | JedisConnectionPool:
103 |
104 | Obtains a Jedis connection and performs simple operations
105 |
106 | com.zxl.spark1_6.kafka
107 |
108 | DirectKafkaWordCount:
109 |
110 | Spark Streaming maintains the offset information itself, achieving zero data loss and avoiding duplicate consumption
111 |
112 | One drawback of the direct approach is that offsets are no longer updated in ZooKeeper.
113 |
114 | Therefore, when consuming Kafka data with the direct approach, the general idea is to first read the offsets saved in ZooKeeper,
115 |
116 | create the stream from those offsets, and write the current offsets back to ZooKeeper after the data has been consumed (see the condensed sketch below)
117 |
118 | In versions before 2.0 the KafkaManager class has private access, so it has to be copied into the project to be used:
119 | org.apache.spark.streaming.kafka
120 |
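The core consume-then-commit loop, condensed from DirectKafkaWordCount.scala below (broker, topic and group names are this project's example values):

```scala
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaManager
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(new SparkConf().setAppName("DirectKafkaWordCount").setMaster("local[*]"), Seconds(2))
val kafkaParams = Map("metadata.broker.list" -> "node1:9092", "group.id" -> "g1", "auto.offset.reset" -> "smallest")

// KafkaManager reconciles the offsets stored in ZooKeeper before creating the direct stream.
val km = new KafkaManager(kafkaParams)
val messages = km.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, Set("test"))

messages.foreachRDD { rdd =>
  if (!rdd.isEmpty()) {
    rdd.map(_._2).flatMap(_.split(" ")).map((_, 1L)).reduceByKey(_ + _).foreach(println) // process the batch first
    km.updateZKOffsets(rdd)                                                              // then commit the new offsets to ZooKeeper
  }
}

ssc.start()
ssc.awaitTermination()
```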
121 | KafkaWordCount:
122 |
123 | Reads data from Kafka on the cluster
124 |
125 | Runtime arguments:
126 |
127 | node1:2181,node2:2181,node3:2181 g1 test 2
128 |
129 | where g1 is the consumer group name (any value works here) and test is the topic name, which must match the topic name in Kafka
130 |
131 | Cluster commands (the cluster must already be started):
132 |
133 | 1. Start Kafka
134 |
135 | bin/kafka-server-start.sh config/server.properties > /dev/null 2>&1 &
136 |
137 | 2. Create the topic
138 |
139 | bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 3 --partitions 3 --topic test
140 |
141 | 3. Produce data to the topic
142 |
143 | bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test
144 |
145 | com.zxl.spark1_6.my_partitioner
146 |
147 | UrlCountPartition:
148 |
149 | Custom partitioner
150 |
151 | Data format (timestamp url), for example:
152 | 20160321101954 http://net.zxl.cn/net/video.shtml
153 |
154 | Process the data into (k, v) pairs
155 |
156 | For the (k, v) data,
157 |
158 | implement your own partitioner (see the sketch below)
159 |
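A minimal sketch of the partitioner itself, condensed from the HostPartitioner class in UrlCountPartition.scala below:

```scala
import org.apache.spark.Partitioner
import scala.collection.mutable

// One partition per host; unknown hosts fall back to partition 0.
class HostPartitioner(hosts: Array[String]) extends Partitioner {
  private val index = mutable.HashMap(hosts.zipWithIndex: _*)
  override def numPartitions: Int = hosts.length
  override def getPartition(key: Any): Int = index.getOrElse(key.toString, 0)
}

// rdd3 is keyed by host, e.g. ("php.zxl.cn", (url, count)); partitionBy groups each host into its own partition:
// val rdd4 = rdd3.partitionBy(new HostPartitioner(hosts))
```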
160 | com.zxl.spark1_6.my_sort
161 |
162 | CustomSort: custom sorting
163 |
164 | com.zxl.spark1_6.mysql
165 |
166 | JdbcRDDDemo: a simple database-connection example
167 |
168 | com.zxl.spark1_6.simple
169 |
170 | AdvUrlCount:
171 |
172 | Reads the text content and, for each specified subject, extracts the top three URLs by click count
173 |
174 | The text contains ad-link click counts in the format: (timestamp subject-url)
175 |
176 | Example: (20160321101957 http://net.zxl.cn/net/course.shtml)
177 |
178 | IpDemo:
179 |
180 | The data format is as follows:
181 | (1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302)
182 |
183 | The IP address is converted to a number and its detailed record is looked up in the data set.
184 |
185 | Binary search is used to speed up the lookup (see the sketch below).
186 |
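A minimal sketch of the IP-to-number conversion (the sample value comes from the data line above; each octet shifts the accumulated value left by 8 bits):

```scala
def ip2Long(ip: String): Long =
  ip.split("[.]").foldLeft(0L)((acc, octet) => (acc << 8) | octet.toLong)

// "1.0.1.0" -> 16777472, the lower bound of the sample range [16777472, 16778239] above,
// so a binary search over the sorted ranges finds this record.
println(ip2Long("1.0.1.0")) // 16777472
```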
187 | UserLocation:
188 |
189 | Uses the logs to find, for each user, the top 2 base stations where they spent the most time (a condensed sketch follows the steps)
190 |
191 | Each log line has the format (phone number, timestamp, base station id, event type); event type 1 means entering the station, 0 means leaving it.
192 |
193 | 1. Using "phone_station" as the unique key, compute the time for one enter/leave pair, returning (phone_station, interval)
194 |
195 | 2. With "phone_station" as the key, sum the time per station: ("phone_station", total time)
196 |
197 | 3. ("phone_station", total time) --> (phone, station, total time)
198 |
199 | 4. (phone, station, total time) --> groupBy().mapValues(sort by time, take the top 2) --> (phone -> ((m,s,t)(m,s,t)))
200 |
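A condensed sketch of the pipeline, trimmed from UserLocation.scala below (assuming an existing SparkContext sc and the bs_log directory from src/main/resources); entry timestamps are negated so that summing entries and exits per "phone_station" yields the dwell time:

```scala
val top2 = sc.textFile("src/main/resources/bs_log")
  .map(_.split(","))
  .map(f => (f(0) + "_" + f(2), if (f(3) == "1") -f(1).toLong else f(1).toLong)) // 1. signed timestamps per phone_station
  .reduceByKey(_ + _)                                                            // 2. total dwell time per phone_station
  .map { case (key, time) =>                                                     // 3. (phone, station, total time)
    val Array(phone, station) = key.split("_")
    (phone, station, time)
  }
  .groupBy(_._1)                                                                 // 4. top 2 stations per phone
  .mapValues(_.toList.sortBy(_._3).reverse.take(2))

top2.collect().foreach(println)
```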
201 | WordCount:
202 |
203 | A simple WordCount implementation
204 |
205 | Example of running on the cluster with the relevant configuration:
206 |
207 | bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.simple.WordCount --executor-memory 512m --total-executor-cores 2 /opt/soft/jar/hello-spark-1.0.jar hdfs://node1:9000/wc hdfs://node1:9000/out
208 |
209 | com.zxl.spark1_6.streaming
210 |
211 | LoggerLevels:
212 |
213 | Sets the log level for printed output
214 |
215 | StateFulWordCount:
216 |
217 | Spark Streaming stateful accumulation (updateStateByKey)
218 |
219 | StreamingWordCount:
220 |
221 | A simple WordCount implemented with Spark Streaming
222 |
223 | WindowOpts:
224 |
225 | Spark Streaming window operations
226 |
227 | org.apache.spark.streaming.kafka
228 |
229 | KafkaManager:
230 |
231 | Spark Streaming reads data from Kafka with the direct approach and manages the offsets itself; for Spark versions before 2.0
232 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.zxl</groupId>
8 |     <artifactId>spark-example</artifactId>
9 |     <version>1.0</version>
10 |
11 |     <properties>
12 |         <maven.compiler.source>1.8</maven.compiler.source>
13 |         <maven.compiler.target>1.8</maven.compiler.target>
14 |         <encoding>UTF-8</encoding>
15 |         <scala.version>2.11.8</scala.version>
16 |         <spark.version>2.2.0</spark.version>
17 |         <hadoop.version>2.6.4</hadoop.version>
18 |     </properties>
19 |
20 |     <dependencies>
21 |         <dependency>
22 |             <groupId>org.scala-lang</groupId>
23 |             <artifactId>scala-library</artifactId>
24 |             <version>${scala.version}</version>
25 |         </dependency>
26 |
27 |         <dependency>
28 |             <groupId>org.apache.spark</groupId>
29 |             <artifactId>spark-core_2.11</artifactId>
30 |             <version>${spark.version}</version>
31 |         </dependency>
32 |
33 |         <dependency>
34 |             <groupId>org.apache.hadoop</groupId>
35 |             <artifactId>hadoop-client</artifactId>
36 |             <version>${hadoop.version}</version>
37 |         </dependency>
38 |
39 |         <dependency>
40 |             <groupId>mysql</groupId>
41 |             <artifactId>mysql-connector-java</artifactId>
42 |             <version>5.1.32</version>
43 |         </dependency>
44 |
45 |         <dependency>
46 |             <groupId>org.apache.spark</groupId>
47 |             <artifactId>spark-sql_2.11</artifactId>
48 |             <version>${spark.version}</version>
49 |         </dependency>
50 |
51 |         <dependency>
52 |             <groupId>org.apache.spark</groupId>
53 |             <artifactId>spark-hive_2.11</artifactId>
54 |             <version>${spark.version}</version>
55 |         </dependency>
56 |         <dependency>
57 |             <groupId>org.apache.hive</groupId>
58 |             <artifactId>hive-jdbc</artifactId>
59 |             <version>${spark.version}</version>
60 |         </dependency>
61 |
62 |         <dependency>
63 |             <groupId>org.apache.spark</groupId>
64 |             <artifactId>spark-streaming_2.11</artifactId>
65 |             <version>${spark.version}</version>
66 |         </dependency>
67 |
68 |         <dependency>
69 |             <groupId>org.apache.spark</groupId>
70 |             <artifactId>spark-streaming-flume_2.11</artifactId>
71 |             <version>${spark.version}</version>
72 |         </dependency>
73 |
74 |         <dependency>
75 |             <groupId>org.apache.spark</groupId>
76 |             <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
77 |             <version>${spark.version}</version>
78 |         </dependency>
79 |
80 |         <dependency>
81 |             <groupId>org.apache.spark</groupId>
82 |             <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
83 |             <version>${spark.version}</version>
84 |         </dependency>
85 |
86 |         <dependency>
87 |             <groupId>org.apache.spark</groupId>
88 |             <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
89 |             <version>${spark.version}</version>
90 |         </dependency>
91 |
92 |         <dependency>
93 |             <groupId>org.apache.spark</groupId>
94 |             <artifactId>spark-graphx_2.11</artifactId>
95 |             <version>${spark.version}</version>
96 |         </dependency>
97 |
98 |         <dependency>
99 |             <groupId>org.apache.spark</groupId>
100 |             <artifactId>spark-mllib_2.11</artifactId>
101 |             <version>${spark.version}</version>
102 |         </dependency>
103 |
104 |         <dependency>
105 |             <groupId>org.scalanlp</groupId>
106 |             <artifactId>breeze_2.11</artifactId>
107 |             <version>0.12</version>
108 |         </dependency>
109 |
110 |         <dependency>
111 |             <groupId>redis.clients</groupId>
112 |             <artifactId>jedis</artifactId>
113 |             <version>2.8.1</version>
114 |         </dependency>
115 |
116 |         <dependency>
117 |             <groupId>org.elasticsearch</groupId>
118 |             <artifactId>elasticsearch</artifactId>
119 |             <version>2.3.1</version>
120 |         </dependency>
121 |
122 |         <dependency>
123 |             <groupId>org.elasticsearch</groupId>
124 |             <artifactId>elasticsearch-spark_2.11</artifactId>
125 |             <version>2.3.0</version>
126 |         </dependency>
127 |
128 |         <dependency>
129 |             <groupId>mysql</groupId>
130 |             <artifactId>mysql-connector-java</artifactId>
131 |             <version>5.1.35</version>
132 |         </dependency>
133 |     </dependencies>
134 |
135 |     <build>
136 |         <sourceDirectory>src/main/scala</sourceDirectory>
137 |         <plugins>
138 |             <plugin>
139 |                 <groupId>net.alchim31.maven</groupId>
140 |                 <artifactId>scala-maven-plugin</artifactId>
141 |                 <version>3.2.2</version>
142 |                 <executions>
143 |                     <execution>
144 |                         <goals>
145 |                             <goal>compile</goal>
146 |                             <goal>testCompile</goal>
147 |                         </goals>
148 |                         <configuration>
149 |                             <args>
150 |                                 <arg>-make:transitive</arg>
151 |                                 <arg>-dependencyfile</arg>
152 |                                 <arg>${project.build.directory}/.scala_dependencies</arg>
153 |                             </args>
154 |                         </configuration>
155 |                     </execution>
156 |                 </executions>
157 |             </plugin>
158 |
159 |             <plugin>
160 |                 <groupId>org.apache.maven.plugins</groupId>
161 |                 <artifactId>maven-shade-plugin</artifactId>
162 |                 <version>2.4.3</version>
163 |                 <executions>
164 |                     <execution>
165 |                         <phase>package</phase>
166 |                         <goals>
167 |                             <goal>shade</goal>
168 |                         </goals>
169 |                         <configuration>
170 |                             <filters>
171 |                                 <filter>
172 |                                     <artifact>*:*</artifact>
173 |                                     <excludes>
174 |                                         <exclude>META-INF/*.SF</exclude>
175 |                                         <exclude>META-INF/*.DSA</exclude>
176 |                                         <exclude>META-INF/*.RSA</exclude>
177 |                                     </excludes>
178 |                                 </filter>
179 |                             </filters>
180 |                         </configuration>
181 |                     </execution>
182 |                 </executions>
183 |             </plugin>
184 |         </plugins>
185 |     </build>
186 | </project>
--------------------------------------------------------------------------------
/src/main/resources/bs_log/19735E1C66.log:
--------------------------------------------------------------------------------
1 | 18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
2 | 18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1
3 | 18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0
4 | 18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0
5 |
--------------------------------------------------------------------------------
/src/main/resources/bs_log/DDE7970F68.log:
--------------------------------------------------------------------------------
1 | 18611132889,20160327075000,9F36407EAD0629FC166F14DDE7970F68,1
2 | 18688888888,20160327075100,9F36407EAD0629FC166F14DDE7970F68,1
3 | 18611132889,20160327081000,9F36407EAD0629FC166F14DDE7970F68,0
4 | 18688888888,20160327081300,9F36407EAD0629FC166F14DDE7970F68,0
5 | 18688888888,20160327175000,9F36407EAD0629FC166F14DDE7970F68,1
6 | 18611132889,20160327182000,9F36407EAD0629FC166F14DDE7970F68,1
7 | 18688888888,20160327220000,9F36407EAD0629FC166F14DDE7970F68,0
8 | 18611132889,20160327230000,9F36407EAD0629FC166F14DDE7970F68,0
9 |
--------------------------------------------------------------------------------
/src/main/resources/bs_log/E549D940E0.log:
--------------------------------------------------------------------------------
1 | 18611132889,20160327081100,CC0710CC94ECC657A8561DE549D940E0,1
2 | 18688888888,20160327081200,CC0710CC94ECC657A8561DE549D940E0,1
3 | 18688888888,20160327081900,CC0710CC94ECC657A8561DE549D940E0,0
4 | 18611132889,20160327082000,CC0710CC94ECC657A8561DE549D940E0,0
5 | 18688888888,20160327171000,CC0710CC94ECC657A8561DE549D940E0,1
6 | 18688888888,20160327171600,CC0710CC94ECC657A8561DE549D940E0,0
7 | 18611132889,20160327180500,CC0710CC94ECC657A8561DE549D940E0,1
8 | 18611132889,20160327181500,CC0710CC94ECC657A8561DE549D940E0,0
9 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/dataframe/SQLDemo.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.dataframe
2 |
3 | import org.apache.spark.sql.SQLContext
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | /**
7 | * Reads data from HDFS, converts it to a DataFrame, and performs simple operations
8 | * Created by ZXL on 2017/10/23.
9 | */
10 | object SQLDemo {
11 |
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setAppName("SQLDemo")//.setMaster("local")
14 | val sc = new SparkContext(conf)
15 | val sqlContext = new SQLContext(sc)
16 | // Set the user so that files on the cluster's HDFS can be read
17 | System.setProperty("user.name", "root")
18 |
19 | val personRdd = sc.textFile("hdfs://node1:9000/person.txt").map(line =>{
20 | val fields = line.split(",")
21 | Person(fields(0).toLong, fields(1), fields(2).toInt)
22 | })
23 |
24 | import sqlContext.implicits._
25 | // Convert to a DataFrame
26 | val personDf = personRdd.toDF
27 |
28 | personDf.show()
29 |
30 | personDf.registerTempTable("person")
31 |
32 | sqlContext.sql("select * from person where age >= 20 order by age desc limit 2").show()
33 |
34 | sc.stop()
35 |
36 | }
37 |
38 | case class Person(id: Long, name: String, age: Int)
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/elastic/ElasticSpark.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.elastic
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.elasticsearch.spark._
5 |
6 | /**
7 | * Elasticsearch is a real-time distributed search and analytics engine based on Lucene.
8 | * Designed for cloud environments, it provides real-time search and is stable, reliable, fast, and easy to install and use.
9 | *
10 | * Created by ZXL on 2017/10/23.
11 | */
12 | object ElasticSpark {
13 |
14 | def main(args: Array[String]) {
15 | val conf = new SparkConf().setAppName("ElasticSpark").setMaster("local")
16 | conf.set("es.nodes", "192.168.13.131,192.168.13.132,192.168.13.133")
17 | conf.set("es.port", "9200")
18 | conf.set("es.index.auto.create", "true")
19 | val sc = new SparkContext(conf)
20 | //val query: String = "{\"query\":{\"match_all\":{}}}"
21 | val start = 1463998397
22 | val end = 1463998399
23 | // val query: String =
24 | // s"""{
25 | // "query": {"match_all": {}},
26 | // "filter": {
27 | // "bool": {
28 | // "must": {
29 | // "range": {
30 | // "access.time": {
31 | // "gte": "$start",
32 | // "lte": "$end"
33 | // }
34 | // }
35 | // }
36 | // }
37 | // }
38 | // }"""
39 |
40 | val tp = "1"
41 | val query: String = s"""{
42 | "query": {"match_all": {}},
43 | "filter" : {
44 | "bool": {
45 | "must": [
46 | {"term" : {"access.type" : $tp}},
47 | {
48 | "range": {
49 | "access.time": {
50 | "gte": "$start",
51 | "lte": "$end"
52 | }
53 | }
54 | }
55 | ]
56 | }
57 | }
58 | }"""
59 | val rdd1 = sc.esRDD("accesslogs", query)
60 |
61 | println(rdd1.collect().toBuffer)
62 | println(rdd1.collect().size)
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/flume/FlumePushWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.flume
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.flume.FlumeUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 | * Flume pushes data to Spark
9 | *
10 | * Add three jar packages:
11 | * - commons-lang3-3.3.2.jar
12 | * - scala-library-2.10.5.jar
13 | * - spark-streaming-flume-sink_2.10-1.6.1.jar
14 | *
15 | * Package it as a jar and upload it to the cluster to run
16 | * Cluster command:
17 | * bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.flume.FlumePushWordCount
18 | * /jar/____.jar 192.168.13.131 8888
19 | *
20 | * Created by ZXL on 2017/10/23.
21 | */
22 | object FlumePushWordCount {
23 |
24 | def main(args: Array[String]) {
25 | val host = args(0)
26 | val port = args(1).toInt
27 | val conf = new SparkConf().setAppName("FlumeWordCount")//.setMaster("local[2]")
28 | val ssc = new StreamingContext(conf, Seconds(5))
29 | // Push mode: Flume sends data to Spark
30 | val flumeStream = FlumeUtils.createStream(ssc, host, port)
31 | // The real payload of a Flume event is obtained via event.getBody()
32 | val words = flumeStream.flatMap(x => new String(x.event.getBody().array()).split(" ")).map((_, 1))
33 |
34 | val results = words.reduceByKey(_ + _)
35 | results.print()
36 | ssc.start()
37 | ssc.awaitTermination()
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/jedis/JedisConnectionPool.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.jedis
2 |
3 | import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}
4 |
5 | /**
6 | * Obtains a Jedis connection and performs simple operations
7 | * Created by ZXL on 2016/5/24.
8 | */
9 | object JedisConnectionPool{
10 |
11 | val config = new JedisPoolConfig()
12 | // Maximum number of connections
13 | config.setMaxTotal(10)
14 | // Maximum number of idle connections
15 | config.setMaxIdle(5)
16 | // Whether to validate a connection when borrowObject is called
17 | config.setTestOnBorrow(true)
18 | val pool = new JedisPool(config, "172.16.0.101", 6379)
19 |
20 | def getConnection(): Jedis = {
21 | pool.getResource
22 |
23 | }
24 |
25 | def main(args: Array[String]) {
26 | val conn = JedisConnectionPool.getConnection()
27 | val r = conn.keys("*")
28 | println(r)
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/kafka/DirectKafkaWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.kafka
2 |
3 | import kafka.serializer.StringDecoder
4 | import org.apache.log4j.{Level, Logger}
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.spark.streaming.kafka.KafkaManager
8 | import org.apache.spark.streaming.{Seconds, StreamingContext}
9 |
10 | /**
11 | * Spark Streaming maintains the offset information itself, achieving zero data loss and avoiding duplicate consumption.
12 | * One drawback of the direct approach is that offsets are no longer updated in ZooKeeper.
13 | * Therefore, when consuming Kafka data with the direct approach, the general idea is to first read the offsets saved in ZooKeeper,
14 | * create the stream from those offsets, and write the current offsets back to ZooKeeper after the data has been consumed.
15 | *
16 | * In versions before 2.0 the KafkaManager class has private access, so it has to be copied into the project to be used:
17 | * org.apache.spark.streaming.kafka
18 | *
19 | * Created by ZXL on 2017/11/1.
20 | */
21 | object DirectKafkaWordCount {
22 |
23 | /* def dealLine(line: String): String = {
24 | val list = line.split(',').toList
25 | // val list = AnalysisUtil.dealString(line, ',', '"') // treat the dealString function like split
26 | list.get(0).substring(0, 10) + "-" + list.get(26)
27 | }*/
28 |
29 | def processRdd(rdd: RDD[(String, String)]): Unit = {
30 | val lines = rdd.map(_._2)
31 | val words = lines.flatMap(_.split(" "))
32 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
33 | wordCounts.foreach(println)
34 | }
35 |
36 | def main(args: Array[String]) {
37 | if (args.length < 3) {
38 | System.err.println(
39 | s"""
40 | |Usage: DirectKafkaWordCount <brokers> <topics> <groupId>
41 | | <brokers> is a list of one or more Kafka brokers
42 | | <topics> is a list of one or more kafka topics to consume from
43 | | <groupId> is a consume group
44 | |
45 | """.stripMargin)
46 | System.exit(1)
47 | }
48 |
49 | Logger.getLogger("org").setLevel(Level.WARN)
50 |
51 | val Array(brokers, topics, groupId) = args
52 |
53 | // Create context with 2 second batch interval
54 | val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
55 | sparkConf.setMaster("local[*]")
56 | sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "5")
57 | sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
58 |
59 | val ssc = new StreamingContext(sparkConf, Seconds(2))
60 |
61 | // Create direct kafka stream with brokers and topics
62 | val topicsSet = topics.split(",").toSet
63 | val kafkaParams = Map[String, String](
64 | "metadata.broker.list" -> brokers,
65 | "group.id" -> groupId,
66 | "auto.offset.reset" -> "smallest"
67 | )
68 |
69 | val km = new KafkaManager(kafkaParams)
70 |
71 | val messages = km.createDirectStream[String, String, StringDecoder, StringDecoder](
72 | ssc, kafkaParams, topicsSet)
73 |
74 | messages.foreachRDD(rdd => {
75 | if (!rdd.isEmpty()) {
76 | // Process the messages first
77 | processRdd(rdd)
78 | // Then update the offsets
79 | km.updateZKOffsets(rdd)
80 | }
81 | })
82 |
83 | ssc.start()
84 | ssc.awaitTermination()
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/kafka/KafkaWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.kafka
2 |
3 | import org.apache.spark.storage.StorageLevel
4 | import org.apache.spark.streaming.kafka.KafkaUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 | import org.apache.spark.{HashPartitioner, SparkConf}
7 |
8 | /**
9 | * Reads data from Kafka on the cluster
10 | *
11 | * Runtime arguments:
12 | * node1:2181,node2:2181,node3:2181 g1 test 2
13 | * where g1 is the consumer group name (any value works here) and test is the topic name, which must match the topic name in Kafka
14 | *
15 | * Cluster commands (the cluster must already be started):
16 | * 1. Start Kafka
17 | * bin/kafka-server-start.sh config/server.properties > /dev/null 2>&1 &
18 | * 2. Create the topic
19 | * bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 3 --partitions 3 --topic test
20 | * 3. Produce data to the topic
21 | * bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test
22 | *
23 | * Created by ZXL on 2017/11/1.
24 | */
25 | object KafkaWordCount {
26 |
27 | val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => {
28 | //iter.flatMap(it => Some(it._2.sum + it._3.getOrElse(0)).map(x => (it._1, x)))
29 | iter.flatMap{case(x, y, z) => Some(y.sum + z.getOrElse(0)).map(i => (x, i))}
30 | }
31 |
32 | def main(args: Array[String]) {
33 |
34 | LoggerLevels.setStreamingLogLevels()
35 | val Array(zkQuorum, group, topics, numThreads) = args
36 | val sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]")
37 | val ssc = new StreamingContext(sparkConf, Seconds(5))
38 | ssc.checkpoint("D:\\test\\spark\\checkpoint2")
39 | // Number of consumer threads per topic
40 | val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
41 | val data = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_AND_DISK_SER)
42 | // The stream yields (K, V) pairs; _._2 is the value, whose input is split on spaces
43 | val words = data.map(_._2).flatMap(_.split(" "))
44 | val wordCounts = words.map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true)
45 | wordCounts.print()
46 |
47 | ssc.start()
48 | ssc.awaitTermination()
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/kafka/LoggerLevels.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.kafka
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.internal.Logging
5 |
6 | object LoggerLevels extends Logging {
7 |
8 | def setStreamingLogLevels() {
9 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
10 | if (!log4jInitialized) {
11 | logInfo("Setting log level to [WARN] for streaming example." +
12 | " To override add a custom log4j.properties to the classpath.")
13 | Logger.getRootLogger.setLevel(Level.WARN)
14 | }
15 | }
16 | }
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/my_partitioner/UrlCountPartition.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.my_partitioner
2 |
3 | import java.net.URL
4 |
5 | import org.apache.spark.{Partitioner, SparkContext, SparkConf}
6 | import scala.collection.mutable
7 |
8 | /**
9 | * 自定义分区
10 | * 数据格式(时间点 url地址),例如:
11 | * 20160321101954 http://net.zxl.cn/net/video.shtml
12 | * 处理成数据(k, v)
13 | * 对于数据(k, v)
14 | * 重写自己的 partitioner
15 | * Created by ZXL on 2017/10/20.
16 | */
17 | object UrlCountPartition {
18 |
19 | def main(args: Array[String]) {
20 |
21 | val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]")
22 | val sc = new SparkContext(conf)
23 |
24 | // rdd1 splits the data; the tuples hold (URL, 1)
25 | val rdd1 = sc.textFile("D://test//spark//adv_url_count.log").map(line => {
26 | val f = line.split("\t")
27 | (f(1), 1)
28 | })
29 |
30 | val rdd2 = rdd1.reduceByKey(_ + _)
31 |
32 | // (URL, n)
33 | val rdd3 = rdd2.map(t => {
34 | val url = t._1
35 | val host = new URL(url).getHost
36 | // host is something like php.zxl.cn
37 | (host, (url, t._2))
38 | })
39 |
40 | // The result is ArrayBuffer(net.zxl.cn, java.zxl.cn, php.zxl.cn)
41 | val ints = rdd3.map(_._1).distinct().collect()
42 | // rdd3.repartition(3).saveAsTextFile("D://test//spark//out//out1")
43 | // println(ints.toBuffer)
44 |
45 | val hostPartitioner = new HostPartitioner(ints)
46 | // Extract the top records within each partition
47 | val rdd4 = rdd3.partitionBy(hostPartitioner).mapPartitions(it => {
48 | it.toList.sortBy(_._2._2).reverse.take(2).iterator
49 | })
50 |
51 | rdd4.saveAsTextFile("D://test//spark//out//out3")
52 |
53 | sc.stop()
54 | }
55 | }
56 |
57 | class HostPartitioner(ins: Array[String]) extends Partitioner {
58 |
59 | val parMap = new mutable.HashMap[String, Int]()
60 | var count = 0
61 | for(i <- ins) {
62 | parMap += (i -> count)
63 | count += 1
64 | }
65 |
66 | override def numPartitions: Int = ins.length
67 |
68 | override def getPartition(key: Any): Int = {
69 | // Get the partition index for the key
70 | parMap.getOrElse(key.toString, 0)
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/my_sort/CustomSort.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.my_sort
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | // Second approach
6 | object OrderContext {
7 |
8 | /**
9 | * First form
10 |
11 | implicit object GirlOrdering extends Ordering[Girl] {
12 | override def compare(x: Girl, y: Girl): Int = {
13 | if(x.faceValue > y.faceValue) 1
14 | else if(x.faceValue == y.faceValue) {
15 | if(x.age > y.age) -1 else 1
16 | } else -1
17 | }
18 | }
19 | */
20 |
21 | /**
22 | * Second form
23 | */
24 | implicit val girlOrdering = new Ordering[Girl] {
25 | override def compare(x: Girl, y: Girl): Int = {
26 | if(x.faceValue > y.faceValue) 1
27 | else if(x.faceValue == y.faceValue) {
28 | if(x.age > y.age) -1 else 1
29 | } else -1
30 | }
31 | }
32 | }
33 |
34 | /**
35 | * Created by ZXL on 2017/10/21.
36 | * Custom sorting
37 | */
38 | object CustomSort {
39 |
40 | def main(args: Array[String]) {
41 | val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]")
42 | val sc = new SparkContext(conf)
43 | val rdd1 = sc.parallelize(List(("zzz", 90, 28, 1), ("xxx", 90, 27, 2), ("lll", 95, 22, 3)))
44 | import OrderContext._
45 | val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), false)
46 | println(rdd2.collect().toBuffer)
47 | sc.stop()
48 | }
49 | }
50 |
51 | /**
52 | * First approach
53 | * @param faceValue
54 | * @param age
55 |
56 | case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable {
57 | override def compare(that: Girl): Int = {
58 | if(this.faceValue == that.faceValue) {
59 | that.age - this.age
60 | } else {
61 | this.faceValue - that.faceValue
62 | }
63 | }
64 | }
65 | */
66 |
67 | // Second approach
68 | case class Girl(faceValue: Int, age: Int) extends Serializable
69 |
70 |
71 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/mysql/JdbcRDDDemo.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.mysql
2 |
3 | import java.sql.DriverManager
4 |
5 | import org.apache.spark.rdd.JdbcRDD
6 | import org.apache.spark.{SparkConf, SparkContext}
7 |
8 | /**
9 | * A simple database-connection example
10 | * Created by ZXL on 2017/10/22.
11 | */
12 | object JdbcRDDDemo {
13 |
14 | def main(args: Array[String]) {
15 | val conf = new SparkConf().setAppName("JdbcRDDDemo").setMaster("local[2]")
16 | val sc = new SparkContext(conf)
17 | val connection = () => {
18 | Class.forName("com.mysql.jdbc.Driver").newInstance()
19 | DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?useUnicode=true&characterEncoding=utf-8", "root", "1234")
20 | }
21 | val jdbcRDD = new JdbcRDD(
22 | sc,
23 | connection,
24 | "SELECT * FROM ta where id >= ? AND id <= ?",
25 | // 1 and 4 fill the two placeholders; 2 means two tasks read the data in parallel
26 | 1, 4, 2,
27 | // The content returned for each row
28 | r => {
29 | val id = r.getInt(1)
30 | val code = r.getString(2)
31 | (id, code)
32 | }
33 | )
34 | val data = jdbcRDD.collect()
35 | println(data.toBuffer)
36 | sc.stop()
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/simple/AdvUrlCount.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.simple
2 |
3 | import java.net.URL
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | /**
8 | * Reads the text content and, for each specified subject, extracts the top three URLs by click count.
9 | * The text contains ad-link click counts in the format:
10 | * (timestamp subject-url)
11 | * Example: (20160321101957 http://net.zxl.cn/net/course.shtml)
12 | * Created by ZXL on 2017/10/16.
13 | */
14 | object AdvUrlCount {
15 |
16 | def main(args: Array[String]) {
17 |
18 | // Load the rules from the database
19 | val arr = Array("java.zxl.cn", "php.zxl.cn", "net.zxl.cn")
20 |
21 | val conf = new SparkConf().setAppName("AdvUrlCount").setMaster("local[2]")
22 | val sc = new SparkContext(conf)
23 |
24 | // rdd1 splits the data; the tuples hold (URL, 1)
25 | val rdd1 = sc.textFile("D://test//spark//advUrlCount.log").map(line => {
26 | val f = line.split("\t")
27 | (f(1), 1)
28 | })
29 | val rdd2 = rdd1.reduceByKey(_ + _)
30 |
31 | val rdd3 = rdd2.map(t => {
32 | val url = t._1
33 | val host = new URL(url).getHost
34 | (host, url, t._2)
35 | })
36 |
37 | // println(rdd3.collect().toBuffer)
38 |
39 | // val rddjava = rdd3.filter(_._1 == "java.zxl.cn")
40 | // val sortdjava = rddjava.sortBy(_._3, false).take(3)
41 | // val rddphp = rdd3.filter(_._1 == "php.zxl.cn")
42 |
43 | for (ins <- arr) {
44 | val rdd = rdd3.filter(_._1 == ins)
45 | val result= rdd.sortBy(_._3, false).take(3)
46 | // Store the data into the database via JDBC
47 | // id, subject, URL, count, access date
48 | println(result.toBuffer)
49 | }
50 |
51 | //println(sortdjava.toBuffer)
52 | sc.stop()
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/simple/IpDemo.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.simple
2 |
3 | import scala.collection.mutable.ArrayBuffer
4 | import scala.io.Source
5 |
6 | /**
7 | * The data format is as follows:
8 | * (1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302)
9 | * The IP address is converted to a number and its detailed record is looked up in the data set.
10 | * Binary search is used to speed up the lookup.
11 | * Created by ZXL on 2017/10/22.
12 | */
13 | object IpDemo {
14 |
15 | // Convert an IP address to a number
16 | // e.g. for 100.101.102.103, start from 100 and shift left by 8 bits for each octet
17 | def ip2Long(ip: String): Long = {
18 | val fragments = ip.split("[.]")
19 | var ipNum = 0L
20 | for (i <- 0 until fragments.length) {
21 | // | is the bitwise OR operator
22 | // Shifting ipNum left by 8 bits is equivalent to multiplying by 256 (i.e. 2^8)
23 | ipNum = fragments(i).toLong | ipNum << 8L
24 | }
25 | ipNum
26 | }
27 |
28 | // Read the data from a file
29 | def readData(path: String) = {
30 |
31 | val lines = new ArrayBuffer[String]()
32 |
33 | /**
34 | * Reading the file the Java way:
35 | * val br = new BufferedReader(new InputStreamReader(new FileInputStream(path)))
36 | * var s: String = null
37 | * var flag = true
38 | * while (flag) {
39 | * s = br.readLine()
40 | * if (s != null)
41 | * lines += s
42 | * else
43 | * flag = false
44 | * }
45 | * lines
46 | */
47 |
48 | val content = Source.fromFile(path)
49 | for (line <- content.getLines()) {
50 | lines += line
51 | }
52 | lines
53 | }
54 |
55 | // Binary search for the index of the ip, which has already been converted to a decimal number
56 | def binarySearch(lines: ArrayBuffer[String], ip: Long): Int = {
57 | var low = 0
58 | var high = lines.length - 1
59 | while (low <= high) {
60 | val middle = (low + high) / 2
61 | if ((ip >= lines(middle).split("\\|")(2).toLong) && (ip <= lines(middle).split("\\|")(3).toLong))
62 | return middle
63 | if (ip < lines(middle).split("\\|")(2).toLong)
64 | high = middle - 1
65 | else
66 | low = middle + 1
67 | }
68 | -1
69 | }
70 |
71 | def main(args: Array[String]) {
72 | val ip = "120.55.185.61"
73 | val ipNum = ip2Long(ip)
74 | println(ipNum)
75 | val lines = readData("d://test//spark//ip//ip.txt")
76 | val index = binarySearch(lines, ipNum)
77 | print(lines(index))
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/simple/UserLocation.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.simple
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Uses the logs to find, for each user, the top 2 base stations where they spent the most time.
7 | * Each log line has the format (phone number, timestamp, base station id, event type); event type 1 means entering the station, 0 means leaving it.
8 | * 1. Using "phone_station" as the unique key, compute the time for one enter/leave pair, returning (phone_station, interval)
9 | * 2. With "phone_station" as the key, sum the time per station: ("phone_station", total time)
10 | * 3. ("phone_station", total time) --> (phone, station, total time)
11 | * 4. (phone, station, total time) --> groupBy().mapValues(sort by time, take the top 2) --> (phone -> ((m,s,t)(m,s,t)))
12 | * Created by ZXL on 2017/10/15.
13 | */
14 | object UserLocation {
15 |
16 | def main(args: Array[String]) {
17 | val conf = new SparkConf().setAppName("UserLocation").setMaster("local[2]")
18 | val sc = new SparkContext(conf)
19 | //sc.textFile("D://test//spark//bs_log").map(_.split(",")).map(x => (x(0), x(1), x(2), x(3)))
20 | val mbt = sc.textFile("D://test//spark//bs_log").map( line => {
21 | val fields = line.split(",")
22 | val eventType = fields(3)
23 | val time = fields(1)
24 | val timeLong = if(eventType == "1") -time.toLong else time.toLong
25 | (fields(0) + "_" + fields(2), timeLong)
26 | })
27 | //println(mbt.collect().toBuffer)
28 | //(18611132889_9F36407EAD0629FC166F14DDE7970F68,54000)
29 | val rdd1 = mbt.groupBy(_._1).mapValues(_.foldLeft(0L)(_ + _._2))
30 | val rdd2 = rdd1.map( t => {
31 | val mobile_bs = t._1
32 | val mobile = mobile_bs.split("_")(0)
33 | val lac = mobile_bs.split("_")(1)
34 | val time = t._2
35 | (mobile, lac, time)
36 | })
37 | val rdd3 = rdd2.groupBy(_._1)
38 | //ArrayBuffer((18688888888,List((18688888888,16030401EAFB68F1E3CDF819735E1C66,87600), (18688888888,9F36407EAD0629FC166F14DDE7970F68,51200))), (18611132889,List((18611132889,16030401EAFB68F1E3CDF819735E1C66,97500), (18611132889,9F36407EAD0629FC166F14DDE7970F68,54000))))
39 | val rdd4 = rdd3.mapValues(it => {
40 | it.toList.sortBy(_._3).reverse.take(2)
41 | })
42 | println(rdd4.collect().toBuffer)
43 | sc.stop()
44 | }
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/simple/WordCount.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.simple
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * A simple WordCount implementation
7 | * Created by ZXL on 2017/10/12.
8 | *
9 | * Example of running on the cluster with the relevant configuration:
10 | * bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.simple.WordCount --executor-memory 512m
11 | * --total-executor-cores 2 /opt/soft/jar/hello-spark-1.0.jar hdfs://node1:9000/wc hdfs://node1:9000/out
12 | */
13 | object WordCount {
14 |
15 | def main(args: Array[String]) {
16 | // Very important: this is the entry point to the Spark cluster
17 | val conf = new SparkConf().setAppName("WordCount")
18 | val sc = new SparkContext(conf)
19 |
20 | // reduceByKey(_+_, 1) would set the number of partitions to 1, i.e. produce a single output file
21 | sc.textFile(args(0)).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_+_).sortBy(_._2, false).saveAsTextFile(args(1))
22 | sc.stop()
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/streaming/LoggerLevels.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.streaming
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.internal.Logging
5 |
6 | /**
7 | * Sets the log level for printed output
8 | */
9 | object LoggerLevels extends Logging {
10 |
11 | def setStreamingLogLevels() {
12 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
13 | if (!log4jInitialized) {
14 | logInfo("Setting log level to [WARN] for streaming example." +
15 | " To override add a custom log4j.properties to the classpath.")
16 | Logger.getRootLogger.setLevel(Level.WARN)
17 | }
18 | }
19 | }
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/streaming/StateFulWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.streaming
2 |
3 | import org.apache.spark.streaming.{Seconds, StreamingContext}
4 | import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
5 |
6 | /**
7 | * Spark Streaming stateful accumulation (updateStateByKey)
8 | * Created by ZXL on 2017/11/1.
9 | */
10 | object StateFulWordCount {
11 |
12 | // Seq: the counts of a given word in this batch
13 | // Option[Int]: the previous result
14 | // The data is already grouped by key
15 | // updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)]
16 | val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => {
17 | // The following variants produce the same result
18 | //iter.flatMap(it => Some(it._2.sum + it._3.getOrElse(0)).map(x => (it._1, x)))
19 | //iter.map(t => (t._1, t._2.sum + t._3.getOrElse(0)))
20 | //iter.map{case(x, y, z) => Some(y.sum + z.getOrElse(0)).map(m => (x, m))}
21 | iter.map{case(word, current_count, history_count) => (word, current_count.sum + history_count.getOrElse(0))}
22 | }
23 |
24 | def main(args: Array[String]) {
25 | LoggerLevels.setStreamingLogLevels()
26 | // StreamingContext
27 | val conf = new SparkConf().setAppName("StateFulWordCount").setMaster("local[2]")
28 | val sc = new SparkContext(conf)
29 | // updateStateByKey requires setCheckpointDir to be set
30 | sc.setCheckpointDir("D:\\test\\spark\\checkpoint")
31 | val ssc = new StreamingContext(sc, Seconds(5))
32 |
33 | val ds = ssc.socketTextStream("192.168.13.131", 8888)
34 |
35 | // A DStream is a special kind of RDD
36 | // hello tom hello jerry
37 | val result = ds.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(sc.defaultParallelism), true)
38 | result.print()
39 | ssc.start()
40 | ssc.awaitTermination()
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/streaming/StreamingWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.streaming
2 |
3 | import org.apache.spark.streaming.{Seconds, StreamingContext}
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | /**
7 | * A simple WordCount implemented with Spark Streaming
8 | * Created by ZXL on 2017/10/31.
9 | */
10 | object StreamingWordCount {
11 |
12 | def main(args: Array[String]) {
13 | // Set the log level
14 | LoggerLevels.setStreamingLogLevels()
15 |
16 | // StreamingContext
17 | val conf = new SparkConf().setAppName("StreamingWordCount").setMaster("local[2]")
18 | val sc = new SparkContext(conf)
19 | val ssc = new StreamingContext(sc, Seconds(5))
20 |
21 | // Receive data; use nc bound to this ip and port to send data
22 | val ds = ssc.socketTextStream("192.168.13.131", 8888)
23 |
24 | // A DStream is a special kind of RDD
25 | // hello tom hello jerry
26 | val result = ds.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_+_)
27 |
28 | // Print the result
29 | result.print()
30 | ssc.start()
31 | ssc.awaitTermination()
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark1_6/streaming/WindowOpts.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark1_6.streaming
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
5 |
6 | /**
7 | * Spark Streaming window operations
8 | * Created by ZXL on 2017/11/2.
9 | */
10 | object WindowOpts {
11 |
12 | def main(args: Array[String]) {
13 | LoggerLevels.setStreamingLogLevels()
14 | val conf = new SparkConf().setAppName("WindowOpts").setMaster("local[2]")
15 | val ssc = new StreamingContext(conf, Milliseconds(5000))
16 | val lines = ssc.socketTextStream("192.168.13.131", 9999)
17 | val pairs = lines.flatMap(_.split(" ")).map((_, 1))
18 | // Seconds(15): window length, Seconds(10): slide interval
19 | val windowedWordCounts = pairs.reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(15), Seconds(10))
20 | windowedWordCounts.print()
21 | // Map((hello, 5), (jerry, 2), (kitty, 3))
22 | val a = windowedWordCounts.map(_._2).reduce(_+_)
23 | a.foreachRDD(rdd => {
24 | println(rdd.take(0))
25 | })
26 | a.print()
27 |
28 | // windowedWordCounts.map(t => (t._1, t._2.toDouble / a.toD))
29 | ssc.start()
30 | ssc.awaitTermination()
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/dataset/actions.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.dataset
2 |
3 | import org.apache.spark.sql._
4 | import org.apache.spark.sql.functions._
5 |
6 | /**
7 | * DataSet operations
8 | * Created by ZXL on 2018/1/28.
9 | */
10 | object actions {
11 |
12 | // Build the SparkSession
13 | val spark = SparkSession.builder()
14 | .master("local[2]")
15 | .appName("createDataSet")
16 | .enableHiveSupport()
17 | .getOrCreate()
18 |
19 | // Import the implicits needed by the operations below
20 | import spark.implicits._
21 |
22 | // 1. map and flatMap
23 | val seq1 = Seq(Peoples(21, "zxl,wr,hy"), Peoples(20, "cc,hw,lwq"))
24 | val ds1 = spark.createDataset(seq1)
25 | val ds2 = ds1.map{ x => (x.age + 1, x.names)}.show()
26 | val ds3 = ds1.flatMap{ x =>
27 | val a = x.age
28 | val s = x.names.split(",").map{ x => (a, x)}
29 | s
30 | }.show()
31 |
32 | // 2. filter and where
33 | val seq2 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165))
34 | val ds4 = spark.createDataset(seq2)
35 | ds4.filter("age >= 20 and height >= 170").show()
36 | ds4.filter($"age" >= 20 && $"height" >= 170).show()
37 | ds4.filter{x => x.age > 20 && x.height >= 170}.show()
38 | ds4.where("age >= 20 and height >= 170").show()
39 | ds4.where($"age" >= 20 && $"height" >= 170).show()
40 |
41 | // 3. Deduplication
42 | ds4.distinct().show()
43 | ds4.dropDuplicates("age").show()
44 | ds4.dropDuplicates("age", "height").show()
45 | ds4.dropDuplicates(Seq("age", "height")).show()
46 | ds4.dropDuplicates(Array("age", "height")).show()
47 |
48 | // 4. Set addition/subtraction (union, except, intersect)
49 | val seq3 = Seq(Person("zxl2", 29, 170), Person("wx2", 30, 165), Person("cc2", 30, 165))
50 | val ds5 = spark.createDataset(seq3)
51 | ds4.except(ds5).show()
52 | ds4.union(ds5).show()
53 | ds4.intersect(ds5).show()
54 |
55 | // 5. select
56 | ds5.select("name", "age").show()
57 | ds5.select(expr("height + 1").as[Int]).show()
58 |
59 | // 6. Sorting
60 | ds5.sort("age").show()
61 | ds5.sort($"age".desc, $"height".desc).show()
62 | ds5.orderBy("age").show()
63 | ds5.orderBy($"age".desc, $"height".desc).show()
64 |
65 | // 7. Random split and sampling
66 | val ds6 = ds4.union(ds5)
67 | val rands = ds6.randomSplit(Array(0.3, 0.7))
68 | rands(0).count()
69 | rands(1).count()
70 | rands(0).show()
71 | rands(1).show()
72 | val ds7 = ds6.sample(false, 0.5)
73 | ds7.count()
74 | ds7.show()
75 |
76 | // 8. Column operations
77 | val ds8 = ds6.drop("height")
78 | ds8.columns
79 | ds8.show()
80 | val ds9 = ds6.withColumn("add2", $"age" + 2) // Add a column to the dataset
81 | ds9.columns
82 | ds9.show()
83 | val ds10 = ds9.withColumnRenamed("add2", "age_new")
84 | ds10.columns
85 | ds10.show()
86 | ds6.withColumn("add_col", lit(1)).show()
87 |
88 | // 9. join
89 | val seq4 = Seq(Score("zxl", 85), Score("wr", 90), Score("hy", 95))
90 | val ds11 = spark.createDataset(seq4)
91 | val ds12 = ds5.join(ds11, Seq("name"), "inner")
92 | ds12.show()
93 | val ds13 = ds5.join(ds11, Seq("name"), "left")
94 | ds13.show()
95 |
96 | // 10. Group-by and aggregation
97 | val ds14 = ds4.union(ds5).groupBy("height").agg(avg("age")).as("avg_agg")
98 | ds14.show()
99 | }
100 |
101 | case class Score(name: String, score: Int)
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/dataset/basicAction.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.dataset
2 |
3 | import org.apache.spark.sql._
4 | import org.apache.spark.sql.types._
5 | import org.apache.spark.storage.StorageLevel._
6 |
7 | /**
8 | * Basic DataSet operations
9 | * Created by ZXL on 2018/1/28.
10 | */
11 | object basicAction {
12 |
13 | // Build the SparkSession
14 | val spark = SparkSession.builder()
15 | .master("local[2]")
16 | .appName("createDataSet")
17 | .enableHiveSupport()
18 | .getOrCreate()
19 |
20 | // Import the implicits needed by the operations below
21 | import spark.implicits._
22 |
23 | // 1. DataSet storage levels (caching)
24 | val seq1 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165))
25 | val ds1 = spark.createDataset(seq1)
26 | ds1.show()
27 | ds1.checkpoint()
28 | ds1.cache()
29 | ds1.persist(MEMORY_ONLY)
30 | ds1.count()
31 | ds1.show()
32 | ds1.unpersist(true) // Remove the DataSet from the cache
33 |
34 | // 2. Retrieve the data
35 | val c1 = ds1.collect()
36 | val c2 = ds1.collectAsList()
37 | val h1 = ds1.head()
38 | val h2 = ds1.head(3)
39 | val f1 = ds1.first()
40 | val t1 = ds1.take(2)
41 | val t2 = ds1.takeAsList(2)
42 |
43 | // 3. DataSet statistics
44 | ds1.count()
45 | ds1.describe().show()
46 | ds1.describe("age").show()
47 | ds1.describe("age", "height").show()
48 |
49 | // 4. Aggregation
50 | ds1.reduce((f1, f2) => Person("sum", (f1.age + f2.age), (f1.height + f2.height)))
51 |
52 | // 5. DataSet structure properties
53 | ds1.columns
54 | ds1.dtypes
55 | ds1.explain() // Print the physical execution plan
56 |
57 | // 6. Conversion between DataSet and RDD
58 | val rdd1 = ds1.rdd
59 | val ds2 = rdd1.toDS()
60 | ds2.show()
61 | val df2 = rdd1.toDF()
62 | df2.show()
63 |
64 | // 7. Save the DataSet to a file
65 | ds1.select("name", "age", "height").write.format("csv").save("hdfs://node1:9000/test2.csv")
66 | // Read the saved file back
67 | val schema2 = StructType(
68 | StructField("name", StringType, false) ::
69 | StructField("age", IntegerType, false) ::
70 | StructField("name", IntegerType, true) :: Nil)
71 | val out = spark.read.
72 | options(Map(("delimiter", ","), ("header", "false"))).
73 | schema(schema2).csv("hdf2://node:9000/test2.csv")
74 | out.show(10)
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/dataset/createDataSet.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.dataset
2 |
3 | import org.apache.spark.sql._
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types._
6 |
7 | /**
8 | * Several ways to create a DataSet
9 | * Created by ZXL on 2018/1/28.
10 | */
11 | object createDataSet {
12 |
13 | // Build the SparkSession
14 | val spark = SparkSession.builder()
15 | .master("local[2]")
16 | .appName("createDataSet")
17 | .enableHiveSupport()
18 | .getOrCreate()
19 |
20 | // Import the implicits needed by the operations below
21 | import spark.implicits._
22 |
23 | // Set the checkpoint directory
24 | spark.sparkContext.setCheckpointDir("hdfs://node1:9000/user/spark_checkpoint")
25 |
26 | // 1. Generate a DataSet from a range
27 | val numDS = spark.range(5, 100, 5)
28 | numDS.orderBy(desc("id")).show(5)
29 | numDS.describe().show()
30 |
31 | // 2. Create a DataSet from a collection
32 | val seq1 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165))
33 | val ds1 = spark.createDataset(seq1)
34 | ds1.show()
35 |
36 | // 3. Create a DataFrame from a collection
37 | val df1 = spark.createDataFrame(seq1).withColumnRenamed("_1", "name").withColumnRenamed("_2", "age")
38 | df1.orderBy(desc("age")).show(10)
39 |
40 | // 4. Convert an RDD to a DataFrame
41 | val array1 = Array(("zxl", 29, 170), ("wx", 30, 165), ("cc", 30, 165))
42 | val rdd1 = spark.sparkContext.parallelize(array1, 3).map(f => Row(f._1, f._2, f._3))
43 | val schema = StructType(
44 | StructField("name", StringType, false) ::
45 | StructField("age", IntegerType, true) :: Nil)
46 | val rddToDataFrame = spark.createDataFrame(rdd1, schema)
47 | rddToDataFrame.orderBy(desc("name")).show(false)
48 |
49 | // 5. Convert an RDD to a DataSet/DataFrame
50 | val rdd2 = spark.sparkContext.parallelize(array1, 3).map(f => Person(f._1, f._2, f._3))
51 | val ds2 = rdd2.toDS()
52 | val df2 = rdd2.toDF()
53 | ds2.orderBy(desc("name")).show(10)
54 | df2.orderBy(desc("name")).show(10)
55 |
56 | // 6. Convert an RDD to a DataSet
57 | val ds3 = spark.createDataset(rdd2)
58 | ds3.show(10)
59 |
60 | // 7. Read a file
61 | val df4 = spark.read.csv("hdf2://node:9000/test.csv")
62 | df4.show()
63 |
64 | // 8. Read a file with detailed options
65 | val schema2 = StructType(
66 | StructField("name", StringType, false) ::
67 | StructField("age", IntegerType, false) ::
68 | StructField("name", IntegerType, true) :: Nil)
69 | val df7 = spark.read.
70 | options(Map(("delimiter", ","), ("header", "false"))).
71 | schema(schema2).csv("hdf2://node:9000/test.csv")
72 | }
73 |
74 | case class Person(name: String, age: Int, height: Int)
75 | case class Peoples(age: Int, names: String)
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/kafka/StreamingKafka10.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.kafka
2 |
3 | import org.apache.kafka.common.serialization.StringDeserializer
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
6 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
7 | import org.apache.spark.streaming.kafka010._
8 | import org.apache.spark.streaming.{Seconds, StreamingContext}
9 |
10 | /**
11 | * Spark Streaming reads data from Kafka
12 | * Kafka version 0.10
13 | * Uses the direct connection approach
14 | *
15 | * Created by ZXL on 2017/10/15.
16 | */
17 | object StreamingKafka10 {
18 |
19 | def main(args: Array[String]): Unit = {
20 |
21 | val spark = SparkSession.builder()
22 | .master("local[2]")
23 | .appName("streaming").getOrCreate()
24 |
25 | val sc =spark.sparkContext
26 | val ssc = new StreamingContext(sc, Seconds(5))
27 | val kafkaParams = Map[String, Object](
28 | "bootstrap.servers" -> "node2:9092",
29 | "key.deserializer" -> classOf[StringDeserializer],
30 | "value.deserializer" -> classOf[StringDeserializer],
31 | "group.id" -> "0001",
32 | "auto.offset.reset" -> "latest",
33 | "enable.auto.commit" -> (false: java.lang.Boolean)
34 | )
35 | val topics = Array("weblogs")
36 | val stream = KafkaUtils.createDirectStream[String, String](
37 | ssc,
38 | PreferConsistent,
39 | Subscribe[String, String](topics, kafkaParams)
40 | )
41 |
42 | val lines = stream.map(x => x.value())
43 | val words = lines.flatMap(_.split(" "))
44 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
45 | wordCounts.print()
46 |
47 | ssc.start()
48 | ssc.awaitTermination()
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/kafka/StreamingKafka8.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.kafka
2 |
3 | import kafka.serializer.StringDecoder
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.streaming.kafka.KafkaUtils
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 |
8 | /**
9 | * Spark Streaming reads data from Kafka
10 | * Kafka version 0.8
11 | * Uses the direct connection approach
12 | *
13 | * Created by ZXL on 2017/10/15.
14 | */
15 | object StreamingKafka8 {
16 |
17 | def main(args: Array[String]): Unit = {
18 |
19 | val spark = SparkSession.builder()
20 | .master("local[2]")
21 | .appName("streaming").getOrCreate()
22 |
23 | val sc =spark.sparkContext
24 | val ssc = new StreamingContext(sc, Seconds(5))
25 |
26 | // Create direct kafka stream with brokers and topics
27 | val topicsSet =Set("weblogs")
28 | val kafkaParams = Map[String, String]("metadata.broker.list" -> "node1:9092")
29 | val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
30 | ssc, kafkaParams, topicsSet)
31 |
32 | val lines = kafkaStream.map(x => x._2)
33 | val words = lines.flatMap(_.split(" "))
34 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
35 | wordCounts.print()
36 |
37 | ssc.start()
38 | ssc.awaitTermination()
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/streaming/StreamingToMysql.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.streaming
2 |
3 | import java.sql.DriverManager
4 |
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 |
8 | /**
9 | * Spark Streaming reads data and stores it in MySQL
10 | *
11 | * Created by ZXL on 2017/10/23.
12 | */
13 | object StreamingToMysql {
14 |
15 | def main(args: Array[String]): Unit = {
16 |
17 | val spark = SparkSession.builder()
18 | .master("local[2]")
19 | .appName("streaming").getOrCreate()
20 |
21 | val sc =spark.sparkContext
22 | val ssc = new StreamingContext(sc, Seconds(5))
23 | val lines = ssc.socketTextStream("node2", 9999)
24 | val words = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
25 |
26 | words.foreachRDD(rdd => rdd.foreachPartition(line => {
27 | Class.forName("com.mysql.jdbc.Driver")
28 | val conn = DriverManager
29 | .getConnection("jdbc:mysql://node3:3306/test","root","1234")
30 | try{
31 | for(row <- line){
32 | val sql = "insert into webCount(titleName,count)values('"+row._1+"',"+row._2+")"
33 | conn.prepareStatement(sql).executeUpdate()
34 | }
35 | }finally {
36 | conn.close()
37 | }
38 | }))
39 |
40 | //words.print()
41 | ssc.start()
42 | ssc.awaitTermination()
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/structured/JDBCSink.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.structured
2 |
3 | import java.sql._
4 |
5 | import org.apache.spark.sql.{ForeachWriter, Row}
6 |
7 | /**
8 | * Handles writing data from Structured Streaming into MySQL
9 | *
10 | * Created by ZXL on 2017/10/15.
11 | */
12 | class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {
13 |
14 | var statement: Statement = _
15 | var resultSet: ResultSet = _
16 | var connection: Connection = _
17 |
18 | override def open(partitionId: Long, version: Long): Boolean = {
19 | connection = new MySqlPool(url, username, password).getJdbcConn()
20 | statement = connection.createStatement()
21 | return true
22 | }
23 |
24 | override def process(value: Row): Unit = {
25 |
26 | val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
27 | val count = value.getAs[Long]("count")
28 |
29 | val querySql = "select 1 from webCount " +
30 | "where titleName = '" + titleName + "'"
31 |
32 | val updateSql = "update webCount set " +
33 | "count = " + count + " where titleName = '" + titleName + "'"
34 |
35 | val insertSql = "insert into webCount(titleName,count)" +
36 | "values('" + titleName + "'," + count + ")"
37 |
38 | try {
39 |
40 | // Check whether a record for this titleName already exists
41 | var resultSet = statement.executeQuery(querySql)
42 | if (resultSet.next()) {
43 | statement.executeUpdate(updateSql)
44 | } else {
45 | statement.execute(insertSql)
46 | }
47 | } catch {
48 | case ex: SQLException => {
49 | println("SQLException")
50 | }
51 | case ex: Exception => {
52 | println("Exception")
53 | }
54 | case ex: RuntimeException => {
55 | println("RuntimeException")
56 | }
57 | case ex: Throwable => {
58 | println("Throwable")
59 | }
60 | }
61 | }
62 |
63 | override def close(errorOrNull: Throwable): Unit = {
64 | // if(resultSet.wasNull()){
65 | // resultSet.close()
66 | // }
67 | if (statement != null) {
68 | statement.close()
69 | }
70 | if (connection != null) {
71 | connection.close()
72 | }
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/structured/MySqlPool.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.structured
2 |
3 | import java.sql.{Connection, DriverManager}
4 | import java.util
5 |
6 | /**
7 | * Obtains connections from a MySQL connection pool
8 | *
9 | * Created by ZXL on 2017/10/15.
10 | */
11 | class MySqlPool(url: String, user: String, pwd: String) extends Serializable {
12 | // Maximum number of connections in the pool
13 | private val max = 3
14 |
15 | // Number of connections created at a time
16 | private val connectionNum = 1
17 |
18 | // Number of connections the pool has created so far
19 | private var conNum = 0
20 |
21 | private val pool = new util.LinkedList[Connection]() // the connection pool
22 |
23 | // Get a connection
24 | def getJdbcConn(): Connection = {
25 | // Synchronized block; AnyRef is the base class of all reference types, AnyVal of all value types
26 | AnyRef.synchronized({
27 | if (pool.isEmpty) {
28 | // Load the driver
29 | preGetConn()
30 | for (i <- 1 to connectionNum) {
31 | val conn = DriverManager.getConnection(url, user, pwd)
32 | pool.push(conn)
33 | conNum += 1
34 | }
35 | }
36 | pool.poll()
37 | })
38 | }
39 |
40 | // Release a connection
41 | def releaseConn(conn: Connection): Unit = {
42 | pool.push(conn)
43 | }
44 |
45 | // Load the driver
46 | private def preGetConn(): Unit = {
47 | // Throttle connection creation
48 | if (conNum < max && !pool.isEmpty) {
49 | println("Jdbc Pool has no connection now, please wait a moments!")
50 | Thread.sleep(2000)
51 | preGetConn()
52 | } else {
53 | Class.forName("com.mysql.jdbc.Driver")
54 | }
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/scala/com/zxl/spark2_2/structured/StructuredStreamingKafka.scala:
--------------------------------------------------------------------------------
1 | package com.zxl.spark2_2.structured
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.streaming.ProcessingTime
5 |
6 | /**
7 | * Structured Streaming reads data from Kafka and stores it in the relational database MySQL
8 | * Structured Streaming currently requires Kafka version 0.10 or above
9 | *
10 | * Created by ZXL on 2017/10/15.
11 | */
12 | object StructuredStreamingKafka {
13 |
14 | case class Weblog(datatime:String,
15 | userid:String,
16 | searchname:String,
17 | retorder:String,
18 | cliorder:String,
19 | cliurl:String)
20 |
21 | def main(args: Array[String]): Unit = {
22 |
23 | val spark = SparkSession.builder()
24 | .master("local[2]")
25 | .appName("streaming").getOrCreate()
26 |
27 | val df = spark
28 | .readStream
29 | .format("kafka")
30 | .option("kafka.bootstrap.servers", "node1:9092")
31 | .option("subscribe", "weblogs")
32 | .load()
33 |
34 | import spark.implicits._
35 | val lines = df.selectExpr("CAST(value AS STRING)").as[String]
36 | val weblog = lines.map(_.split(","))
37 | .map(x => Weblog(x(0), x(1), x(2),x(3),x(4),x(5)))
38 | val titleCount = weblog
39 | .groupBy("searchname").count().toDF("titleName","count")
40 |
41 | val url ="jdbc:mysql://node3:3306/test"
42 | val username="root"
43 | val password="1234"
44 |
45 | val writer = new JDBCSink(url,username,password)
46 | val query = titleCount.writeStream
47 | .foreach(writer)
48 | .outputMode("update")
49 | .trigger(ProcessingTime("5 seconds"))
50 | .start()
51 | query.awaitTermination()
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/streaming/kafka/KafkaManager.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.streaming.kafka
2 |
3 | import kafka.common.TopicAndPartition
4 | import kafka.message.MessageAndMetadata
5 | import kafka.serializer.Decoder
6 | import org.apache.spark.SparkException
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.streaming.StreamingContext
9 | import org.apache.spark.streaming.dstream.InputDStream
10 | import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
11 |
12 | import scala.reflect.ClassTag
13 |
14 | /**
15 | * Manages the Kafka offsets itself
16 | */
17 | class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {
18 |
19 | private val kc = new KafkaCluster(kafkaParams)
20 |
21 | /**
22 | * Create the data stream
23 | */
24 | def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
25 | ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(K, V)] = {
26 | val groupId = kafkaParams.get("group.id").get
27 | // Before reading offsets from ZooKeeper, update them according to the actual situation
28 | setOrUpdateOffsets(topics, groupId)
29 |
30 | // Read the offsets from ZooKeeper and start consuming messages from there
31 | val messages = {
32 | val partitionsE = kc.getPartitions(topics)
33 | if (partitionsE.isLeft)
34 | throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
35 | val partitions = partitionsE.right.get
36 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
37 | if (consumerOffsetsE.isLeft)
38 | throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}")
39 | val consumerOffsets = consumerOffsetsE.right.get
40 | KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
41 | ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
42 | }
43 | messages
44 | }
45 |
46 | /**
47 | * Before creating the data stream, update the consumer offsets according to the actual consumption state
48 | * @param topics
49 | * @param groupId
50 | */
51 | private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
52 | topics.foreach(topic => {
53 | var hasConsumed = true
54 | val partitionsE = kc.getPartitions(Set(topic))
55 | if (partitionsE.isLeft)
56 | throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
57 | val partitions = partitionsE.right.get
58 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
59 | if (consumerOffsetsE.isLeft) hasConsumed = false
60 | if (hasConsumed) { // has consumed before
61 | /**
62 | * If the streaming job throws kafka.common.OffsetOutOfRangeException,
63 | * the offsets saved in ZooKeeper are stale, i.e. Kafka's retention policy has already deleted the log segments containing them.
64 | * In that case, simply compare the consumerOffsets in ZooKeeper with the earliestLeaderOffsets:
65 | * if consumerOffsets is smaller than earliestLeaderOffsets, the consumerOffsets are stale,
66 | * so update consumerOffsets to earliestLeaderOffsets.
67 | */
68 | val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
69 | if (earliestLeaderOffsetsE.isLeft)
70 | throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}")
71 | val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get
72 | val consumerOffsets = consumerOffsetsE.right.get
73 |
74 | // Possibly only some partitions have stale consumerOffsets, so only those partitions are updated to earliestLeaderOffsets
75 | var offsets: Map[TopicAndPartition, Long] = Map()
76 | consumerOffsets.foreach({ case(tp, n) =>
77 | val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
78 | if (n < earliestLeaderOffset) {
79 | println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition +
80 | " offsets已经过时,更新为" + earliestLeaderOffset)
81 | offsets += (tp -> earliestLeaderOffset)
82 | }
83 | })
84 | if (!offsets.isEmpty) {
85 | kc.setConsumerOffsets(groupId, offsets)
86 | }
87 | } else { // has not consumed before
88 | val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
89 | var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null
90 | if (reset == Some("smallest")) {
91 | val leaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
92 | if (leaderOffsetsE.isLeft)
93 | throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsE.left.get}")
94 | leaderOffsets = leaderOffsetsE.right.get
95 | } else {
96 | val leaderOffsetsE = kc.getLatestLeaderOffsets(partitions)
97 | if (leaderOffsetsE.isLeft)
98 | throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsE.left.get}")
99 | leaderOffsets = leaderOffsetsE.right.get
100 | }
101 | val offsets = leaderOffsets.map {
102 | case (tp, offset) => (tp, offset.offset)
103 | }
104 | kc.setConsumerOffsets(groupId, offsets)
105 | }
106 | })
107 | }
108 |
109 | /**
110 | * Update the consumer offsets stored in ZooKeeper
111 | * @param rdd
112 | */
113 | def updateZKOffsets(rdd: RDD[(String, String)]) : Unit = {
114 | val groupId = kafkaParams.get("group.id").get
115 | val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
116 |
117 | for (offsets <- offsetsList) {
118 | val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
119 | val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
120 | if (o.isLeft) {
121 | println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
122 | }
123 | }
124 | }
125 | }
126 |
127 |
--------------------------------------------------------------------------------