├── .gitignore
├── README.md
├── pom.xml
└── src
    ├── main
    │   ├── resources
    │   │   └── log4j.properties
    │   └── scala
    │       └── org
    │           └── apache
    │               └── spark
    │                   └── sql
    │                       └── structured
    │                           └── datasource
    │                               ├── C3p0Utils.scala
    │                               ├── MySQLSink.scala
    │                               ├── MySQLSource.scala
    │                               ├── MySQLSourceProvider.scala
    │                               ├── custom
    │                               │   ├── CustomDataSink.scala
    │                               │   ├── CustomDataSource.scala
    │                               │   └── CustomDataSourceProvider.scala
    │                               └── example
    │                                   ├── ConsoleSinkExample.scala
    │                                   ├── FileSinkExample.scala
    │                                   ├── FileSourceExample.scala
    │                                   ├── ForeachSinkExample.scala
    │                                   ├── KafkaSinkExample.scala
    │                                   ├── KafkaSourceExample.scala
    │                                   ├── MemorySinkExample.scala
    │                                   ├── RateSourceExample.scala
    │                                   └── SocketSourceExample.scala
    └── test
        ├── java
        │   └── org
        │       └── apache
        │           └── spark
        │               └── sql
        │                   └── structured
        │                       └── datasource
        │                           └── MySQLSourceTest.scala
        └── resources
            └── log4j.properties
/.gitignore:
--------------------------------------------------------------------------------
1 | .settings
2 | .project
3 | .classpath
4 | target/
5 | logs/
6 | *.iml
7 | .iml*
8 | .idea
9 | .idea*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Structured Streaming: Built-in Data Sources and Custom Data Source Implementation
2 |
3 | > Version notes:
4 | >
5 | > Spark: 2.3/2.4
6 | >
7 | > Code repository: https://github.com/shirukai/spark-structured-datasource.git
8 |
9 | # 1 Built-in Input Sources (Source)
10 |
11 | Official documentation: http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#input-sources
12 |
13 | | Source        | Options | Fault-tolerant | Notes |
14 | | ------------- | ------- | -------------- | ----- |
15 | | File Source   | maxFilesPerTrigger: maximum number of new files to consider per trigger (default: no maximum). latestFirst: whether to process the newest files first, useful when there is a large backlog of files (default: false). fileNameOnly: whether to identify new files by file name only rather than by the full path (default: false). | Fault-tolerant | Supports glob paths, but not multiple comma-separated paths |
16 | | Socket Source | host: host to connect to, required. port: port to connect to, required. | Not fault-tolerant | |
17 | | Rate Source   | rowsPerSecond (e.g. 100, default: 1): how many rows to generate per second. rampUpTime (e.g. 5s, default: 0s): how long to ramp up before generation reaches rowsPerSecond; granularity finer than seconds is truncated to whole seconds. numPartitions (e.g. 10, default: Spark's default parallelism): number of partitions for the generated rows. | Fault-tolerant | |
18 | | Kafka Source  | http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html | Fault-tolerant | |
19 | ## 1.1 File Source
20 |
21 | Reads files written to a directory as a data stream. Supported file formats: text, csv, json, orc, parquet.
22 |
23 | **Example**
24 |
25 | Code location: org.apache.spark.sql.structured.datasource.example
26 |
27 | ```scala
28 | val source = spark
29 | .readStream
30 | // Schema must be specified when creating a streaming source DataFrame.
31 | .schema(schema)
32 | // Maximum number of new files per trigger
33 | .option("maxFilesPerTrigger",100)
34 | // Whether to process the latest files first; default is false
35 | .option("latestFirst",value = true)
36 | // Whether to check only the file name (a file with the same name is not treated as new); default is false
37 | .option("fileNameOnly",value = true)
38 | .csv("*.csv")
39 | ```
40 |
41 | ## 1.2 Socket Source
42 |
43 | Reads UTF-8 text data from a socket. Generally used for testing: use `nc -lk <port>` to send data to the port the socket source listens on.
44 |
45 | **Example**
46 |
47 | Code location: org.apache.spark.sql.structured.datasource.example
48 |
49 | ```scala
50 | val lines = spark.readStream
51 | .format("socket")
52 | .option("host", "localhost")
53 | .option("port", 9090)
54 | .load()
55 | ```
56 |
57 | ## 1.3 Rate Source
58 |
59 | Generates data at a specified number of rows per second. Each output row contains a `timestamp` and a `value`: `timestamp` is a `Timestamp` holding the time the row was emitted, and `value` is a `Long` message count starting from 0 for the first row. This source is intended for testing and benchmarking.
60 |
61 | **Example**
62 |
63 | Code location: org.apache.spark.sql.structured.datasource.example
64 |
65 | ```scala
66 | val rate = spark.readStream
67 | .format("rate")
68 | // Rows generated per second; default is 1
69 | .option("rowsPerSecond", 10)
70 | .option("numPartitions", 10)
71 | // Ramp-up time before the rate reaches rowsPerSecond; default is 0s
72 | .option("rampUpTime", "5s")
73 | .load()
74 | ```
75 |
76 | ## 1.4 Kafka Source
77 |
78 | Official documentation: http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html
79 |
80 | **Example**
81 |
82 | Code location: org.apache.spark.sql.structured.datasource.example
83 |
84 | ```scala
85 | val df = spark
86 | .readStream
87 | .format("kafka")
88 | .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
89 | .option("subscribePattern", "topic.*")
90 | .load()
91 | ```
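The Kafka source delivers `key` and `value` as binary columns. As a follow-up sketch (not part of the original example), they are typically cast to strings before further processing:

```scala
// Cast Kafka's binary key/value to strings and keep a few useful metadata columns.
val messages = df.selectExpr(
  "CAST(key AS STRING)",
  "CAST(value AS STRING)",
  "topic", "partition", "offset")
```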
92 |
93 | # 2 Built-in Output Sinks (Sink)
94 |
95 | | Sink | Supported Output Modes | Options | Fault-tolerant | Notes |
96 | | ----------------- | ---------------------- | ------- | -------------- | ----- |
97 | | File Sink | Append | path: output path (required) | Fault-tolerant (exactly-once) | Supports partitioned writes |
98 | | Kafka Sink | Append, Update, Complete | See the [Kafka Integration Guide](http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html) | Fault-tolerant (at-least-once) | [Kafka Integration Guide](http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html) |
99 | | Foreach Sink | Append, Update, Complete | None | | [Foreach Guide](http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch) |
100 | | ForeachBatch Sink | Append, Update, Complete | None | | [Foreach Guide](http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch) |
101 | | Console Sink | Append, Update, Complete | numRows: number of rows to print per trigger (default: 20). truncate: whether to truncate output that is too long (default: true) | | |
102 | | Memory Sink | Append, Complete | None | | The table name is the query name |
103 |
104 | ## 2.1 File Sink
105 |
106 | Writes results to files; supported formats include parquet, csv, orc, json, etc.
107 |
108 | **Example**
109 |
110 | Code location: org.apache.spark.sql.structured.datasource.example
111 |
112 | ```scala
113 | val fileSink = source.writeStream
114 | .format("parquet")
115 | //.format("csv")
116 | //.format("orc")
117 | // .format("json")
118 | .option("path", "data/sink")
119 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
120 | .start()
121 | ```
122 |
123 | ## 2.2 Console Sink
124 |
125 | Writes results to the console.
126 |
127 | **Example**
128 |
129 | Code location: org.apache.spark.sql.structured.datasource.example
130 |
131 | ```scala
132 | val consoleSink = source.writeStream
133 | .format("console")
134 | // Whether to truncate the displayed output
135 | .option("truncate", value = false)
136 | // Number of rows to display
137 | .option("numRows", 30)
138 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
139 | .start()
140 |
141 | ```
142 |
143 | ## 2.3 Memory Sink
144 |
145 | Writes results to an in-memory table; the in-memory table name must be specified and can then be queried with SQL.
146 |
147 | **Example**
148 |
149 | Code location: org.apache.spark.sql.structured.datasource.example
150 |
151 | ````scala
152 |
153 | val memorySink = source.writeStream
154 | .format("memory")
155 | .queryName("memorySinkTable")
156 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
157 | .start()
158 |
159 |
160 | new Thread(new Runnable {
161 | override def run(): Unit = {
162 | while (true) {
163 | spark.sql("select * from memorySinkTable").show(false)
164 | Thread.sleep(1000)
165 | }
166 | }
167 | }).start()
168 | memorySink.awaitTermination()
169 | ````
170 |
171 | ## 2.4 Kafka Sink
172 |
173 | Writes results to Kafka; the DataFrame must be reshaped into key and value columns, or topic, key, and value columns.
174 |
175 | **Example**
176 |
177 | Code location: org.apache.spark.sql.structured.datasource.example
178 |
179 | ```scala
180 | import org.apache.spark.sql.functions._
181 | import spark.implicits._
182 | val kafkaSink = source.select(array(to_json(struct("*"))).as("value").cast(StringType),
183 | $"timestamp".as("key").cast(StringType)).writeStream
184 | .format("kafka")
185 | .option("kafka.bootstrap.servers", "localhost:9092")
186 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
187 | .option("topic", "hiacloud-ts-dev")
188 | .start()
189 | ```
190 |
191 | ## 2.5 ForeachBatch Sink (2.4+)
192 |
193 | Suited to applying the same write logic to an entire micro-batch. The method receives the batch's DataFrame together with the batchId. It is available from Spark 2.4 onwards and only supports micro-batch mode.
194 |
195 | 
196 |
197 | **Example**
198 |
199 | Code location: org.apache.spark.sql.structured.datasource.example
200 |
201 | ```scala
202 | val foreachBatchSink = source.writeStream.foreachBatch((batchData: DataFrame, batchId) => {
203 | batchData.show(false)
204 | }).start()
205 | ```
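Building on this, a common pattern is to reuse Spark's batch writers inside foreachBatch. The sketch below (the JDBC URL, table, and credentials are placeholders) writes each micro-batch to MySQL through the batch JDBC writer:

```scala
import org.apache.spark.sql.DataFrame

// Sketch: write every micro-batch out through Spark's batch JDBC writer.
// The JDBC URL, table and credentials below are placeholders.
val jdbcForeachBatchSink = source.writeStream
  .foreachBatch { (batchData: DataFrame, batchId: Long) =>
    batchData.write
      .format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/spark-source?useSSL=false")
      .option("dbtable", "demo")
      .option("user", "root")
      .option("password", "password")
      .mode("append")
      .save()
  }
  .start()
```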
206 |
207 | ## 2.6 Foreach Sink
208 |
209 | Foreach handles one record at a time: extend ForeachWriter[Row] and implement open(), process(), and close(). In open() we can acquire a resource connection, such as a MySQL connection; in process() we receive a single record, process it, and send it through the connection acquired in open(); in close() we release the connection. Note that foreach works per partition: open() and close() are called once per partition, while process() is called for every record.
210 |
211 | **Example**
212 |
213 | Code location: org.apache.spark.sql.structured.datasource.example
214 |
215 | ```scala
216 | val foreachSink = source.writeStream
217 | .foreach(new ForeachWriter[Row] {
218 | override def open(partitionId: Long, version: Long): Boolean = {
219 | println(s"partitionId=$partitionId,version=$version")
220 | true
221 |
222 | }
223 |
224 | override def process(value: Row): Unit = {
225 | println(value)
226 | }
227 |
228 | override def close(errorOrNull: Throwable): Unit = {
229 | println("close")
230 | }
231 | })
232 | .start()
233 |
234 | ```
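As a sketch of the MySQL pattern described above (connection in open(), insert in process(), cleanup in close()): the JDBC URL, credentials, and the hypothetical table `demo(value VARCHAR)` below are placeholders, not part of the original example.

```scala
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.spark.sql.{ForeachWriter, Row}

// Minimal sketch: one JDBC connection per partition, one INSERT per record.
val mysqlForeachSink = source.writeStream
  .foreach(new ForeachWriter[Row] {
    var conn: Connection = _
    var stmt: PreparedStatement = _

    override def open(partitionId: Long, version: Long): Boolean = {
      // Acquire the resource connection once per partition
      conn = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/spark-source?useSSL=false", "root", "password")
      stmt = conn.prepareStatement("INSERT INTO demo(value) VALUES (?)")
      true
    }

    override def process(value: Row): Unit = {
      // Called for every record in the partition
      stmt.setString(1, value.toString)
      stmt.executeUpdate()
    }

    override def close(errorOrNull: Throwable): Unit = {
      // Release the connection once per partition
      if (stmt != null) stmt.close()
      if (conn != null) conn.close()
    }
  })
  .start()
```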
235 |
236 | # 3 Custom Input Source
237 |
238 | In some scenarios we need a custom data source, for example when, alongside a Kafka source, business data has to be loaded dynamically from a cache or an HTTP request, or data has to be read from some other system entirely; all of these can follow the same contract. Implementing a custom input source takes the following steps:
239 |
240 | Step 1: Create a custom Provider class extending DataSourceRegister and StreamSourceProvider
241 |
242 | Step 2: Override shortName from DataSourceRegister, and createSource and sourceSchema from StreamSourceProvider
243 |
244 | Step 3: Create a custom Source class extending Source
245 |
246 | Step 4: Override Source's schema method to specify the input source's schema
247 |
248 | Step 5: Override Source's getOffset method to monitor the stream for new data
249 |
250 | Step 6: Override Source's getBatch method to fetch the data
251 |
252 | Step 7: Override Source's stop method to release resources
253 |
254 | ## 3.1 Create the CustomDataSourceProvider class
255 |
256 | ### 3.1.1 Extend DataSourceRegister and StreamSourceProvider
257 |
258 | A custom DataSourceProvider must extend DataSourceRegister and StreamSourceProvider, both located in the org.apache.spark.sql.sources package, as shown below:
259 |
260 | ```scala
261 | class CustomDataSourceProvider extends DataSourceRegister
262 | with StreamSourceProvider
263 | with Logging {
264 | //Override some functions ……
265 | }
266 | ```
267 |
268 | ### 3.1.2 Override DataSourceRegister's shortName method
269 |
270 | This method specifies the data source's name, used to register the source with Spark; for example, the shortName of Spark's built-in sources includes kafka, socket, and rate. It returns a string, as shown below:
273 |
274 | ```scala
275 | /**
276 |   * The descriptive name of the data source, e.g. kafka, socket
277 |   *
278 |   * @return the shortName string
279 |   */
280 | override def shortName(): String = "custom"
281 | ```
282 |
283 | ### 3.1.3 Override StreamSourceProvider's sourceSchema method
284 |
285 | This method defines the data source's schema; it can use the schema passed in by the user or build one dynamically from the parameters. It returns a tuple (shortName, schema), as shown below:
286 |
287 | ```scala
288 | /**
289 |   * Define the data source's schema
290 |   *
291 |   * @param sqlContext   Spark SQL context
292 |   * @param schema       schema passed in via the .schema() method
293 |   * @param providerName name of the Provider (package + class name)
294 |   * @param parameters   parameters passed in via the .option() method
295 |   * @return a tuple (shortName, schema)
296 |   */
297 | override def sourceSchema(sqlContext: SQLContext,
298 | schema: Option[StructType],
299 | providerName: String,
300 | parameters: Map[String, String]): (String, StructType) = (shortName(),schema.get)
301 | ```
302 |
303 | ### 3.1.4 Override StreamSourceProvider's createSource method
304 |
305 | This method instantiates our custom DataSource from the parameters passed in; it is the main entry point of the custom Source.
306 |
307 | ```scala
308 | /**
309 |   * Create the input source
310 |   *
311 |   * @param sqlContext   Spark SQL context
312 |   * @param metadataPath metadata path
313 |   * @param schema       schema passed in via the .schema() method
314 |   * @param providerName name of the Provider (package + class name)
315 |   * @param parameters   parameters passed in via the .option() method
316 |   * @return the custom Source, which must implement the Source interface
317 |   */
318 |
319 | override def createSource(sqlContext: SQLContext,
320 | metadataPath: String,
321 | schema: Option[StructType],
322 | providerName: String,
323 | parameters: Map[String, String]): Source = new CustomDataSource(sqlContext,parameters,schema)
324 | ```
325 |
326 | ### 3.1.5 Complete CustomDataSourceProvider.scala
327 |
328 | ```scala
329 | package org.apache.spark.sql.structured.datasource.custom
330 |
331 | import org.apache.spark.internal.Logging
332 | import org.apache.spark.sql.SQLContext
333 | import org.apache.spark.sql.execution.streaming.{Sink, Source}
334 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider}
335 | import org.apache.spark.sql.streaming.OutputMode
336 | import org.apache.spark.sql.types.StructType
337 |
338 | /**
339 |   * @author : shirukai
340 |   * @date : 2019-01-25 17:49
341 |   * A custom Structured Streaming data source.
342 |   *
343 |   * (1) Extend DataSourceRegister
344 |   *     Override shortName to register the component with Spark.
345 |   *
346 |   * (2) Extend StreamSourceProvider
347 |   *     Override createSource and sourceSchema to create the data input source.
348 |   *
349 |   * (3) Extend StreamSinkProvider
350 |   *     Override createSink to create the data output sink.
351 |   *
352 |   *
353 |   */
354 | class CustomDataSourceProvider extends DataSourceRegister
355 | with StreamSourceProvider
356 | with StreamSinkProvider
357 | with Logging {
358 |
359 |
360 | /**
361 |   * The descriptive name of the data source, e.g. kafka, socket
362 |   *
363 |   * @return the shortName string
364 |   */
365 | override def shortName(): String = "custom"
366 |
367 |
368 | /**
369 |   * Define the data source's schema
370 |   *
371 |   * @param sqlContext   Spark SQL context
372 |   * @param schema       schema passed in via the .schema() method
373 |   * @param providerName name of the Provider (package + class name)
374 |   * @param parameters   parameters passed in via the .option() method
375 |   * @return a tuple (shortName, schema)
376 |   */
377 | override def sourceSchema(sqlContext: SQLContext,
378 | schema: Option[StructType],
379 | providerName: String,
380 | parameters: Map[String, String]): (String, StructType) = (shortName(),schema.get)
381 |
382 | /**
383 |   * Create the input source
384 |   *
385 |   * @param sqlContext   Spark SQL context
386 |   * @param metadataPath metadata path
387 |   * @param schema       schema passed in via the .schema() method
388 |   * @param providerName name of the Provider (package + class name)
389 |   * @param parameters   parameters passed in via the .option() method
390 |   * @return the custom Source, which must implement the Source interface
391 |   */
392 |
393 | override def createSource(sqlContext: SQLContext,
394 | metadataPath: String,
395 | schema: Option[StructType],
396 | providerName: String,
397 | parameters: Map[String, String]): Source = new CustomDataSource(sqlContext,parameters,schema)
398 |
399 |
400 | /**
401 |   * Create the output sink
402 |   *
403 |   * @param sqlContext       Spark SQL context
404 |   * @param parameters       parameters passed in via the .option() method
405 |   * @param partitionColumns partition column names
406 |   * @param outputMode       output mode
407 |   * @return
408 |   */
409 | override def createSink(sqlContext: SQLContext,
410 | parameters: Map[String, String],
411 | partitionColumns: Seq[String],
412 | outputMode: OutputMode): Sink = new CustomDataSink(sqlContext,parameters,outputMode)
413 | }
414 |
415 | ```
416 |
417 | ## 3.2 Create the CustomDataSource class
418 |
419 | ### 3.2.1 Extend Source to create the CustomDataSource class
420 |
421 | A custom DataSource must extend Source, located in the org.apache.spark.sql.execution.streaming package, as shown below:
422 |
423 | ```scala
424 | class CustomDataSource(sqlContext: SQLContext,
425 | parameters: Map[String, String],
426 | schemaOption: Option[StructType]) extends Source
427 | with Logging {
428 | //Override some functions ……
429 | }
430 | ```
431 |
432 | ### 3.2.2 Override Source's schema method
433 |
434 | Specifies the data source's schema. It must match the schema returned by sourceSchema in the Provider, otherwise an exception is thrown.
435 |
436 | ```scala
437 | /**
438 |   * Specify the data source's schema; it must match the schema specified by sourceSchema in the Provider, otherwise an exception is thrown.
439 |   * Trigger: called when the data source is created.
440 |   *
441 |   * @return schema
442 |   */
443 | override def schema: StructType = schemaOption.get
444 | ```
445 |
446 | ### 3.2.3 Override Source's getOffset method
447 |
448 | Spark polls this method continuously to monitor the stream for new data; once the offset changes, the getBatch method is triggered to fetch the data.
449 |
450 | ```scala
451 | /**
452 |   * Get the offset, used to monitor changes in the data.
453 |   * Trigger: polled continuously.
454 |   * Implementation notes:
455 |   * (1) Offset implementation:
456 |   *     The return type is Option[Offset], so we must provide an Offset implementation.
457 |   *     We can extend org.apache.spark.sql.sources.v2.reader.streaming.Offset, which essentially just stores a JSON string.
458 |   *
459 |   * (2) JSON conversion:
460 |   *     Because an Offset is backed by a JSON string, the collection or case class holding our offsets must be converted to JSON.
461 |   *     Spark uses org.json4s.jackson to convert case classes and collections (Map, List, Seq, Set, etc.) to and from JSON strings.
462 |   *
463 |   * @return Offset
464 |   */
465 | override def getOffset: Option[Offset] = ???
466 | ```
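For reference, a minimal JSON-backed Offset along these lines (a sketch that mirrors the MySQLSourceOffset used in the MySQL example later in this document) could look like:

```scala
import org.apache.spark.sql.execution.streaming.Offset
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

// A minimal offset: a map of table name -> row count, serialized to JSON.
case class SimpleOffset(offsets: Map[String, Long]) extends Offset {
  implicit val formats = Serialization.formats(NoTypeHints)
  override def json(): String = Serialization.write(offsets)
}
```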
467 |
468 | ### 3.2.4 Override Source's getBatch method
469 |
470 | Spark calls this method to fetch data. When getOffset detects a change, this method is triggered with the previous trigger's end offset as the current batch's start offset and the new offset as the end offset.
471 |
472 | ```scala
473 | /**
474 |   * Fetch the data.
475 |   *
476 |   * @param start end offset of the previous batch
477 |   * @param end   new offset obtained from getOffset
478 |   * Trigger: called when the offset returned by the polled getOffset method changes.
479 |   *
480 |   * Implementation notes:
481 |   * (1) Creating the DataFrame:
482 |   *     generate an RDD, then build the DataFrame from it.
483 |   *     RDD: sqlContext.sparkContext.parallelize(rows.toSeq)
484 |   *     DataFrame: sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
485 |   * @return DataFrame
486 |   */
487 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = ???
488 | ```
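A minimal sketch of the pattern described in the comment above. `fetchRows` is a hypothetical helper (not part of the original code) standing in for whatever logic reads the new records between `start` and `end` from your source system:

```scala
override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
  // `fetchRows` is a hypothetical helper: read whatever arrived between `start`
  // and `end` and return it as org.apache.spark.sql.catalyst.InternalRow objects.
  val rows: Seq[org.apache.spark.sql.catalyst.InternalRow] = fetchRows(start, end)
  val rdd = sqlContext.sparkContext.parallelize(rows)
  // isStreaming = true marks the result as part of a streaming query
  sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
}
```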
489 |
490 | ### 3.2.5 Override Source's stop method
491 |
492 | Used to close or stop any resources or processes that need it.
493 |
494 | ```scala
495 | /**
496 |   * Release resources.
497 |   * Close anything that needs to be closed here, e.g. a MySQL database connection.
498 |   */
499 | override def stop(): Unit = ???
500 | ```
501 |
502 | ### 3.2.6 Complete CustomDataSource.scala
503 |
504 | ```scala
505 | package org.apache.spark.sql.structured.datasource.custom
506 |
507 | import org.apache.spark.internal.Logging
508 | import org.apache.spark.sql.execution.streaming.{Offset, Source}
509 | import org.apache.spark.sql.types.StructType
510 | import org.apache.spark.sql.{DataFrame, SQLContext}
511 |
512 | /**
513 |   * @author : shirukai
514 |   * @date : 2019-01-25 18:03
515 |   * A custom data input source: must implement the Source interface.
516 |   * Approach:
517 |   * (1) Override schema to specify the input source's schema; it must match the schema specified in the Provider.
518 |   * (2) Override getOffset to report the data offset; this method is polled continuously.
519 |   * (3) Override getBatch to fetch the data; it is triggered after the offset changes.
520 |   * (4) Override stop to release resources.
521 |   *
522 |   */
523 | class CustomDataSource(sqlContext: SQLContext,
524 | parameters: Map[String, String],
525 | schemaOption: Option[StructType]) extends Source
526 | with Logging {
527 |
528 | /**
529 |   * Specify the data source's schema; it must match the schema specified by sourceSchema in the Provider, otherwise an exception is thrown.
530 |   * Trigger: called when the data source is created.
531 |   *
532 |   * @return schema
533 |   */
534 | override def schema: StructType = schemaOption.get
535 |
536 | /**
537 |   * Get the offset, used to monitor changes in the data.
538 |   * Trigger: polled continuously.
539 |   * Implementation notes:
540 |   * (1) Offset implementation:
541 |   *     The return type is Option[Offset], so we must provide an Offset implementation.
542 |   *     We can extend org.apache.spark.sql.sources.v2.reader.streaming.Offset, which essentially just stores a JSON string.
543 |   *
544 |   * (2) JSON conversion:
545 |   *     Because an Offset is backed by a JSON string, the collection or case class holding our offsets must be converted to JSON.
546 |   *     Spark uses org.json4s.jackson to convert case classes and collections (Map, List, Seq, Set, etc.) to and from JSON strings.
547 |   *
548 |   * @return Offset
549 |   */
550 | override def getOffset: Option[Offset] = ???
551 |
552 | /**
553 |   * Fetch the data.
554 |   *
555 |   * @param start end offset of the previous batch
556 |   * @param end   new offset obtained from getOffset
557 |   * Trigger: called when the offset returned by the polled getOffset method changes.
558 |   *
559 |   * Implementation notes:
560 |   * (1) Creating the DataFrame:
561 |   *     generate an RDD, then build the DataFrame from it.
562 |   *     RDD: sqlContext.sparkContext.parallelize(rows.toSeq)
563 |   *     DataFrame: sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
564 |   * @return DataFrame
565 |   */
566 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = ???
567 |
568 | /**
569 |   * Release resources.
570 |   * Close anything that needs to be closed here, e.g. a MySQL database connection.
571 |   */
572 | override def stop(): Unit = ???
573 | }
574 | ```
575 |
576 | ## 3.3 Using the custom DataSource
577 |
578 | A custom DataSource is used exactly like a built-in one: just specify our Provider's class path in format, e.g.
579 |
580 | ```scala
581 | val source = spark
582 | .readStream
583 | .format("org.apache.spark.sql.kafka010.CustomSourceProvider")
584 | .options(options)
585 | .schema(schema)
586 | .load()
587 | ```
588 |
589 | ## 3.4 Implementing a custom MySQL data source
590 |
591 | This example only demonstrates how to build a custom data source; it is not tied to any real business scenario.
592 |
593 | ### 3.4.1 Create MySQLSourceProvider.scala
594 |
595 | ```scala
596 | package org.apache.spark.sql.structured.datasource
597 |
598 | import org.apache.spark.internal.Logging
599 | import org.apache.spark.sql.SQLContext
600 | import org.apache.spark.sql.execution.streaming.{Sink, Source}
601 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider}
602 | import org.apache.spark.sql.streaming.OutputMode
603 | import org.apache.spark.sql.types.StructType
604 |
605 | /**
606 |   * @author : shirukai
607 |   * @date : 2019-01-25 09:10
608 |   * A custom MySQL data source
609 |   */
610 | class MySQLSourceProvider extends DataSourceRegister
611 | with StreamSourceProvider
612 | with StreamSinkProvider
613 | with Logging {
614 | /**
615 |   * The descriptive name of the data source, e.g. kafka, socket
616 |   *
617 |   * @return the shortName string
618 |   */
619 | override def shortName(): String = "mysql"
620 |
621 |
622 | /**
623 |   * Define the data source's schema
624 |   *
625 |   * @param sqlContext   Spark SQL context
626 |   * @param schema       schema passed in via the .schema() method
627 |   * @param providerName name of the Provider (package + class name)
628 |   * @param parameters   parameters passed in via the .option() method
629 |   * @return a tuple (shortName, schema)
630 |   */
631 | override def sourceSchema(
632 | sqlContext: SQLContext,
633 | schema: Option[StructType],
634 | providerName: String,
635 | parameters: Map[String, String]): (String, StructType) = {
636 | (providerName, schema.get)
637 | }
638 |
639 | /**
640 |   * Create the input source
641 |   *
642 |   * @param sqlContext   Spark SQL context
643 |   * @param metadataPath metadata path
644 |   * @param schema       schema passed in via the .schema() method
645 |   * @param providerName name of the Provider (package + class name)
646 |   * @param parameters   parameters passed in via the .option() method
647 |   * @return the custom Source, which must implement the Source interface
648 |   */
649 | override def createSource(
650 | sqlContext: SQLContext,
651 | metadataPath: String, schema: Option[StructType],
652 | providerName: String, parameters: Map[String, String]): Source = new MySQLSource(sqlContext, parameters, schema)
653 |
654 | /**
655 |   * Create the output sink
656 |   *
657 |   * @param sqlContext       Spark SQL context
658 |   * @param parameters       parameters passed in via the .option() method
659 |   * @param partitionColumns partition column names
660 |   * @param outputMode       output mode
661 |   * @return
662 |   */
663 | override def createSink(
664 | sqlContext: SQLContext,
665 | parameters: Map[String, String],
666 | partitionColumns: Seq[String], outputMode: OutputMode): Sink = new MySQLSink(sqlContext: SQLContext,parameters, outputMode)
667 | }
668 | ```
669 |
670 | ### 3.4.2 Create MySQLSource.scala
671 |
672 | ```scala
673 | package org.apache.spark.sql.structured.datasource
674 |
675 | import java.sql.Connection
676 |
677 | import org.apache.spark.executor.InputMetrics
678 | import org.apache.spark.internal.Logging
679 | import org.apache.spark.sql.catalyst.InternalRow
680 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
681 | import org.apache.spark.sql.execution.streaming.{Offset, Source}
682 | import org.apache.spark.sql.types.StructType
683 | import org.apache.spark.sql.{DataFrame, SQLContext}
684 | import org.json4s.jackson.Serialization
685 | import org.json4s.{Formats, NoTypeHints}
686 |
687 |
688 | /**
689 | * @author : shirukai
690 | * @date : 2019-01-25 09:41
691 | */
692 | class MySQLSource(sqlContext: SQLContext,
693 | options: Map[String, String],
694 | schemaOption: Option[StructType]) extends Source with Logging {
695 |
696 | lazy val conn: Connection = C3p0Utils.getDataSource(options).getConnection
697 |
698 | val tableName: String = options("tableName")
699 |
700 | var currentOffset: Map[String, Long] = Map[String, Long](tableName -> 0)
701 |
702 | val maxOffsetPerBatch: Option[Long] = Option(100)
703 |
704 | val inputMetrics = new InputMetrics()
705 |
706 | override def schema: StructType = schemaOption.get
707 |
708 | /**
709 |   * Get the offset.
710 |   * Here we monitor changes in the row count of the MySQL table.
711 |   * @return Option[Offset]
712 |   */
713 | override def getOffset: Option[Offset] = {
714 | val latest = getLatestOffset
715 | val offsets = maxOffsetPerBatch match {
716 | case None => MySQLSourceOffset(latest)
717 | case Some(limit) =>
718 | MySQLSourceOffset(rateLimit(limit, currentOffset, latest))
719 | }
720 | Option(offsets)
721 | }
722 |
723 | /**
724 |   * Fetch the data.
725 |   * @param start offset of the previous batch
726 |   * @param end   latest offset
727 |   * @return df
728 |   */
729 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
730 |
731 | var offset: Long = 0
732 | if (start.isDefined) {
733 | offset = offset2Map(start.get)(tableName)
734 | }
735 | val limit = offset2Map(end)(tableName) - offset
736 | val sql = s"SELECT * FROM $tableName limit $limit offset $offset"
737 |
738 | val st = conn.prepareStatement(sql)
739 | val rs = st.executeQuery()
740 | val rows: Iterator[InternalRow] = JdbcUtils.resultSetToSparkInternalRows(rs, schemaOption.get, inputMetrics) // converts the JDBC ResultSet into Spark InternalRows
741 | val rdd = sqlContext.sparkContext.parallelize(rows.toSeq)
742 |
743 | currentOffset = offset2Map(end)
744 |
745 | sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
746 | }
747 |
748 | override def stop(): Unit = {
749 | conn.close()
750 | }
751 |
752 | def rateLimit(limit: Long, currentOffset: Map[String, Long], latestOffset: Map[String, Long]): Map[String, Long] = {
753 | val co = currentOffset(tableName)
754 | val lo = latestOffset(tableName)
755 | if (co + limit > lo) {
756 | Map[String, Long](tableName -> lo)
757 | } else {
758 | Map[String, Long](tableName -> (co + limit))
759 | }
760 | }
761 |
762 | // Get the latest row count (used as the offset)
763 | def getLatestOffset: Map[String, Long] = {
764 | var offset: Long = 0
765 | val sql = s"SELECT COUNT(1) FROM $tableName"
766 | val st = conn.prepareStatement(sql)
767 | val rs = st.executeQuery()
768 | while (rs.next()) {
769 | offset = rs.getLong(1)
770 | }
771 | Map[String, Long](tableName -> offset)
772 | }
773 |
774 | def offset2Map(offset: Offset): Map[String, Long] = {
775 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints)
776 | Serialization.read[Map[String, Long]](offset.json())
777 | }
778 | }
779 |
780 | case class MySQLSourceOffset(offset: Map[String, Long]) extends Offset {
781 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints)
782 |
783 | override def json(): String = Serialization.write(offset)
784 | }
785 | ```
786 |
787 | ### 3.4.3 Test MySQLSource
788 |
789 | ```scala
790 | package org.apache.spark.sql.structured.datasource
791 |
792 | import java.util.UUID
793 | import org.apache.spark.sql.SparkSession
794 | import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
794 |
795 | /**
796 | * @author : shirukai
797 | * @date : 2019-01-25 15:12
798 | */
799 | object MySQLSourceTest {
800 | def main(args: Array[String]): Unit = {
801 | val spark = SparkSession
802 | .builder()
803 | .appName(this.getClass.getSimpleName)
804 | .master("local[2]")
805 | .getOrCreate()
806 | val schema = StructType(List(
807 | StructField("name", StringType),
808 | StructField("creatTime", TimestampType),
809 | StructField("modifyTime", TimestampType)
810 | )
811 | )
812 | val options = Map[String, String](
813 | "driverClass" -> "com.mysql.cj.jdbc.Driver",
814 | "jdbcUrl" -> "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8",
815 | "user" -> "root",
816 | "password" -> "hollysys",
817 | "tableName" -> "model")
818 | val source = spark
819 | .readStream
820 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider")
821 | .options(options)
822 | .schema(schema)
823 | .load()
824 |
825 | import org.apache.spark.sql.functions._
826 | val query = source.writeStream.format("console")
827 | // Whether to truncate the displayed output
828 | .option("truncate", value = false)
829 | // Number of rows to display
830 | .option("numRows", 30)
831 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
832 | .start()
833 | query.awaitTermination()
834 | }
835 | }
836 |
837 | ```
838 |
839 | # 4 Custom Output Sink
840 |
841 | Compared with custom input sources, custom output sinks come up more often in practice, e.g. writing data to a relational database, to HBase, to Redis, and so on. The foreach and (since 2.4) foreachBatch methods provided by Structured Streaming already cover the vast majority of scenarios; with them you can write data almost anywhere. For a more elegant solution, however, we can follow the Spark SQL Sink contract and implement a custom Sink. That takes the following four steps:
842 |
843 | Step 1: Create a custom SinkProvider class extending DataSourceRegister and StreamSinkProvider
844 |
845 | Step 2: Override shortName from DataSourceRegister and createSink from StreamSinkProvider
846 |
847 | Step 3: Create a custom Sink class extending Sink
848 |
849 | Step 4: Override Sink's addBatch method
850 |
851 | ## 4.1 Modify the CustomDataSourceProvider class
852 |
853 | ### 4.1.1 Additionally extend StreamSinkProvider
854 |
855 | Building on the custom input source created above, additionally extend StreamSinkProvider, as shown below:
856 |
857 | ```scala
858 | class CustomDataSourceProvider extends DataSourceRegister
859 | with StreamSourceProvider
860 | with StreamSinkProvider
861 | with Logging {
862 | //Override some functions ……
863 | }
864 | ```
865 |
866 | ### 4.1.2 Override StreamSinkProvider's createSink method
867 |
868 | This method instantiates our custom DataSink from the parameters passed in; it is the main entry point of the custom Sink.
869 |
870 | ```scala
871 | /**
872 |   * Create the output sink
873 |   *
874 |   * @param sqlContext       Spark SQL context
875 |   * @param parameters       parameters passed in via the .option() method
876 |   * @param partitionColumns partition column names
877 |   * @param outputMode       output mode
878 |   * @return
879 |   */
880 | override def createSink(sqlContext: SQLContext,
881 | parameters: Map[String, String],
882 | partitionColumns: Seq[String],
883 | outputMode: OutputMode): Sink = new CustomDataSink(sqlContext,parameters,outputMode)
884 | ```
885 |
886 | ## 4.2 Create the CustomDataSink class
887 |
888 | ### 4.2.1 Extend Sink to create the CustomDataSink class
889 |
890 | A custom DataSink must extend Sink, located in the org.apache.spark.sql.execution.streaming package, as shown below:
891 |
892 | ```scala
893 | class CustomDataSink(sqlContext: SQLContext,
894 | parameters: Map[String, String],
895 | outputMode: OutputMode) extends Sink with Logging {
896 | // Override some functions
897 | }
898 | ```
899 |
900 | ### 4.2.2 Override Sink's addBatch method
901 |
902 | This method is triggered whenever a computation happens, receiving a batchId and a DataFrame. Once we have the DataFrame there are three ways to write it out: first, through a built-in Spark SQL sink such as the JSON, CSV, Text, Parquet, or JDBC data sources; second, through the DataFrame's foreachPartition; third, through a custom Spark SQL output source.
903 |
904 | ```scala
905 | /**
906 |   * Add a batch, i.e. write the data out.
907 |   *
908 |   * @param batchId batchId
909 |   * @param data    DataFrame
910 |   * Trigger: called whenever a computation happens, receiving the DataFrame to be written out.
911 |   * Implementation notes:
912 |   * 1. Ways to write the data:
913 |   *    (1) Through a built-in Spark SQL data source:
914 |   *        once we have the DataFrame we can write it out via a built-in source,
915 |   *        e.g. the JSON, CSV, Text, Parquet or JDBC data sources.
916 |   *    (2) Through a custom Spark SQL data source.
917 |   *    (3) Through foreachPartition.
918 |   */
919 | override def addBatch(batchId: Long, data: DataFrame): Unit = ???
920 | ```
921 |
922 | **Note**:
923 |
924 | With the first approach, keep in mind that the DataFrame we receive is a streaming DataFrame (isStreaming = true). Looking at KafkaSink, shown below, it first runs the query via DataFrame.queryExecution and then, inside write, converts it to an RDD and writes through the RDD's foreachPartition. Following the same idea, we can take that RDD and the schema and rebuild a batch DataFrame with sqlContext.internalCreateDataFrame(rdd, data.schema); this is what MySQLSink does.
925 |
926 | ```scala
927 | override def addBatch(batchId: Long, data: DataFrame): Unit = {
928 | if (batchId <= latestBatchId) {
929 | logInfo(s"Skipping already committed batch $batchId")
930 | } else {
931 | KafkaWriter.write(sqlContext.sparkSession,
932 | data.queryExecution, executorKafkaParams, topic)
933 | latestBatchId = batchId
934 | }
935 | }
936 |
937 | def write(
938 | sparkSession: SparkSession,
939 | queryExecution: QueryExecution,
940 | kafkaParameters: ju.Map[String, Object],
941 | topic: Option[String] = None): Unit = {
942 | val schema = queryExecution.analyzed.output
943 | validateQuery(schema, kafkaParameters, topic)
944 | queryExecution.toRdd.foreachPartition { iter =>
945 | val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic)
946 | Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
947 | finallyBlock = writeTask.close())
948 | }
949 | }
950 | ```
951 |
952 |
953 |
954 | ## 4.3 Using the custom DataSink
955 |
956 | Using the custom DataSink is the same as using the custom DataSource: just specify the Provider's class path in format.
957 |
958 | ```scala
959 | val query = source.groupBy("creatTime").agg(collect_list("name")).writeStream
960 | .outputMode("update")
961 | .format("org.apache.spark.sql.kafka010.CustomDataSourceProvider")
962 | .option(options)
963 | .start()
964 | query.awaitTermination()
965 | ```
966 |
967 | ## 4.4 Implementing a custom MySQL output sink
968 |
969 | ### 4.4.1 Modify MySQLSourceProvider.scala
970 |
971 | When implementing the custom MySQL input source above we already created the MySQLSourceProvider class; now we additionally extend StreamSinkProvider and override its createSink method, as shown below:
972 |
973 | ```scala
974 | package org.apache.spark.sql.structured.datasource
975 |
976 | import org.apache.spark.internal.Logging
977 | import org.apache.spark.sql.SQLContext
978 | import org.apache.spark.sql.execution.streaming.{Sink, Source}
980 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider}
981 | import org.apache.spark.sql.streaming.OutputMode
982 | import org.apache.spark.sql.types.StructType
983 |
984 | /**
985 |   * @author : shirukai
986 |   * @date : 2019-01-25 09:10
987 |   * A custom MySQL data source
988 |   */
989 | class MySQLSourceProvider extends DataSourceRegister
990 | with StreamSourceProvider
991 | with StreamSinkProvider
992 | with Logging {
993 |
994 | // ... input source methods omitted ...
995 |
996 | /**
997 |   * Create the output sink
998 |   *
999 |   * @param sqlContext       Spark SQL context
1000 |   * @param parameters       parameters passed in via the .option() method
1001 |   * @param partitionColumns partition column names
1002 |   * @param outputMode       output mode
1003 |   * @return
1004 |   */
1005 | override def createSink(
1006 | sqlContext: SQLContext,
1007 | parameters: Map[String, String],
1008 | partitionColumns: Seq[String], outputMode: OutputMode): Sink = new MySQLSink(sqlContext: SQLContext,parameters, outputMode)
1009 | }
1010 |
1011 | ```
1012 |
1013 | ### 4.4.2 Create MySQLSink.scala
1014 |
1015 | ```scala
1016 | package org.apache.spark.sql.structured.datasource
1017 |
1018 | import org.apache.spark.internal.Logging
1019 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
1020 | import org.apache.spark.sql.execution.streaming.Sink
1021 | import org.apache.spark.sql.streaming.OutputMode
1022 |
1023 | /**
1024 | * @author : shirukai
1025 | * @date : 2019-01-25 17:35
1026 | */
1027 | class MySQLSink(sqlContext: SQLContext,parameters: Map[String, String], outputMode: OutputMode) extends Sink with Logging {
1028 | override def addBatch(batchId: Long, data: DataFrame): Unit = {
1029 | val query = data.queryExecution
1030 | val rdd = query.toRdd
1031 | val df = sqlContext.internalCreateDataFrame(rdd, data.schema)
1032 | df.show(false)
1033 | df.write.format("jdbc").options(parameters).mode(SaveMode.Append).save()
1034 | }
1035 | }
1036 | ```
1037 |
1038 | ### 4.4.3 Test MySQLSink
1039 |
1040 | ```scala
1041 | package org.apache.spark.sql.structured.datasource
1042 |
1043 | import org.apache.spark.sql.SparkSession
1044 | import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
1045 |
1046 | /**
1047 |   * @author : shirukai
1048 |   * @date : 2019-01-29 09:57
1049 |   * Test the custom MySQL sink
1050 |   */
1051 | object MySQLSourceTest {
1052 | def main(args: Array[String]): Unit = {
1053 | val spark = SparkSession
1054 | .builder()
1055 | .appName(this.getClass.getSimpleName)
1056 | .master("local[2]")
1057 | .getOrCreate()
1058 | val schema = StructType(List(
1059 | StructField("name", StringType),
1060 | StructField("creatTime", TimestampType),
1061 | StructField("modifyTime", TimestampType)
1062 | )
1063 | )
1064 | val options = Map[String, String](
1065 | "driverClass" -> "com.mysql.cj.jdbc.Driver",
1066 | "jdbcUrl" -> "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8",
1067 | "user" -> "root",
1068 | "password" -> "hollysys",
1069 | "tableName" -> "model")
1070 | val source = spark
1071 | .readStream
1072 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider")
1073 | .options(options)
1074 | .schema(schema)
1075 | .load()
1076 |
1077 | import org.apache.spark.sql.functions._
1078 | val query = source.groupBy("creatTime").agg(collect_list("name").cast(StringType).as("names")).writeStream
1079 | .outputMode("update")
1080 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider")
1081 | .option("checkpointLocation", "/tmp/MySQLSourceProvider11")
1082 | .option("user","root")
1083 | .option("password","hollysys")
1084 | .option("dbtable","test")
1085 | .option("url","jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8")
1086 | .start()
1087 |
1088 | query.awaitTermination()
1089 | }
1090 | }
1091 | ```
1092 |
1093 | # 5 Summary
1094 |
1095 | Together with the official documentation, these notes cover the input sources supported by Structured Streaming: File Source, Socket Source, Rate Source, and Kafka Source. In practice the Kafka and File sources are the ones we normally use, while the Socket and Rate sources are mostly for testing. For input sources there is no shortcut: the only way to customize is to implement Source yourself. For output, the foreach and foreachBatch sinks provided by Structured Streaming already cover most scenarios, so implementing a custom Sink is rarely necessary. Notes on custom Spark SQL data sources and custom Spark Streaming data sources will be written up later.
1096 |
1097 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 |
  7 |     <groupId>spark.demo</groupId>
  8 |     <artifactId>spark.structured.datasource</artifactId>
  9 |     <version>1.0</version>
 10 |
 11 |     <properties>
 12 |         <spark.version>2.3.0</spark.version>
 13 |     </properties>
 14 |
 15 |     <dependencies>
 16 |         <dependency>
 17 |             <groupId>org.apache.spark</groupId>
 18 |             <artifactId>spark-sql_2.11</artifactId>
 19 |             <version>${spark.version}</version>
 20 |         </dependency>
 21 |         <dependency>
 22 |             <groupId>org.apache.spark</groupId>
 23 |             <artifactId>spark-streaming_2.11</artifactId>
 24 |             <version>${spark.version}</version>
 25 |         </dependency>
 26 |         <dependency>
 27 |             <groupId>org.apache.spark</groupId>
 28 |             <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
 29 |             <version>${spark.version}</version>
 30 |         </dependency>
 31 |         <dependency>
 32 |             <groupId>org.apache.httpcomponents</groupId>
 33 |             <artifactId>fluent-hc</artifactId>
 34 |             <version>4.5.6</version>
 35 |         </dependency>
 36 |         <dependency>
 37 |             <groupId>com.alibaba</groupId>
 38 |             <artifactId>fastjson</artifactId>
 39 |             <version>1.2.47</version>
 40 |         </dependency>
 41 |         <dependency>
 42 |             <groupId>com.mchange</groupId>
 43 |             <artifactId>c3p0</artifactId>
 44 |             <version>0.9.5.2</version>
 45 |         </dependency>
 46 |         <dependency>
 47 |             <groupId>mysql</groupId>
 48 |             <artifactId>mysql-connector-java</artifactId>
 49 |             <version>8.0.12</version>
 50 |         </dependency>
 51 |     </dependencies>
 52 |
 53 |     <build>
 54 |         <plugins>
 55 |             <plugin>
 56 |                 <groupId>org.apache.maven.plugins</groupId>
 57 |                 <artifactId>maven-shade-plugin</artifactId>
 58 |                 <version>2.4.3</version>
 59 |                 <executions>
 60 |                     <execution>
 61 |                         <phase>package</phase>
 62 |                         <goals>
 63 |                             <goal>shade</goal>
 64 |                         </goals>
 65 |                     </execution>
 66 |                 </executions>
 67 |             </plugin>
 68 |             <plugin>
 69 |                 <groupId>org.apache.maven.plugins</groupId>
 70 |                 <artifactId>maven-compiler-plugin</artifactId>
 71 |                 <configuration>
 72 |                     <source>1.8</source>
 73 |                     <target>1.8</target>
 74 |                 </configuration>
 75 |             </plugin>
 76 |             <plugin>
 77 |                 <groupId>org.scala-tools</groupId>
 78 |                 <artifactId>maven-scala-plugin</artifactId>
 79 |                 <version>2.15.2</version>
 80 |                 <executions>
 81 |                     <execution>
 82 |                         <id>scala-compile-first</id>
 83 |                         <goals>
 84 |                             <goal>compile</goal>
 85 |                         </goals>
 86 |                         <configuration>
 87 |                             <includes>
 88 |                                 <include>**/*.scala</include>
 89 |                             </includes>
 90 |                         </configuration>
 91 |                     </execution>
 92 |                     <execution>
 93 |                         <id>scala-test-compile</id>
 94 |                         <goals>
 95 |                             <goal>testCompile</goal>
 96 |                         </goals>
 97 |                     </execution>
 98 |                 </executions>
 99 |             </plugin>
100 |         </plugins>
101 |     </build>
102 | </project>
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=ERROR,console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
24 |
25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
26 | # log level for this class is used to overwrite the root logger's log level, so that
27 | # the user can have different defaults for the shell and regular Spark apps.
28 | log4j.logger.org.apache.spark.repl.Main=WARN
29 |
30 | # Settings to quiet third party logs that are too verbose
31 | log4j.logger.org.spark_project.jetty=WARN
32 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
35 | log4j.logger.org.apache.parquet=ERROR
36 | log4j.logger.parquet=ERROR
37 |
38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
41 |
42 |
43 | log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender
44 | log4j.appender.flume.Hostname = localhost
45 | log4j.appender.flume.Port = 9999
46 | log4j.appender.flume.UnsafeMode = true
47 |
48 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/C3p0Utils.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource
2 |
3 | import java.util.Properties
4 |
5 | import com.mchange.v2.c3p0.ComboPooledDataSource
6 |
7 | /**
8 | * @author : shirukai
9 | * @date : 2019-01-25 11:24
10 | */
11 | object C3p0Utils {
12 | def getDataSource(dbOptions: Map[String, String]): ComboPooledDataSource
13 | = {
14 | val properties = new Properties()
15 | dbOptions.foreach(x => properties.setProperty(x._1, x._2))
16 | val dataSource = new ComboPooledDataSource()
17 | dataSource.setDriverClass(dbOptions("driverClass"))
18 | dataSource.setJdbcUrl(dbOptions("jdbcUrl"))
19 | dataSource.setProperties(properties)
20 | dataSource
21 | }
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/MySQLSink.scala:
--------------------------------------------------------------------------------
1 |
2 | package org.apache.spark.sql.structured.datasource
3 |
4 | import org.apache.spark.internal.Logging
5 | import org.apache.spark.sql.execution.streaming.Sink
6 | import org.apache.spark.sql.streaming.OutputMode
7 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
8 |
9 | /**
10 | * @author : shirukai
11 | * @date : 2019-01-25 17:35
12 | */
13 | class MySQLSink(sqlContext: SQLContext,parameters: Map[String, String], outputMode: OutputMode) extends Sink with Logging {
14 | override def addBatch(batchId: Long, data: DataFrame): Unit = {
15 | val query = data.queryExecution
16 | val rdd = query.toRdd
17 | val df = sqlContext.internalCreateDataFrame(rdd, data.schema)
18 | df.show(false)
19 | df.write.format("jdbc").options(parameters).mode(SaveMode.Append).save()
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/MySQLSource.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource
2 |
3 | import java.sql.Connection
4 |
5 | import org.apache.spark.executor.InputMetrics
6 | import org.apache.spark.internal.Logging
7 | import org.apache.spark.sql.catalyst.InternalRow
8 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
9 | import org.apache.spark.sql.execution.streaming.{Offset, Source}
10 | import org.apache.spark.sql.types.StructType
11 | import org.apache.spark.sql.{DataFrame, SQLContext}
12 | import org.json4s.jackson.Serialization
13 | import org.json4s.{Formats, NoTypeHints}
14 |
15 |
16 | /**
17 | * @author : shirukai
18 | * @date : 2019-01-25 09:41
19 | */
20 | class MySQLSource(sqlContext: SQLContext,
21 | options: Map[String, String],
22 | schemaOption: Option[StructType]) extends Source with Logging {
23 |
24 | lazy val conn: Connection = C3p0Utils.getDataSource(options).getConnection
25 |
26 | val tableName: String = options("tableName")
27 |
28 | var currentOffset: Map[String, Long] = Map[String, Long](tableName -> 0)
29 |
30 | val maxOffsetPerBatch: Option[Long] = Option(100)
31 |
32 | val inputMetrics = new InputMetrics()
33 |
34 | override def schema: StructType = schemaOption.get
35 |
36 | /**
37 |   * Get the offset.
38 |   * Here we monitor changes in the row count of the MySQL table.
39 |   * @return Option[Offset]
40 |   */
41 | override def getOffset: Option[Offset] = {
42 | val latest = getLatestOffset
43 | val offsets = maxOffsetPerBatch match {
44 | case None => MySQLSourceOffset(latest)
45 | case Some(limit) =>
46 | MySQLSourceOffset(rateLimit(limit, currentOffset, latest))
47 | }
48 | Option(offsets)
49 | }
50 |
51 | /**
52 |   * Fetch the data.
53 |   * @param start offset of the previous batch
54 |   * @param end   latest offset
55 |   * @return df
56 |   */
57 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
58 |
59 | var offset: Long = 0
60 | if (start.isDefined) {
61 | offset = offset2Map(start.get)(tableName)
62 | }
63 | val limit = offset2Map(end)(tableName) - offset
64 | val sql = s"SELECT * FROM $tableName limit $limit offset $offset"
65 |
66 | val st = conn.prepareStatement(sql)
67 | val rs = st.executeQuery()
68 | val rows: Iterator[InternalRow] = JdbcUtils.resultSetToSparkInternalRows(rs, schemaOption.get, inputMetrics) // converts the JDBC ResultSet into Spark InternalRows
69 | val rdd = sqlContext.sparkContext.parallelize(rows.toSeq)
70 |
71 | currentOffset = offset2Map(end)
72 |
73 | sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
74 | }
75 |
76 | override def stop(): Unit = {
77 | conn.close()
78 | }
79 |
80 | def rateLimit(limit: Long, currentOffset: Map[String, Long], latestOffset: Map[String, Long]): Map[String, Long] = {
81 | val co = currentOffset(tableName)
82 | val lo = latestOffset(tableName)
83 | if (co + limit > lo) {
84 | Map[String, Long](tableName -> lo)
85 | } else {
86 | Map[String, Long](tableName -> (co + limit))
87 | }
88 | }
89 |
90 | // Get the latest row count (used as the offset)
91 | def getLatestOffset: Map[String, Long] = {
92 | var offset: Long = 0
93 | val sql = s"SELECT COUNT(1) FROM $tableName"
94 | val st = conn.prepareStatement(sql)
95 | val rs = st.executeQuery()
96 | while (rs.next()) {
97 | offset = rs.getLong(1)
98 | }
99 | Map[String, Long](tableName -> offset)
100 | }
101 |
102 | def offset2Map(offset: Offset): Map[String, Long] = {
103 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints)
104 | Serialization.read[Map[String, Long]](offset.json())
105 | }
106 | }
107 |
108 | case class MySQLSourceOffset(offset: Map[String, Long]) extends Offset {
109 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints)
110 |
111 | override def json(): String = Serialization.write(offset)
112 | }
113 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/MySQLSourceProvider.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.sql.SQLContext
5 | import org.apache.spark.sql.execution.streaming.{Sink, Source}
6 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider}
7 | import org.apache.spark.sql.streaming.OutputMode
8 | import org.apache.spark.sql.types.StructType
9 |
10 | /**
11 |   * @author : shirukai
12 |   * @date : 2019-01-25 09:10
13 |   * A custom MySQL data source
14 |   */
15 | class MySQLSourceProvider extends DataSourceRegister
16 | with StreamSourceProvider
17 | with StreamSinkProvider
18 | with Logging {
19 | /**
20 |   * The descriptive name of the data source, e.g. kafka, socket
21 |   *
22 |   * @return the shortName string
23 |   */
24 | override def shortName(): String = "mysql"
25 |
26 |
27 | /**
28 |   * Define the data source's schema
29 |   *
30 |   * @param sqlContext   Spark SQL context
31 |   * @param schema       schema passed in via the .schema() method
32 |   * @param providerName name of the Provider (package + class name)
33 |   * @param parameters   parameters passed in via the .option() method
34 |   * @return a tuple (shortName, schema)
35 |   */
36 | override def sourceSchema(
37 | sqlContext: SQLContext,
38 | schema: Option[StructType],
39 | providerName: String,
40 | parameters: Map[String, String]): (String, StructType) = {
41 | (providerName, schema.get)
42 | }
43 |
44 | /**
45 |   * Create the input source
46 |   *
47 |   * @param sqlContext   Spark SQL context
48 |   * @param metadataPath metadata path
49 |   * @param schema       schema passed in via the .schema() method
50 |   * @param providerName name of the Provider (package + class name)
51 |   * @param parameters   parameters passed in via the .option() method
52 |   * @return the custom Source, which must implement the Source interface
53 |   */
54 | override def createSource(
55 | sqlContext: SQLContext,
56 | metadataPath: String, schema: Option[StructType],
57 | providerName: String, parameters: Map[String, String]): Source = new MySQLSource(sqlContext, parameters, schema)
58 |
59 | /**
60 |   * Create the output sink
61 |   *
62 |   * @param sqlContext       Spark SQL context
63 |   * @param parameters       parameters passed in via the .option() method
64 |   * @param partitionColumns partition column names
65 |   * @param outputMode       output mode
66 |   * @return
67 |   */
68 | override def createSink(
69 | sqlContext: SQLContext,
70 | parameters: Map[String, String],
71 | partitionColumns: Seq[String], outputMode: OutputMode): Sink = new MySQLSink(sqlContext: SQLContext,parameters, outputMode)
72 | }
73 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/custom/CustomDataSink.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.custom
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.sql.execution.streaming.Sink
5 | import org.apache.spark.sql.streaming.OutputMode
6 | import org.apache.spark.sql.{DataFrame, SQLContext}
7 |
8 | /**
9 |   * @author : shirukai
10 |   * @date : 2019-01-25 18:03
11 |   * A custom data output sink
12 |   */
13 | class CustomDataSink(sqlContext: SQLContext,
14 | parameters: Map[String, String],
15 | outputMode: OutputMode) extends Sink with Logging {
16 |
17 | /**
18 |   * Add a batch, i.e. write the data out.
19 |   *
20 |   * @param batchId batchId
21 |   * @param data    DataFrame
22 |   * Trigger: called whenever a computation happens, receiving the DataFrame to be written out.
23 |   * Implementation notes:
24 |   * 1. Ways to write the data:
25 |   *    (1) Through a built-in Spark SQL data source:
26 |   *        once we have the DataFrame we can write it out via a built-in source,
27 |   *        e.g. the JSON, CSV, Text, Parquet or JDBC data sources.
28 |   *    (2) Through a custom Spark SQL data source.
29 |   *    (3) Through foreachPartition.
30 |   */
31 | override def addBatch(batchId: Long, data: DataFrame): Unit = ???
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/custom/CustomDataSource.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.custom
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.sql.execution.streaming.{Offset, Source}
5 | import org.apache.spark.sql.types.StructType
6 | import org.apache.spark.sql.{DataFrame, SQLContext}
7 |
8 | /**
9 |   * @author : shirukai
10 |   * @date : 2019-01-25 18:03
11 |   * A custom data input source: must implement the Source interface.
12 |   * Approach:
13 |   * (1) Override schema to specify the input source's schema; it must match the schema specified in the Provider.
14 |   * (2) Override getOffset to report the data offset; this method is polled continuously.
15 |   * (3) Override getBatch to fetch the data; it is triggered after the offset changes.
16 |   * (4) Override stop to release resources.
17 |   *
18 |   */
19 | class CustomDataSource(sqlContext: SQLContext,
20 | parameters: Map[String, String],
21 | schemaOption: Option[StructType]) extends Source
22 | with Logging {
23 |
24 | /**
25 |   * Specify the data source's schema; it must match the schema specified by sourceSchema in the Provider, otherwise an exception is thrown.
26 |   * Trigger: called when the data source is created.
27 |   *
28 |   * @return schema
29 |   */
30 | override def schema: StructType = schemaOption.get
31 |
32 | /**
33 |   * Get the offset, used to monitor changes in the data.
34 |   * Trigger: polled continuously.
35 |   * Implementation notes:
36 |   * (1) Offset implementation:
37 |   *     The return type is Option[Offset], so we must provide an Offset implementation.
38 |   *     We can extend org.apache.spark.sql.sources.v2.reader.streaming.Offset, which essentially just stores a JSON string.
39 |   *
40 |   * (2) JSON conversion:
41 |   *     Because an Offset is backed by a JSON string, the collection or case class holding our offsets must be converted to JSON.
42 |   *     Spark uses org.json4s.jackson to convert case classes and collections (Map, List, Seq, Set, etc.) to and from JSON strings.
43 |   *
44 |   * @return Offset
45 |   */
46 | override def getOffset: Option[Offset] = ???
47 |
48 | /**
49 |   * Fetch the data.
50 |   *
51 |   * @param start end offset of the previous batch
52 |   * @param end   new offset obtained from getOffset
53 |   * Trigger: called when the offset returned by the polled getOffset method changes.
54 |   *
55 |   * Implementation notes:
56 |   * (1) Creating the DataFrame:
57 |   *     generate an RDD, then build the DataFrame from it.
58 |   *     RDD: sqlContext.sparkContext.parallelize(rows.toSeq)
59 |   *     DataFrame: sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
60 |   * @return DataFrame
61 |   */
62 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = ???
63 |
64 | /**
65 |   * Release resources.
66 |   * Close anything that needs to be closed here, e.g. a MySQL database connection.
67 |   */
68 | override def stop(): Unit = ???
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/custom/CustomDataSourceProvider.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.custom
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.sql.SQLContext
5 | import org.apache.spark.sql.execution.streaming.{Sink, Source}
6 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider}
7 | import org.apache.spark.sql.streaming.OutputMode
8 | import org.apache.spark.sql.types.StructType
9 |
10 | /**
11 |   * @author : shirukai
12 |   * @date : 2019-01-25 17:49
13 |   * A custom Structured Streaming data source.
14 |   *
15 |   * (1) Extend DataSourceRegister
16 |   *     Override shortName to register the component with Spark.
17 |   *
18 |   * (2) Extend StreamSourceProvider
19 |   *     Override createSource and sourceSchema to create the data input source.
20 |   *
21 |   * (3) Extend StreamSinkProvider
22 |   *     Override createSink to create the data output sink.
23 |   *
24 |   *
25 |   */
26 | class CustomDataSourceProvider extends DataSourceRegister
27 | with StreamSourceProvider
28 | with StreamSinkProvider
29 | with Logging {
30 |
31 |
32 | /**
33 |   * The descriptive name of the data source, e.g. kafka, socket
34 |   *
35 |   * @return the shortName string
36 |   */
37 | override def shortName(): String = "custom"
38 |
39 |
40 | /**
41 |   * Define the data source's schema
42 |   *
43 |   * @param sqlContext   Spark SQL context
44 |   * @param schema       schema passed in via the .schema() method
45 |   * @param providerName name of the Provider (package + class name)
46 |   * @param parameters   parameters passed in via the .option() method
47 |   * @return a tuple (shortName, schema)
48 |   */
49 | override def sourceSchema(sqlContext: SQLContext,
50 | schema: Option[StructType],
51 | providerName: String,
52 | parameters: Map[String, String]): (String, StructType) = (shortName(),schema.get)
53 |
54 | /**
55 |   * Create the input source
56 |   *
57 |   * @param sqlContext   Spark SQL context
58 |   * @param metadataPath metadata path
59 |   * @param schema       schema passed in via the .schema() method
60 |   * @param providerName name of the Provider (package + class name)
61 |   * @param parameters   parameters passed in via the .option() method
62 |   * @return the custom Source, which must implement the Source interface
63 |   */
64 |
65 | override def createSource(sqlContext: SQLContext,
66 | metadataPath: String,
67 | schema: Option[StructType],
68 | providerName: String,
69 | parameters: Map[String, String]): Source = new CustomDataSource(sqlContext,parameters,schema)
70 |
71 |
72 | /**
73 |   * Create the output sink
74 |   *
75 |   * @param sqlContext       Spark SQL context
76 |   * @param parameters       parameters passed in via the .option() method
77 |   * @param partitionColumns partition column names
78 |   * @param outputMode       output mode
79 |   * @return
80 |   */
81 | override def createSink(sqlContext: SQLContext,
82 | parameters: Map[String, String],
83 | partitionColumns: Seq[String],
84 | outputMode: OutputMode): Sink = new CustomDataSink(sqlContext,parameters,outputMode)
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/ConsoleSinkExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import java.util.UUID
4 |
5 | import org.apache.spark.sql.SparkSession
6 |
7 | /**
8 |   * @author : shirukai
9 |   * @date : 2019-01-26 09:58
10 |   * Example of Spark Structured Streaming's built-in Console sink
11 |   */
12 | object ConsoleSinkExample {
13 | def main(args: Array[String]): Unit = {
14 |
15 | val spark = SparkSession
16 | .builder()
17 | .appName(this.getClass.getSimpleName)
18 | .master("local[2]")
19 | .getOrCreate()
20 |
21 | val source = spark.readStream
22 | .format("rate")
23 | // Rows generated per second; default is 1
24 | .option("rowsPerSecond", 10)
25 | .option("numPartitions", 10)
26 | .load()
27 |
28 | val consoleSink = source.writeStream
29 | .format("console")
30 | // Whether to truncate the displayed output
31 | .option("truncate", value = false)
32 | // Number of rows to display
33 | .option("numRows", 30)
34 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
35 | .start()
36 |
37 | consoleSink.awaitTermination()
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/FileSinkExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import java.util.UUID
4 |
5 | import org.apache.spark.sql.SparkSession
6 |
7 | /**
8 |   * @author : shirukai
9 |   * @date : 2019-01-26 09:58
10 |   * Example of Spark Structured Streaming's built-in File sink
11 |   */
12 | object FileSinkExample {
13 | def main(args: Array[String]): Unit = {
14 |
15 | val spark = SparkSession
16 | .builder()
17 | .appName(this.getClass.getSimpleName)
18 | .master("local[2]")
19 | .getOrCreate()
20 |
21 | val source = spark.readStream
22 | .format("rate")
23 | // Rows generated per second; default is 1
24 | .option("rowsPerSecond", 10)
25 | .option("numPartitions", 10)
26 | .load()
27 |
28 | val fileSink = source.writeStream
29 | .format("parquet")
30 | //.format("csv")
31 | //.format("orc")
32 | // .format("json")
33 | .option("path", "data/sink")
34 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
35 | .start()
36 |
37 | fileSink.awaitTermination()
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/FileSourceExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types._
5 |
6 | /**
7 | * @author : shirukai
8 | * @date : 2019-01-25 19:18
9 | * File source example
10 | */
11 | object FileSourceExample {
12 | def main(args: Array[String]): Unit = {
13 | val spark = SparkSession
14 | .builder()
15 | .appName(this.getClass.getSimpleName)
16 | .master("local[2]")
17 | .getOrCreate()
18 |
19 | val source = spark
20 | .readStream
21 | // Schema must be specified when creating a streaming source DataFrame.
22 | .schema(StructType(List(
23 | StructField("name", StringType),
24 | StructField("value", IntegerType)
25 | )))
26 | // Maximum number of new files to consider per trigger (default: no max)
27 | .option("maxFilesPerTrigger", 100)
28 | // Whether to process the latest files first (default: false)
29 | .option("latestFirst", value = true)
30 | // Whether to check only the file name, so a file with a previously seen name is not treated as new (default: false)
31 | .option("fileNameOnly", value = true)
32 | .csv("*.csv")
33 |
34 | val query = source.writeStream
35 | .outputMode("update")
36 | .format("console")
37 | //.option("checkpointLocation", checkpointLocation)
38 | .option("truncate", value = false)
39 | .start()
40 |
41 | query.awaitTermination()
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/ForeachSinkExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}
4 |
5 | /**
6 | * @author : shirukai
7 | * @date : 2019-01-26 09:58
8 | * Example of Spark Structured Streaming's built-in foreach sink
9 | */
10 | object ForeachSinkExample {
11 | def main(args: Array[String]): Unit = {
12 |
13 | val spark = SparkSession
14 | .builder()
15 | .appName(this.getClass.getSimpleName)
16 | .master("local[2]")
17 | .getOrCreate()
18 |
19 | val source = spark.readStream
20 | .format("rate")
21 | // Rows generated per second (default: 1)
22 | .option("rowsPerSecond", 10)
23 | .option("numPartitions", 10)
24 | .load()
25 |
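   | // ForeachWriter lifecycle: open() is called once per partition for each trigger, process() once per row, and close() when the partition finishes or fails.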
26 | val foreachSink = source.writeStream
27 | .foreach(new ForeachWriter[Row] {
28 | override def open(partitionId: Long, version: Long): Boolean = {
29 | println(s"partitionId=$partitionId,version=$version")
30 | true
31 |
32 | }
33 |
34 | override def process(value: Row): Unit = {
35 | println(value)
36 | }
37 |
38 | override def close(errorOrNull: Throwable): Unit = {
39 | println("close")
40 | }
41 | })
42 | .start()
43 |
44 | foreachSink.awaitTermination()
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/KafkaSinkExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import java.util.UUID
4 |
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.types.StringType
7 |
8 | /**
9 | * @author : shirukai
10 | * @date : 2019-01-26 09:58
11 | * Example of Spark Structured Streaming's built-in Kafka sink
12 | */
13 | object KafkaSinkExample {
14 | def main(args: Array[String]): Unit = {
15 |
16 | val spark = SparkSession
17 | .builder()
18 | .appName(this.getClass.getSimpleName)
19 | .master("local[2]")
20 | .getOrCreate()
21 |
22 | val source = spark.readStream
23 | .format("rate")
24 | // Rows generated per second (default: 1)
25 | .option("rowsPerSecond", 10)
26 | .option("numPartitions", 10)
27 | .load()
28 | import org.apache.spark.sql.functions._
29 | import spark.implicits._
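   | // The Kafka sink expects a string or binary "value" column (and an optional "key"), so the row is serialized to JSON for the value and the timestamp is used as the key.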
30 | val kafkaSink = source.select(array(to_json(struct("*"))).as("value").cast(StringType),
31 | $"timestamp".as("key").cast(StringType)).writeStream
32 | .format("kafka")
33 | .option("kafka.bootstrap.servers", "localhost:9092")
34 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
35 | .option("topic", "hiacloud-ts-dev")
36 | .start()
37 |
38 |
39 | kafkaSink.awaitTermination()
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/KafkaSourceExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | /**
6 | * @author : shirukai
7 | * @date : 2019-01-26 09:46
8 | * Kafka source example
9 | */
10 | object KafkaSourceExample {
11 | def main(args: Array[String]): Unit = {
12 | val spark = SparkSession
13 | .builder()
14 | .appName(this.getClass.getSimpleName)
15 | .master("local[2]")
16 | .getOrCreate()
17 |
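   | // "subscribe" takes a comma-separated list of topics; failOnDataLoss=true fails the query when offsets are no longer available in Kafka.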
18 | val source = spark
19 | .readStream
20 | .format("kafka")
21 | .option("kafka.bootstrap.servers", "localhost:9092")
22 | .option("subscribe", "hiacloud-ts-dev")
23 | //.option("startingOffsets", "earliest")
24 | .option("failOnDataLoss", "true")
25 | .load()
26 |
27 | val query = source.writeStream
28 | .outputMode("update")
29 | .format("console")
30 | //.option("checkpointLocation", checkpointLocation)
31 | .option("truncate", value = false)
32 | .start()
33 |
34 | query.awaitTermination()
35 |
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/MemorySinkExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import java.util.UUID
4 |
5 | import org.apache.spark.sql.SparkSession
6 |
7 | /**
8 | * @author : shirukai
9 | * @date : 2019-01-26 09:58
10 | * Example of Spark Structured Streaming's built-in memory sink
11 | */
12 | object MemorySinkExample {
13 | def main(args: Array[String]): Unit = {
14 |
15 | val spark = SparkSession
16 | .builder()
17 | .appName(this.getClass.getSimpleName)
18 | .master("local[2]")
19 | .getOrCreate()
20 |
21 | val source = spark.readStream
22 | .format("rate")
23 | // Rows generated per second (default: 1)
24 | .option("rowsPerSecond", 10)
25 | .option("numPartitions", 10)
26 | .load()
27 |
28 | val memorySink = source.writeStream
29 | .format("memory")
30 | .queryName("memorySinkTable")
31 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
32 | .start()
33 |
34 |
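   | // The memory sink keeps the results in an in-memory table registered under queryName(); poll it with Spark SQL to inspect the output.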
35 | new Thread(new Runnable {
36 | override def run(): Unit = {
37 | while (true) {
38 | spark.sql("select * from memorySinkTable").show(false)
39 | Thread.sleep(1000)
40 | }
41 | }
42 | }).start()
43 | memorySink.awaitTermination()
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/RateSourceExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | /**
6 | * @author : shirukai
7 | * @date : 2019-01-25 20:04
8 | * Rate source example
9 | */
10 | object RateSourceExample {
11 | def main(args: Array[String]): Unit = {
12 | val spark = SparkSession
13 | .builder()
14 | .appName(this.getClass.getSimpleName)
15 | .master("local[2]")
16 | .getOrCreate()
17 |
18 | val rate = spark.readStream
19 | .format("rate")
20 | // Rows generated per second (default: 1)
21 | .option("rowsPerSecond", 10)
22 | .option("numPartitions", 10)
23 | .load()
24 |
25 | val query = rate.writeStream
26 | .outputMode("update")
27 | .format("console")
28 | .option("truncate", value = false)
29 | .start()
30 |
31 | query.awaitTermination()
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/SocketSourceExample.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | /**
6 | * @author : shirukai
7 | * @date : 2019-01-25 19:57
8 | * Socket source example; send test data with: nc -lk 9090
9 | */
10 | object SocketSourceExample {
11 | def main(args: Array[String]): Unit = {
12 | val spark = SparkSession
13 | .builder()
14 | .appName(this.getClass.getSimpleName)
15 | .master("local[2]")
16 | .getOrCreate()
17 |
18 | val lines = spark.readStream
19 | .format("socket")
20 | .option("host", "localhost")
21 | .option("port", 9090)
22 | .load()
23 |
24 | val query = lines.writeStream
25 | .outputMode("update")
26 | .format("console")
27 | .option("truncate", value = false)
28 | .start()
29 |
30 | query.awaitTermination()
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/test/java/org/apache/spark/sql/structured/datasource/MySQLSourceTest.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.structured.datasource
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
5 |
6 | /**
7 | * @author : shirukai
8 | * @date : 2019-01-29 09:57
9 | * Test for the custom MySQLSource
10 | */
11 | object MySQLSourceTest {
12 | def main(args: Array[String]): Unit = {
13 | val spark = SparkSession
14 | .builder()
15 | .appName(this.getClass.getSimpleName)
16 | .master("local[2]")
17 | .getOrCreate()
18 | val schema = StructType(List(
19 | StructField("name", StringType),
20 | StructField("creatTime", TimestampType),
21 | StructField("modifyTime", TimestampType)
22 | )
23 | )
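   | // Connection options passed to the custom MySQLSourceProvider via .options()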
24 | val options = Map[String, String](
25 | "driverClass" -> "com.mysql.cj.jdbc.Driver",
26 | "jdbcUrl" -> "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8",
27 | "user" -> "root",
28 | "password" -> "hollysys",
29 | "tableName" -> "model")
30 | val source = spark
31 | .readStream
32 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider")
33 | .options(options)
34 | .schema(schema)
35 | .load()
36 |
37 | import org.apache.spark.sql.functions._
38 | val query = source.groupBy("creatTime").agg(collect_list("name").cast(StringType).as("names")).writeStream
39 | .outputMode("update")
40 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider")
41 | .option("checkpointLocation", "/tmp/MySQLSourceProvider11")
42 | .option("user","root")
43 | .option("password","hollysys")
44 | .option("dbtable","test")
45 | .option("url","jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8")
46 | .start()
47 |
48 | query.awaitTermination()
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=ERROR,console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
24 |
25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
26 | # log level for this class is used to overwrite the root logger's log level, so that
27 | # the user can have different defaults for the shell and regular Spark apps.
28 | log4j.logger.org.apache.spark.repl.Main=WARN
29 |
30 | # Settings to quiet third party logs that are too verbose
31 | log4j.logger.org.spark_project.jetty=WARN
32 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
35 | log4j.logger.org.apache.parquet=ERROR
36 | log4j.logger.parquet=ERROR
37 |
38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
41 |
42 |
43 | log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender
44 | log4j.appender.flume.Hostname = localhost
45 | log4j.appender.flume.Port = 9999
46 | log4j.appender.flume.UnsafeMode = true
47 |
48 |
--------------------------------------------------------------------------------