├── .gitignore ├── README.md ├── pom.xml └── src ├── main ├── resources │ └── log4j.properties └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── structured │ └── datasource │ ├── C3p0Utils.scala │ ├── MySQLSink.scala │ ├── MySQLSource.scala │ ├── MySQLSourceProvider.scala │ ├── custom │ ├── CustomDataSink.scala │ ├── CustomDataSource.scala │ └── CustomDataSourceProvider.scala │ └── example │ ├── ConsoleSinkExample.scala │ ├── FileSinkExample.scala │ ├── FileSourceExample.scala │ ├── ForeachSinkExample.scala │ ├── KafkaSinkExample.scala │ ├── KafkaSourceExample.scala │ ├── MemorySinkExample.scala │ ├── RateSourceExample.scala │ └── SocketSourceExample.scala └── test ├── java └── org │ └── apache │ └── spark │ └── sql │ └── structured │ └── datasource │ └── MySQLSourceTest.scala └── resources └── log4j.properties /.gitignore: -------------------------------------------------------------------------------- 1 | .settings 2 | .project 3 | .classpath 4 | target/ 5 | logs/ 6 | *.iml 7 | .iml* 8 | .idea 9 | .idea* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StructuredStreaming 内置数据源及实现自定义数据源 2 | 3 | > 版本说明: 4 | > 5 | > Spark:2.3/2.4 6 | > 7 | > 代码仓库:https://github.com/shirukai/spark-structured-datasource.git 8 | 9 | # 1 Structured内置的输入源 Source 10 | 11 | 官网文档:http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#input-sources 12 | 13 | | Source | Options | Fault-tolerant | Notes | 14 | | ------------- | ------------------------------------------------------------ | -------------- | ------------------------------------------ | 15 | | File Source | maxFilesPerTrigger:每个触发器中要考虑的最大新文件数(默认值:无最大值)latestFirst:是否先处理最新的新文件,当存在大量积压的文件时有用(默认值:false)
fileNameOnly:是否仅根据文件名(而不是完整路径)来判断文件是否为新文件(默认值:false)。 | 支持容错 | 支持glob路径,但不支持以逗号分隔的多个路径 | 16 | | Socket Source | host:要连接的主机,必须指定<br>
port:要连接的端口,必须指定 | 不支持容错 | | 17 | | Rate Source | rowsPerSecond(例如100,默认值:1):每秒生成的行数。<br>
rampUpTime(例如5s,默认值:0s):在生成速度变为之前加速多长时间rowsPerSecond。使用比秒更精细的粒度将被截断为整数秒。numPartitions(例如10,默认值:Spark的默认并行性):生成的行的分区号 | 支持容错 | | 18 | | Kafka Source | http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html | 支持容错 | | 19 | ## 1.1 File Source 20 | 21 | 将目录中写入的文件作为数据流读取。支持的文件格式为:text、csv、json、orc、parquet 22 | 23 | **用例** 24 | 25 | 代码位置:org.apache.spark.sql.structured.datasource.example 26 | 27 | ```scala 28 | val source = spark 29 | .readStream 30 | // Schema must be specified when creating a streaming source DataFrame. 31 | .schema(schema) 32 | // 每个trigger最大文件数量 33 | .option("maxFilesPerTrigger",100) 34 | // 是否首先计算最新的文件,默认为false 35 | .option("latestFirst",value = true) 36 | // 是否值检查名字,如果名字相同,则不视为更新,默认为false 37 | .option("fileNameOnly",value = true) 38 | .csv("*.csv") 39 | ``` 40 | 41 | ## 1.2 Socket Source 42 | 43 | 从Socket中读取UTF8文本数据。一般用于测试,使用nc -lc 端口号 向Socket监听的端口发送数据。 44 | 45 | **用例** 46 | 47 | 代码位置:org.apache.spark.sql.structured.datasource.example 48 | 49 | ```scala 50 | val lines = spark.readStream 51 | .format("socket") 52 | .option("host", "localhost") 53 | .option("port", 9090) 54 | .load() 55 | ``` 56 | 57 | ## 1.3 Rate Source 58 | 59 | 以每秒指定的行数生成数据,每个输出行包含一个`timestamp`和`value`。其中`timestamp`是一个`Timestamp`含有信息分配的时间类型,并且`value`是`Long`包含消息的计数从0开始作为第一行类型。此源用于测试和基准测试。 60 | 61 | **用例** 62 | 63 | 代码位置:org.apache.spark.sql.structured.datasource.example 64 | 65 | ```scala 66 | val rate = spark.readStream 67 | .format("rate") 68 | // 每秒生成的行数,默认值为1 69 | .option("rowsPerSecond", 10) 70 | .option("numPartitions", 10) 71 | .option("rampUpTime",0) 72 | .option("rampUpTime",5) 73 | .load() 74 | ``` 75 | 76 | ## 1.4 Kafka Source 77 | 78 | 官网文档:http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html 79 | 80 | **用例** 81 | 82 | 代码位置:org.apache.spark.sql.structured.datasource.example 83 | 84 | ```scala 85 | val df = spark 86 | .readStream 87 | .format("kafka") 88 | .option("kafka.bootstrap.servers", "host1:port1,host2:port2") 89 | .option("subscribePattern", "topic.*") 90 | .load() 91 | ``` 92 | 93 | # 2 Structured 内置的输出源 Sink 94 | 95 | | Sink | Supported Output Modes | Options | Fault-tolerant | Notes | 96 | | ----------------- | ---------------------- | ------------------------------------------------------------ | ------------------------- | ------------------------------------------------------------ | 97 | | File Sink | Append | path:输出路径(必须指定) | 支持容错(exactly-once) | 支持分区写入 | 98 | | Kafka Sink | Append,Update,Complete | See the [Kafka Integration Guide](http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html) | 支持容错(at-least-once) | [Kafka Integration Guide](http://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html) | 99 | | Foreach Sink | Append,Update,Complete | None | | [Foreach Guide](http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch) | 100 | | ForeachBatch Sink | Append,Update,Complete | None | | [Foreach Guide](http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch) | 101 | | Console Sink | Append,Update,Complete | numRows:每次触发器打印的行数(默认值:20)
truncate:是否过长时截断输出(默认值:true | | | 102 | | Memory Sink | Append,Complete | None | | 表名是查询的名字 | 103 | 104 | ## 2.1 File Sink 105 | 106 | 将结果输出到文件,支持格式parquet、csv、orc、json等 107 | 108 | **用例** 109 | 110 | 代码位置:org.apache.spark.sql.structured.datasource.example 111 | 112 | ```scala 113 | val fileSink = source.writeStream 114 | .format("parquet") 115 | //.format("csv") 116 | //.format("orc") 117 | // .format("json") 118 | .option("path", "data/sink") 119 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString) 120 | .start() 121 | ``` 122 | 123 | ## 2.2 Console Sink 124 | 125 | 将结果输出到控制台 126 | 127 | **用例** 128 | 129 | 代码位置:org.apache.spark.sql.structured.datasource.example 130 | 131 | ```scala 132 | val consoleSink = source.writeStream 133 | .format("console") 134 | // 是否压缩显示 135 | .option("truncate", value = false) 136 | // 显示条数 137 | .option("numRows", 30) 138 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString) 139 | .start() 140 | 141 | ``` 142 | 143 | ## 2.3 Memory Sink 144 | 145 | 将结果输出到内存,需要指定内存中的表名。可以使用sql进行查询 146 | 147 | **用例** 148 | 149 | 代码位置:org.apache.spark.sql.structured.datasource.example 150 | 151 | ````scala 152 | 153 | val memorySink = source.writeStream 154 | .format("memory") 155 | .queryName("memorySinkTable") 156 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString) 157 | .start() 158 | 159 | 160 | new Thread(new Runnable { 161 | override def run(): Unit = { 162 | while (true) { 163 | spark.sql("select * from memorySinkTable").show(false) 164 | Thread.sleep(1000) 165 | } 166 | } 167 | }).start() 168 | memorySink.awaitTermination() 169 | ```` 170 | 171 | ## 2.4 Kafka Sink 172 | 173 | 将结果输出到Kafka,需要将DataFrame转成key,value两列,或者topic、key、value三列 174 | 175 | **用例** 176 | 177 | 代码位置:org.apache.spark.sql.structured.datasource.example 178 | 179 | ```scala 180 | import org.apache.spark.sql.functions._ 181 | import spark.implicits._ 182 | val kafkaSink = source.select(array(to_json(struct("*"))).as("value").cast(StringType), 183 | $"timestamp".as("key").cast(StringType)).writeStream 184 | .format("kafka") 185 | .option("kafka.bootstrap.servers", "localhost:9092") 186 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString) 187 | .option("topic", "hiacloud-ts-dev") 188 | .start() 189 | ``` 190 | 191 | ## 2.5 ForeachBatch Sink(2.4) 192 | 193 | 适用于对于一个批次来说应用相同的写入方式的场景。方法传入这个batch的DataFrame以及batchId。这个方法在2.3之后的版本才有而且仅支持微批模式。 194 | 195 | ![](http://shirukai.gitee.io/images/4a5973ac848b33f8b5938be7e7754b85.jpg) 196 | 197 | **用例** 198 | 199 | 代码位置:org.apache.spark.sql.structured.datasource.example 200 | 201 | ```scala 202 | val foreachBatchSink = source.writeStream.foreachBatch((batchData: DataFrame, batchId) => { 203 | batchData.show(false) 204 | }).start() 205 | ``` 206 | 207 | ## 2.6 Foreach Sink 208 | 209 | Foreach 每一条记录,通过继承ForeachWriter[Row],实现open(),process(),close()方法。在open方法了我们可以获取一个资源连接,如MySQL的连接。在process里我们可以获取一条记录,并处理这条数据发送到刚才获取资源连接的MySQL中,在close里我们可以关闭资源连接。注意,foreach是对Partition来说的,同一个分区只会调用一次open、close方法,但对于每条记录来说,都会调用process方法。 210 | 211 | **用例** 212 | 213 | 代码位置:org.apache.spark.sql.structured.datasource.example 214 | 215 | ```scala 216 | val foreachSink = source.writeStream 217 | .foreach(new ForeachWriter[Row] { 218 | override def open(partitionId: Long, version: Long): Boolean = { 219 | println(s"partitionId=$partitionId,version=$version") 220 | true 221 | 222 | } 223 | 224 | override def process(value: Row): Unit = { 225 | println(value) 226 | } 227 | 228 | override def 
close(errorOrNull: Throwable): Unit = { 229 | println("close") 230 | } 231 | }) 232 | .start() 233 | 234 | ``` 235 | 236 | # 3 自定义输入源 237 | 238 | 某些应用场景下我们可能需要自定义数据源,如业务中,需要在获取KafkaSource的同时,动态从缓存中或者http请求中加载业务数据,或者是其它的数据源等都可以参考规范自定义。自定义输入源需要以下步骤: 239 | 240 | 第一步:继承DataSourceRegister和StreamSourceProvider创建自定义Provider类 241 | 242 | 第二步:重写DataSourceRegister类中的shotName和StreamSourceProvider中的createSource以及sourceSchema方法 243 | 244 | 第三步:继承Source创建自定义Source类 245 | 246 | 第四步:重写Source中的schema方法指定输入源的schema 247 | 248 | 第五步:重写Source中的getOffest方法监听流数据 249 | 250 | 第六步:重写Source中的getBatch方法获取数据 251 | 252 | 第七步:重写Source中的stop方法用来关闭资源 253 | 254 | ## 3.1 创建CustomDataSourceProvider类 255 | 256 | ### 3.1.1 继承DataSourceRegister和StreamSourceProvider 257 | 258 | 要创建自定义的DataSourceProvider必须要继承位于org.apache.spark.sql.sources包下的DataSourceRegister以及该包下的StreamSourceProvider。如下所示: 259 | 260 | ```scala 261 | class CustomDataSourceProvider extends DataSourceRegister 262 | with StreamSourceProvider 263 | with Logging { 264 | //Override some functions …… 265 | } 266 | ``` 267 | 268 | ### 3.1.2 重写DataSourceRegister的shotName方法 269 | 270 | 该方法用来指定一个数据源的名字,用来想spark注册该数据源。如Spark内置的数据源的shotName:kafka 271 | 272 | 、socket、rate等,该方法返回一个字符串,如下所示: 273 | 274 | ```scala 275 | /** 276 | * 数据源的描述名字,如:kafka、socket 277 | * 278 | * @return 字符串shotName 279 | */ 280 | override def shortName(): String = "custom" 281 | ``` 282 | 283 | ### 3.1.3 重写StreamSourceProvider中的sourceSchema方法 284 | 285 | 该方法是用来定义数据源的schema,可以使用用户传入的schema,也可以根据传入的参数进行动态创建。返回值是个二元组(shotName,scheam),代码如下所示: 286 | 287 | ```scala 288 | /** 289 | * 定义数据源的Schema 290 | * 291 | * @param sqlContext Spark SQL 上下文 292 | * @param schema 通过.schema()方法传入的schema 293 | * @param providerName Provider的名称,包名+类名 294 | * @param parameters 通过.option()方法传入的参数 295 | * @return 元组,(shotName,schema) 296 | */ 297 | override def sourceSchema(sqlContext: SQLContext, 298 | schema: Option[StructType], 299 | providerName: String, 300 | parameters: Map[String, String]): (String, StructType) = (shortName(),schema.get) 301 | ``` 302 | 303 | ### 3.1.4 重写StreamSourceProvider中的createSource方法 304 | 305 | 通过传入的参数,来实例化我们自定义的DataSource,是我们自定义Source的重要入口的地方 306 | 307 | ```scala 308 | /** 309 | * 创建输入源 310 | * 311 | * @param sqlContext Spark SQL 上下文 312 | * @param metadataPath 元数据Path 313 | * @param schema 通过.schema()方法传入的schema 314 | * @param providerName Provider的名称,包名+类名 315 | * @param parameters 通过.option()方法传入的参数 316 | * @return 自定义source,需要继承Source接口实现 317 | **/ 318 | 319 | override def createSource(sqlContext: SQLContext, 320 | metadataPath: String, 321 | schema: Option[StructType], 322 | providerName: String, 323 | parameters: Map[String, String]): Source = new CustomDataSource(sqlContext,parameters,schema) 324 | ``` 325 | 326 | ### 3.1.5 CustomDataSourceProvider.scala完整代码 327 | 328 | ```scala 329 | package org.apache.spark.sql.structured.datasource.custom 330 | 331 | import org.apache.spark.internal.Logging 332 | import org.apache.spark.sql.SQLContext 333 | import org.apache.spark.sql.execution.streaming.{Sink, Source} 334 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider} 335 | import org.apache.spark.sql.streaming.OutputMode 336 | import org.apache.spark.sql.types.StructType 337 | 338 | /** 339 | * @author : shirukai 340 | * @date : 2019-01-25 17:49 341 | * 自定义Structured Streaming数据源 342 | * 343 | * (1)继承DataSourceRegister类 344 | * 需要重写shortName方法,用来向Spark注册该组件 345 | * 346 | * (2)继承StreamSourceProvider类 347 | * 需要重写createSource以及sourceSchema方法,用来创建数据输入源 
348 | * 349 | * (3)继承StreamSinkProvider类 350 | * 需要重写createSink方法,用来创建数据输出源 351 | * 352 | * 353 | */ 354 | class CustomDataSourceProvider extends DataSourceRegister 355 | with StreamSourceProvider 356 | with StreamSinkProvider 357 | with Logging { 358 | 359 | 360 | /** 361 | * 数据源的描述名字,如:kafka、socket 362 | * 363 | * @return 字符串shotName 364 | */ 365 | override def shortName(): String = "custom" 366 | 367 | 368 | /** 369 | * 定义数据源的Schema 370 | * 371 | * @param sqlContext Spark SQL 上下文 372 | * @param schema 通过.schema()方法传入的schema 373 | * @param providerName Provider的名称,包名+类名 374 | * @param parameters 通过.option()方法传入的参数 375 | * @return 元组,(shotName,schema) 376 | */ 377 | override def sourceSchema(sqlContext: SQLContext, 378 | schema: Option[StructType], 379 | providerName: String, 380 | parameters: Map[String, String]): (String, StructType) = (shortName(),schema.get) 381 | 382 | /** 383 | * 创建输入源 384 | * 385 | * @param sqlContext Spark SQL 上下文 386 | * @param metadataPath 元数据Path 387 | * @param schema 通过.schema()方法传入的schema 388 | * @param providerName Provider的名称,包名+类名 389 | * @param parameters 通过.option()方法传入的参数 390 | * @return 自定义source,需要继承Source接口实现 391 | **/ 392 | 393 | override def createSource(sqlContext: SQLContext, 394 | metadataPath: String, 395 | schema: Option[StructType], 396 | providerName: String, 397 | parameters: Map[String, String]): Source = new CustomDataSource(sqlContext,parameters,schema) 398 | 399 | 400 | /** 401 | * 创建输出源 402 | * 403 | * @param sqlContext Spark SQL 上下文 404 | * @param parameters 通过.option()方法传入的参数 405 | * @param partitionColumns 分区列名? 406 | * @param outputMode 输出模式 407 | * @return 408 | */ 409 | override def createSink(sqlContext: SQLContext, 410 | parameters: Map[String, String], 411 | partitionColumns: Seq[String], 412 | outputMode: OutputMode): Sink = new CustomDataSink(sqlContext,parameters,outputMode) 413 | } 414 | 415 | ``` 416 | 417 | ## 3.2 创建CustomDataSource类 418 | 419 | ### 3.2.1 继承Source创建CustomDataSource类 420 | 421 | 要创建自定义的DataSource必须要继承位于org.apache.spark.sql.sources包下的Source。如下所示: 422 | 423 | ```scala 424 | class CustomDataSource(sqlContext: SQLContext, 425 | parameters: Map[String, String], 426 | schemaOption: Option[StructType]) extends Source 427 | with Logging { 428 | //Override some functions …… 429 | } 430 | ``` 431 | 432 | ### 3.2.2 重写Source的schema方法 433 | 434 | 指定数据源的schema,需要与Provider中的sourceSchema指定的schema保持一致,否则会报异常 435 | 436 | ```scala 437 | /** 438 | * 指定数据源的schema,需要与Provider中sourceSchema中指定的schema保持一直,否则会报异常 439 | * 触发机制:当创建数据源的时候被触发执行 440 | * 441 | * @return schema 442 | */ 443 | override def schema: StructType = schemaOption.get 444 | ``` 445 | 446 | ### 3.2.3 重写Source的getOffset方法 447 | 448 | 此方法是Spark不断的轮询执行的,目的是用来监控流数据的变化情况,一旦数据发生变化,就会触发getBatch方法用来获取数据。 449 | 450 | ```scala 451 | /** 452 | * 获取offset,用来监控数据的变化情况 453 | * 触发机制:不断轮询调用 454 | * 实现要点: 455 | * (1)Offset的实现: 456 | * 由函数返回值可以看出,我们需要提供一个标准的返回值Option[Offset] 457 | * 我们可以通过继承 org.apache.spark.sql.sources.v2.reader.streaming.Offset实现,这里面其实就是保存了个json字符串 458 | * 459 | * (2) JSON转化 460 | * 因为Offset里实现的是一个json字符串,所以我们需要将我们存放offset的集合或者case class转化重json字符串 461 | * spark里是通过org.json4s.jackson这个包来实现case class 集合类(Map、List、Seq、Set等)与json字符串的相互转化 462 | * 463 | * @return Offset 464 | */ 465 | override def getOffset: Option[Offset] = ??? 
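// 下面给出一个最小化的实现示意(并非本仓库的实际代码,仅作参考):
// 假设偏移量用 Map[String, Long] 表示,并且已经实现了一个把该 Map 序列化成 json 的
// Offset 子类 CustomOffset(类名与 queryLatestOffset 方法均为假设的名字):
//
// override def getOffset: Option[Offset] = {
//   val latest: Map[String, Long] = queryLatestOffset() // 从外部系统查询最新的偏移量
//   if (latest.isEmpty) None else Some(CustomOffset(latest))
// }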
466 | ``` 467 | 468 | ### 3.2.4 重写Source的getBatch方法 469 | 470 | 此方法是Spark用来获取数据的,getOffset方法检测的数据发生变化的时候,会触发该方法, 传入上一次触发时的end Offset作为当前batch的start Offset,将新的offset作为end Offset。 471 | 472 | ```scala 473 | /** 474 | * 获取数据 475 | * 476 | * @param start 上一个批次的end offset 477 | * @param end 通过getOffset获取的新的offset 478 | * 触发机制:当不断轮询的getOffset方法,获取的offset发生改变时,会触发该方法 479 | * 480 | * 实现要点: 481 | * (1)DataFrame的创建: 482 | * 可以通过生成RDD,然后使用RDD创建DataFrame 483 | * RDD创建:sqlContext.sparkContext.parallelize(rows.toSeq) 484 | * DataFrame创建:sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) 485 | * @return DataFrame 486 | */ 487 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = ??? 488 | ``` 489 | 490 | ### 3.2.5 重写Source的stop方法 491 | 492 | 用来关闭一些需要关闭或停止的资源及进程 493 | 494 | ```scala 495 | /** 496 | * 关闭资源 497 | * 将一些需要关闭的资源放到这里来关闭,如MySQL的数据库连接等 498 | */ 499 | override def stop(): Unit = ??? 500 | ``` 501 | 502 | ### 3.2.6 CustomDataSource.scala完整代码 503 | 504 | ```scala 505 | package org.apache.spark.sql.structured.datasource.custom 506 | 507 | import org.apache.spark.internal.Logging 508 | import org.apache.spark.sql.execution.streaming.{Offset, Source} 509 | import org.apache.spark.sql.types.StructType 510 | import org.apache.spark.sql.{DataFrame, SQLContext} 511 | 512 | /** 513 | * @author : shirukai 514 | * @date : 2019-01-25 18:03 515 | * 自定义数据输入源:需要继承Source接口 516 | * 实现思路: 517 | * (1)通过重写schema方法来指定数据输入源的schema,这个schema需要与Provider中指定的schema保持一致 518 | * (2)通过重写getOffset方法来获取数据的偏移量,这个方法会一直被轮询调用,不断的获取偏移量 519 | * (3) 通过重写getBatch方法,来获取数据,这个方法是在偏移量发生改变后被触发 520 | * (4)通过stop方法,来进行一下关闭资源的操作 521 | * 522 | */ 523 | class CustomDataSource(sqlContext: SQLContext, 524 | parameters: Map[String, String], 525 | schemaOption: Option[StructType]) extends Source 526 | with Logging { 527 | 528 | /** 529 | * 指定数据源的schema,需要与Provider中sourceSchema中指定的schema保持一直,否则会报异常 530 | * 触发机制:当创建数据源的时候被触发执行 531 | * 532 | * @return schema 533 | */ 534 | override def schema: StructType = schemaOption.get 535 | 536 | /** 537 | * 获取offset,用来监控数据的变化情况 538 | * 触发机制:不断轮询调用 539 | * 实现要点: 540 | * (1)Offset的实现: 541 | * 由函数返回值可以看出,我们需要提供一个标准的返回值Option[Offset] 542 | * 我们可以通过继承 org.apache.spark.sql.sources.v2.reader.streaming.Offset实现,这里面其实就是保存了个json字符串 543 | * 544 | * (2) JSON转化 545 | * 因为Offset里实现的是一个json字符串,所以我们需要将我们存放offset的集合或者case class转化重json字符串 546 | * spark里是通过org.json4s.jackson这个包来实现case class 集合类(Map、List、Seq、Set等)与json字符串的相互转化 547 | * 548 | * @return Offset 549 | */ 550 | override def getOffset: Option[Offset] = ??? 551 | 552 | /** 553 | * 获取数据 554 | * 555 | * @param start 上一个批次的end offset 556 | * @param end 通过getOffset获取的新的offset 557 | * 触发机制:当不断轮询的getOffset方法,获取的offset发生改变时,会触发该方法 558 | * 559 | * 实现要点: 560 | * (1)DataFrame的创建: 561 | * 可以通过生成RDD,然后使用RDD创建DataFrame 562 | * RDD创建:sqlContext.sparkContext.parallelize(rows.toSeq) 563 | * DataFrame创建:sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) 564 | * @return DataFrame 565 | */ 566 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = ??? 567 | 568 | /** 569 | * 关闭资源 570 | * 将一些需要关闭的资源放到这里来关闭,如MySQL的数据库连接等 571 | */ 572 | override def stop(): Unit = ??? 
573 | } 574 | ``` 575 | 576 | ## 3.3 自定义DataSource的使用 577 | 578 | 自定义DataSource的使用与内置DataSource一样,只需要在format里指定一下我们的Provider类路径即可。如 579 | 580 | ```scala 581 | val source = spark 582 | .readStream 583 | .format("org.apache.spark.sql.kafka010.CustomSourceProvider") 584 | .options(options) 585 | .schema(schema) 586 | .load() 587 | ``` 588 | 589 | ## 3.4 实现MySQL自定义数据源 590 | 591 | 此例子仅仅是为了演示如何自定义数据源,与实际业务场景无关。 592 | 593 | ### 3.4.1 创建MySQLSourceProvider.scala 594 | 595 | ```scala 596 | package org.apache.spark.sql.structured.datasource 597 | 598 | import org.apache.spark.internal.Logging 599 | import org.apache.spark.sql.SQLContext 600 | import org.apache.spark.sql.execution.streaming.{Sink, Source} 601 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider} 602 | import org.apache.spark.sql.streaming.OutputMode 603 | import org.apache.spark.sql.types.StructType 604 | 605 | /** 606 | * @author : shirukai 607 | * @date : 2019-01-25 09:10 608 | * 自定义MySQL数据源 609 | */ 610 | class MySQLSourceProvider extends DataSourceRegister 611 | with StreamSourceProvider 612 | with StreamSinkProvider 613 | with Logging { 614 | /** 615 | * 数据源的描述名字,如:kafka、socket 616 | * 617 | * @return 字符串shotName 618 | */ 619 | override def shortName(): String = "mysql" 620 | 621 | 622 | /** 623 | * 定义数据源的Schema 624 | * 625 | * @param sqlContext Spark SQL 上下文 626 | * @param schema 通过.schema()方法传入的schema 627 | * @param providerName Provider的名称,包名+类名 628 | * @param parameters 通过.option()方法传入的参数 629 | * @return 元组,(shotName,schema) 630 | */ 631 | override def sourceSchema( 632 | sqlContext: SQLContext, 633 | schema: Option[StructType], 634 | providerName: String, 635 | parameters: Map[String, String]): (String, StructType) = { 636 | (providerName, schema.get) 637 | } 638 | 639 | /** 640 | * 创建输入源 641 | * 642 | * @param sqlContext Spark SQL 上下文 643 | * @param metadataPath 元数据Path 644 | * @param schema 通过.schema()方法传入的schema 645 | * @param providerName Provider的名称,包名+类名 646 | * @param parameters 通过.option()方法传入的参数 647 | * @return 自定义source,需要继承Source接口实现 648 | */ 649 | override def createSource( 650 | sqlContext: SQLContext, 651 | metadataPath: String, schema: Option[StructType], 652 | providerName: String, parameters: Map[String, String]): Source = new MySQLSource(sqlContext, parameters, schema) 653 | 654 | /** 655 | * 创建输出源 656 | * 657 | * @param sqlContext Spark SQL 上下文 658 | * @param parameters 通过.option()方法传入的参数 659 | * @param partitionColumns 分区列名? 
660 | * @param outputMode 输出模式 661 | * @return 662 | */ 663 | override def createSink( 664 | sqlContext: SQLContext, 665 | parameters: Map[String, String], 666 | partitionColumns: Seq[String], outputMode: OutputMode): Sink = new MySQLSink(sqlContext: SQLContext,parameters, outputMode) 667 | } 668 | ``` 669 | 670 | ### 3.4.2 创建MySQLSource.scala 671 | 672 | ```scala 673 | package org.apache.spark.sql.structured.datasource 674 | 675 | import java.sql.Connection 676 | 677 | import org.apache.spark.executor.InputMetrics 678 | import org.apache.spark.internal.Logging 679 | import org.apache.spark.sql.catalyst.InternalRow 680 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils 681 | import org.apache.spark.sql.execution.streaming.{Offset, Source} 682 | import org.apache.spark.sql.types.StructType 683 | import org.apache.spark.sql.{DataFrame, SQLContext} 684 | import org.json4s.jackson.Serialization 685 | import org.json4s.{Formats, NoTypeHints} 686 | 687 | 688 | /** 689 | * @author : shirukai 690 | * @date : 2019-01-25 09:41 691 | */ 692 | class MySQLSource(sqlContext: SQLContext, 693 | options: Map[String, String], 694 | schemaOption: Option[StructType]) extends Source with Logging { 695 | 696 | lazy val conn: Connection = C3p0Utils.getDataSource(options).getConnection 697 | 698 | val tableName: String = options("tableName") 699 | 700 | var currentOffset: Map[String, Long] = Map[String, Long](tableName -> 0) 701 | 702 | val maxOffsetPerBatch: Option[Long] = Option(100) 703 | 704 | val inputMetrics = new InputMetrics() 705 | 706 | override def schema: StructType = schemaOption.get 707 | 708 | /** 709 | * 获取Offset 710 | * 这里监控MySQL数据库表中条数变化情况 711 | * @return Option[Offset] 712 | */ 713 | override def getOffset: Option[Offset] = { 714 | val latest = getLatestOffset 715 | val offsets = maxOffsetPerBatch match { 716 | case None => MySQLSourceOffset(latest) 717 | case Some(limit) => 718 | MySQLSourceOffset(rateLimit(limit, currentOffset, latest)) 719 | } 720 | Option(offsets) 721 | } 722 | 723 | /** 724 | * 获取数据 725 | * @param start 上一次的offset 726 | * @param end 最新的offset 727 | * @return df 728 | */ 729 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = { 730 | 731 | var offset: Long = 0 732 | if (start.isDefined) { 733 | offset = offset2Map(start.get)(tableName) 734 | } 735 | val limit = offset2Map(end)(tableName) - offset 736 | val sql = s"SELECT * FROM $tableName limit $limit offset $offset" 737 | 738 | val st = conn.prepareStatement(sql) 739 | val rs = st.executeQuery() 740 | val rows: Iterator[InternalRow] = JdbcUtils.resultSetToSparkInternalRows(rs, schemaOption.get, inputMetrics) //todo 好用 741 | val rdd = sqlContext.sparkContext.parallelize(rows.toSeq) 742 | 743 | currentOffset = offset2Map(end) 744 | 745 | sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) 746 | } 747 | 748 | override def stop(): Unit = { 749 | conn.close() 750 | } 751 | 752 | def rateLimit(limit: Long, currentOffset: Map[String, Long], latestOffset: Map[String, Long]): Map[String, Long] = { 753 | val co = currentOffset(tableName) 754 | val lo = latestOffset(tableName) 755 | if (co + limit > lo) { 756 | Map[String, Long](tableName -> lo) 757 | } else { 758 | Map[String, Long](tableName -> (co + limit)) 759 | } 760 | } 761 | 762 | // 获取最新条数 763 | def getLatestOffset: Map[String, Long] = { 764 | var offset: Long = 0 765 | val sql = s"SELECT COUNT(1) FROM $tableName" 766 | val st = conn.prepareStatement(sql) 767 | val rs = st.executeQuery() 768 | while (rs.next()) { 769 | 
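      // COUNT(1) 查询到的总行数,直接作为该表当前最新的 offset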
offset = rs.getLong(1) 770 | } 771 | Map[String, Long](tableName -> offset) 772 | } 773 | 774 | def offset2Map(offset: Offset): Map[String, Long] = { 775 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints) 776 | Serialization.read[Map[String, Long]](offset.json()) 777 | } 778 | } 779 | 780 | case class MySQLSourceOffset(offset: Map[String, Long]) extends Offset { 781 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints) 782 | 783 | override def json(): String = Serialization.write(offset) 784 | } 785 | ``` 786 | 787 | ### 3.4.3 测试MySQLSource 788 | 789 | ```scala 790 | package org.apache.spark.sql.structured.datasource 791 | 792 | import org.apache.spark.sql.SparkSession 793 | import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} 794 | 795 | /** 796 | * @author : shirukai 797 | * @date : 2019-01-25 15:12 798 | */ 799 | object MySQLSourceTest { 800 | def main(args: Array[String]): Unit = { 801 | val spark = SparkSession 802 | .builder() 803 | .appName(this.getClass.getSimpleName) 804 | .master("local[2]") 805 | .getOrCreate() 806 | val schema = StructType(List( 807 | StructField("name", StringType), 808 | StructField("creatTime", TimestampType), 809 | StructField("modifyTime", TimestampType) 810 | ) 811 | ) 812 | val options = Map[String, String]( 813 | "driverClass" -> "com.mysql.cj.jdbc.Driver", 814 | "jdbcUrl" -> "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8", 815 | "user" -> "root", 816 | "password" -> "hollysys", 817 | "tableName" -> "model") 818 | val source = spark 819 | .readStream 820 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider") 821 | .options(options) 822 | .schema(schema) 823 | .load() 824 | 825 | import org.apache.spark.sql.functions._ 826 | val query = source.writeStream.format("console") 827 | // 是否压缩显示 828 | .option("truncate", value = false) 829 | // 显示条数 830 | .option("numRows", 30) 831 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString) 832 | .start() 833 | query.awaitTermination() 834 | } 835 | } 836 | 837 | ``` 838 | 839 | # 4 自定义输出源 840 | 841 | 相比较输入源的自定义性,输出源自定义的应用场景貌似更为常用。比如:数据写入关系型数据库、数据写入HBase、数据写入Redis等等。其实Structured提供的foreach以及2.4版本的foreachBatch方法已经可以实现绝大数的应用场景的,几乎是数据想写到什么地方都能实现。但是想要更优雅的实现,我们可以参考Spark SQL Sink规范,通过自定义的Sink的方式来实现。实现自定义Sink需要以下四个个步骤: 842 | 843 | 第一步:继承DataSourceRegister和StreamSinkProvider创建自定义SinkProvider类 844 | 845 | 第二步:重写DataSourceRegister类中的shotName和StreamSinkProvider中的createSink方法 846 | 847 | 第三步:继承Sink创建自定义Sink类 848 | 849 | 第四步:重写Sink中的addBatch方法 850 | 851 | ## 4.1 改写CustomDataSourceProvider类 852 | 853 | ### 4.1.1 新增继承StreamSinkProvider 854 | 855 | 在上面创建自定义输入源的基础上,新增继承StreamSourceProvider。如下所示: 856 | 857 | ```scala 858 | class CustomDataSourceProvider extends DataSourceRegister 859 | with StreamSourceProvider 860 | with StreamSinkProvider 861 | with Logging { 862 | //Override some functions …… 863 | } 864 | ``` 865 | 866 | ### 4.1.2 重写StreamSinkProvider中的createSink方法 867 | 868 | 通过传入的参数,来实例化我们自定义的DataSink,是我们自定义Sink的重要入口的地方 869 | 870 | ```scala 871 | /** 872 | * 创建输出源 873 | * 874 | * @param sqlContext Spark SQL 上下文 875 | * @param parameters 通过.option()方法传入的参数 876 | * @param partitionColumns 分区列名? 
877 | * @param outputMode 输出模式 878 | * @return 879 | */ 880 | override def createSink(sqlContext: SQLContext, 881 | parameters: Map[String, String], 882 | partitionColumns: Seq[String], 883 | outputMode: OutputMode): Sink = new CustomDataSink(sqlContext,parameters,outputMode) 884 | ``` 885 | 886 | ## 4.2 创建CustomDataSink类 887 | 888 | ### 4.2.1 继承Sink创建CustomDataSink类 889 | 890 | 要创建自定义的DataSink必须要继承位于org.apache.spark.sql.sources包下的Sink。如下所示: 891 | 892 | ```scala 893 | class CustomDataSink(sqlContext: SQLContext, 894 | parameters: Map[String, String], 895 | outputMode: OutputMode) extends Sink with Logging { 896 | // Override some functions 897 | } 898 | ``` 899 | 900 | ### 4.2.2 重写Sink中的addBatch方法 901 | 902 | 该方法是当发生计算时会被触发,传入的是一个batchId和dataFrame,拿到DataFrame之后,我们有三种写出方式,第一种是使用Spark SQL内置的Sink写出,如 JSON数据源、CSV数据源、Text数据源、Parquet数据源、JDBC数据源等。第二种是通过DataFrame的foreachPartition写出。第三种就是自定义SparkSQL的输出源然后写出。 903 | 904 | ```scala 905 | /** 906 | * 添加Batch,即数据写出 907 | * 908 | * @param batchId batchId 909 | * @param data DataFrame 910 | * 触发机制:当发生计算时,会触发该方法,并且得到要输出的DataFrame 911 | * 实现摘要: 912 | * 1. 数据写入方式: 913 | * (1)通过SparkSQL内置的数据源写出 914 | * 我们拿到DataFrame之后可以通过SparkSQL内置的数据源将数据写出,如: 915 | * JSON数据源、CSV数据源、Text数据源、Parquet数据源、JDBC数据源等。 916 | * (2)通过自定义SparkSQL的数据源进行写出 917 | * (3)通过foreachPartition 将数据写出 918 | */ 919 | override def addBatch(batchId: Long, data: DataFrame): Unit = ??? 920 | ``` 921 | 922 | **注意**: 923 | 924 | 当我们使用第一种方式的时候要注意,此时拿到的DataFrame是一个流式的DataFrame,即isStreaming=ture,通过查看KafkaSink,如下代码所示,先是通过DataFrame.queryExecution执行查询,然后在wite里转成rdd,通过rdd的foreachPartition实现。同样的思路,我们可以利用这个rdd和schema,利用sqlContext.internalCreateDataFrame(rdd, data.schema)重新生成DataFrame,这个在MySQLSink中使用过。 925 | 926 | ```scala 927 | override def addBatch(batchId: Long, data: DataFrame): Unit = { 928 | if (batchId <= latestBatchId) { 929 | logInfo(s"Skipping already committed batch $batchId") 930 | } else { 931 | KafkaWriter.write(sqlContext.sparkSession, 932 | data.queryExecution, executorKafkaParams, topic) 933 | latestBatchId = batchId 934 | } 935 | } 936 | 937 | def write( 938 | sparkSession: SparkSession, 939 | queryExecution: QueryExecution, 940 | kafkaParameters: ju.Map[String, Object], 941 | topic: Option[String] = None): Unit = { 942 | val schema = queryExecution.analyzed.output 943 | validateQuery(schema, kafkaParameters, topic) 944 | queryExecution.toRdd.foreachPartition { iter => 945 | val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) 946 | Utils.tryWithSafeFinally(block = writeTask.execute(iter))( 947 | finallyBlock = writeTask.close()) 948 | } 949 | } 950 | ``` 951 | 952 | 953 | 954 | ## 4.3 自定义DataSink的使用 955 | 956 | 自定义DataSink的使用与自定义DataSource的使用相同,在format里指定一些类Provider的类路径即可。 957 | 958 | ```scala 959 | val query = source.groupBy("creatTime").agg(collect_list("name")).writeStream 960 | .outputMode("update") 961 | .format("org.apache.spark.sql.kafka010.CustomDataSourceProvider") 962 | .option(options) 963 | .start() 964 | query.awaitTermination() 965 | ``` 966 | 967 | ## 4.4 实现MySQL自定义输出源 968 | 969 | ### 4.4.1 修改MySQLSourceProvider.scala 970 | 971 | 上面我们实现MySQL自定义输入源的时候,已经创建了MySQLSourceProvider类,我们需要在这个基础上新增继承StreamSinkProvider,并重写createSink方法,如下所示: 972 | 973 | ```scala 974 | package org.apache.spark.sql.structured.datasource 975 | 976 | import org.apache.spark.internal.Logging 977 | import org.apache.spark.sql.SQLContext 978 | import org.apache.spark.sql.execution.streaming.{Sink, Source} 979 | import org.apache.spark.sql.kafka010.{MySQLSink, MySQLSource} 980 | import 
org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider} 981 | import org.apache.spark.sql.streaming.OutputMode 982 | import org.apache.spark.sql.types.StructType 983 | 984 | /** 985 | * @author : shirukai 986 | * @date : 2019-01-25 09:10 987 | * 自定义MySQL数据源 988 | */ 989 | class MySQLSourceProvider extends DataSourceRegister 990 | with StreamSourceProvider 991 | with StreamSinkProvider 992 | with Logging { 993 | 994 | //……省略自定义输入源的方法 995 | 996 | /** 997 | * 创建输出源 998 | * 999 | * @param sqlContext Spark SQL 上下文 1000 | * @param parameters 通过.option()方法传入的参数 1001 | * @param partitionColumns 分区列名? 1002 | * @param outputMode 输出模式 1003 | * @return 1004 | */ 1005 | override def createSink( 1006 | sqlContext: SQLContext, 1007 | parameters: Map[String, String], 1008 | partitionColumns: Seq[String], outputMode: OutputMode): Sink = new MySQLSink(sqlContext: SQLContext,parameters, outputMode) 1009 | } 1010 | 1011 | ``` 1012 | 1013 | ### 4.4.1 创建MySQLSink.scala 1014 | 1015 | ```scala 1016 | package org.apache.spark.sql.structured.datasource 1017 | 1018 | import org.apache.spark.internal.Logging 1019 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 1020 | import org.apache.spark.sql.execution.streaming.Sink 1021 | import org.apache.spark.sql.streaming.OutputMode 1022 | 1023 | /** 1024 | * @author : shirukai 1025 | * @date : 2019-01-25 17:35 1026 | */ 1027 | class MySQLSink(sqlContext: SQLContext,parameters: Map[String, String], outputMode: OutputMode) extends Sink with Logging { 1028 | override def addBatch(batchId: Long, data: DataFrame): Unit = { 1029 | val query = data.queryExecution 1030 | val rdd = query.toRdd 1031 | val df = sqlContext.internalCreateDataFrame(rdd, data.schema) 1032 | df.show(false) 1033 | df.write.format("jdbc").options(parameters).mode(SaveMode.Append).save() 1034 | } 1035 | } 1036 | ``` 1037 | 1038 | ### 4.2.3 测试MySQLSink 1039 | 1040 | ```scala 1041 | package org.apache.spark.sql.structured.datasource 1042 | 1043 | import org.apache.spark.sql.SparkSession 1044 | import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} 1045 | 1046 | /** 1047 | * @author : shirukai 1048 | * @date : 2019-01-29 09:57 1049 | * 测试自定义MySQLSource 1050 | */ 1051 | object MySQLSourceTest { 1052 | def main(args: Array[String]): Unit = { 1053 | val spark = SparkSession 1054 | .builder() 1055 | .appName(this.getClass.getSimpleName) 1056 | .master("local[2]") 1057 | .getOrCreate() 1058 | val schema = StructType(List( 1059 | StructField("name", StringType), 1060 | StructField("creatTime", TimestampType), 1061 | StructField("modifyTime", TimestampType) 1062 | ) 1063 | ) 1064 | val options = Map[String, String]( 1065 | "driverClass" -> "com.mysql.cj.jdbc.Driver", 1066 | "jdbcUrl" -> "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8", 1067 | "user" -> "root", 1068 | "password" -> "hollysys", 1069 | "tableName" -> "model") 1070 | val source = spark 1071 | .readStream 1072 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider") 1073 | .options(options) 1074 | .schema(schema) 1075 | .load() 1076 | 1077 | import org.apache.spark.sql.functions._ 1078 | val query = source.groupBy("creatTime").agg(collect_list("name").cast(StringType).as("names")).writeStream 1079 | .outputMode("update") 1080 | .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider") 1081 | .option("checkpointLocation", "/tmp/MySQLSourceProvider11") 1082 | .option("user","root") 1083 | 
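      // 这些 option 会原样传入自定义的 MySQLSink,最终作为 df.write.format("jdbc") 的写出参数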
.option("password","hollysys") 1084 | .option("dbtable","test") 1085 | .option("url","jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8") 1086 | .start() 1087 | 1088 | query.awaitTermination() 1089 | } 1090 | } 1091 | ``` 1092 | 1093 | # 3 总结 1094 | 1095 | 通过上面的笔记,参看官网文档,可以学习到Structured支持的几种输入源:File Source、Socket Source、Rate Source、Kafka Source,平时我们会用到KafkaSource以及FileSource,SocketSource、RateSource多用于测试场景。关于输入源没有什么优雅的操作,只能通过重写Source来实现。对于输出源来说,Spark Structured提供的foreach以及foreachBatch已经能适用于大多数场景,没有重写Sink的必要。关于Spark SQL 自定义输入源、Streaming自定义数据源后期会慢慢整理出来。 1096 | 1097 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | spark.demo 8 | spark.structured.datasource 9 | 1.0 10 | 11 | 2.3.0 12 | 13 | 14 | 15 | 16 | org.apache.spark 17 | spark-sql_2.11 18 | ${spark.version} 19 | 20 | 21 | 22 | org.apache.spark 23 | spark-streaming_2.11 24 | ${spark.version} 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-sql-kafka-0-10_2.11 30 | ${spark.version} 31 | 32 | 33 | 34 | 35 | org.apache.httpcomponents 36 | fluent-hc 37 | 4.5.6 38 | 39 | 40 | 41 | 42 | com.alibaba 43 | fastjson 44 | 1.2.47 45 | 46 | 47 | 48 | 49 | com.mchange 50 | c3p0 51 | 0.9.5.2 52 | 53 | 54 | 55 | 56 | mysql 57 | mysql-connector-java 58 | 8.0.12 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-shade-plugin 69 | 2.4.3 70 | 71 | 72 | package 73 | 74 | shade 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | org.apache.maven.plugins 96 | maven-compiler-plugin 97 | 98 | 1.8 99 | 1.8 100 | 101 | 102 | 103 | 104 | 105 | org.scala-tools 106 | maven-scala-plugin 107 | 2.15.2 108 | 109 | 110 | scala-compile-first 111 | 112 | compile 113 | 114 | 115 | 116 | **/*.scala 117 | 118 | 119 | 120 | 121 | scala-test-compile 122 | 123 | testCompile 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=ERROR,console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. 
When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 28 | log4j.logger.org.apache.spark.repl.Main=WARN 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.spark_project.jetty=WARN 32 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | 38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 41 | 42 | 43 | log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender 44 | log4j.appender.flume.Hostname = localhost 45 | log4j.appender.flume.Port = 9999 46 | log4j.appender.flume.UnsafeMode = true 47 | 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/C3p0Utils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource 2 | 3 | import java.util.Properties 4 | 5 | import com.mchange.v2.c3p0.ComboPooledDataSource 6 | 7 | /** 8 | * @author : shirukai 9 | * @date : 2019-01-25 11:24 10 | */ 11 | object C3p0Utils { 12 | def getDataSource(dbOptions: Map[String, String]): ComboPooledDataSource 13 | = { 14 | val properties = new Properties() 15 | dbOptions.foreach(x => properties.setProperty(x._1, x._2)) 16 | val dataSource = new ComboPooledDataSource() 17 | dataSource.setDriverClass(dbOptions("driverClass")) 18 | dataSource.setJdbcUrl(dbOptions("jdbcUrl")) 19 | dataSource.setProperties(properties) 20 | dataSource 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/MySQLSink.scala: -------------------------------------------------------------------------------- 1 | 2 | package org.apache.spark.sql.structured.datasource 3 | 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.sql.execution.streaming.Sink 6 | import org.apache.spark.sql.streaming.OutputMode 7 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 8 | 9 | /** 10 | * @author : shirukai 11 | * @date : 2019-01-25 17:35 12 | */ 13 | class MySQLSink(sqlContext: SQLContext,parameters: Map[String, String], outputMode: OutputMode) extends Sink with Logging { 14 | override def addBatch(batchId: Long, data: DataFrame): Unit = { 15 | val query = data.queryExecution 16 | val rdd = query.toRdd 17 | val df = sqlContext.internalCreateDataFrame(rdd, data.schema) 18 | df.show(false) 19 | df.write.format("jdbc").options(parameters).mode(SaveMode.Append).save() 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/MySQLSource.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource 2 | 3 | import java.sql.Connection 4 | 5 | import org.apache.spark.executor.InputMetrics 6 | import 
org.apache.spark.internal.Logging 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils 9 | import org.apache.spark.sql.execution.streaming.{Offset, Source} 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.sql.{DataFrame, SQLContext} 12 | import org.json4s.jackson.Serialization 13 | import org.json4s.{Formats, NoTypeHints} 14 | 15 | 16 | /** 17 | * @author : shirukai 18 | * @date : 2019-01-25 09:41 19 | */ 20 | class MySQLSource(sqlContext: SQLContext, 21 | options: Map[String, String], 22 | schemaOption: Option[StructType]) extends Source with Logging { 23 | 24 | lazy val conn: Connection = C3p0Utils.getDataSource(options).getConnection 25 | 26 | val tableName: String = options("tableName") 27 | 28 | var currentOffset: Map[String, Long] = Map[String, Long](tableName -> 0) 29 | 30 | val maxOffsetPerBatch: Option[Long] = Option(100) 31 | 32 | val inputMetrics = new InputMetrics() 33 | 34 | override def schema: StructType = schemaOption.get 35 | 36 | /** 37 | * 获取Offset 38 | * 这里监控MySQL数据库表中条数变化情况 39 | * @return Option[Offset] 40 | */ 41 | override def getOffset: Option[Offset] = { 42 | val latest = getLatestOffset 43 | val offsets = maxOffsetPerBatch match { 44 | case None => MySQLSourceOffset(latest) 45 | case Some(limit) => 46 | MySQLSourceOffset(rateLimit(limit, currentOffset, latest)) 47 | } 48 | Option(offsets) 49 | } 50 | 51 | /** 52 | * 获取数据 53 | * @param start 上一次的offset 54 | * @param end 最新的offset 55 | * @return df 56 | */ 57 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = { 58 | 59 | var offset: Long = 0 60 | if (start.isDefined) { 61 | offset = offset2Map(start.get)(tableName) 62 | } 63 | val limit = offset2Map(end)(tableName) - offset 64 | val sql = s"SELECT * FROM $tableName limit $limit offset $offset" 65 | 66 | val st = conn.prepareStatement(sql) 67 | val rs = st.executeQuery() 68 | val rows: Iterator[InternalRow] = JdbcUtils.resultSetToSparkInternalRows(rs, schemaOption.get, inputMetrics) //todo 好用 69 | val rdd = sqlContext.sparkContext.parallelize(rows.toSeq) 70 | 71 | currentOffset = offset2Map(end) 72 | 73 | sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) 74 | } 75 | 76 | override def stop(): Unit = { 77 | conn.close() 78 | } 79 | 80 | def rateLimit(limit: Long, currentOffset: Map[String, Long], latestOffset: Map[String, Long]): Map[String, Long] = { 81 | val co = currentOffset(tableName) 82 | val lo = latestOffset(tableName) 83 | if (co + limit > lo) { 84 | Map[String, Long](tableName -> lo) 85 | } else { 86 | Map[String, Long](tableName -> (co + limit)) 87 | } 88 | } 89 | 90 | // 获取最新条数 91 | def getLatestOffset: Map[String, Long] = { 92 | var offset: Long = 0 93 | val sql = s"SELECT COUNT(1) FROM $tableName" 94 | val st = conn.prepareStatement(sql) 95 | val rs = st.executeQuery() 96 | while (rs.next()) { 97 | offset = rs.getLong(1) 98 | } 99 | Map[String, Long](tableName -> offset) 100 | } 101 | 102 | def offset2Map(offset: Offset): Map[String, Long] = { 103 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints) 104 | Serialization.read[Map[String, Long]](offset.json()) 105 | } 106 | } 107 | 108 | case class MySQLSourceOffset(offset: Map[String, Long]) extends Offset { 109 | implicit val formats: AnyRef with Formats = Serialization.formats(NoTypeHints) 110 | 111 | override def json(): String = Serialization.write(offset) 112 | } 113 | 
-------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/MySQLSourceProvider.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.sql.execution.streaming.{Sink, Source} 6 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider} 7 | import org.apache.spark.sql.streaming.OutputMode 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * @author : shirukai 12 | * @date : 2019-01-25 09:10 13 | * 自定义MySQL数据源 14 | */ 15 | class MySQLSourceProvider extends DataSourceRegister 16 | with StreamSourceProvider 17 | with StreamSinkProvider 18 | with Logging { 19 | /** 20 | * 数据源的描述名字,如:kafka、socket 21 | * 22 | * @return 字符串shotName 23 | */ 24 | override def shortName(): String = "mysql" 25 | 26 | 27 | /** 28 | * 定义数据源的Schema 29 | * 30 | * @param sqlContext Spark SQL 上下文 31 | * @param schema 通过.schema()方法传入的schema 32 | * @param providerName Provider的名称,包名+类名 33 | * @param parameters 通过.option()方法传入的参数 34 | * @return 元组,(shotName,schema) 35 | */ 36 | override def sourceSchema( 37 | sqlContext: SQLContext, 38 | schema: Option[StructType], 39 | providerName: String, 40 | parameters: Map[String, String]): (String, StructType) = { 41 | (providerName, schema.get) 42 | } 43 | 44 | /** 45 | * 创建输入源 46 | * 47 | * @param sqlContext Spark SQL 上下文 48 | * @param metadataPath 元数据Path 49 | * @param schema 通过.schema()方法传入的schema 50 | * @param providerName Provider的名称,包名+类名 51 | * @param parameters 通过.option()方法传入的参数 52 | * @return 自定义source,需要继承Source接口实现 53 | */ 54 | override def createSource( 55 | sqlContext: SQLContext, 56 | metadataPath: String, schema: Option[StructType], 57 | providerName: String, parameters: Map[String, String]): Source = new MySQLSource(sqlContext, parameters, schema) 58 | 59 | /** 60 | * 创建输出源 61 | * 62 | * @param sqlContext Spark SQL 上下文 63 | * @param parameters 通过.option()方法传入的参数 64 | * @param partitionColumns 分区列名? 65 | * @param outputMode 输出模式 66 | * @return 67 | */ 68 | override def createSink( 69 | sqlContext: SQLContext, 70 | parameters: Map[String, String], 71 | partitionColumns: Seq[String], outputMode: OutputMode): Sink = new MySQLSink(sqlContext: SQLContext,parameters, outputMode) 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/custom/CustomDataSink.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource.custom 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.execution.streaming.Sink 5 | import org.apache.spark.sql.streaming.OutputMode 6 | import org.apache.spark.sql.{DataFrame, SQLContext} 7 | 8 | /** 9 | * @author : shirukai 10 | * @date : 2019-01-25 18:03 11 | * 自定义数据输出源 12 | */ 13 | class CustomDataSink(sqlContext: SQLContext, 14 | parameters: Map[String, String], 15 | outputMode: OutputMode) extends Sink with Logging { 16 | 17 | /** 18 | * 添加Batch,即数据写出 19 | * 20 | * @param batchId batchId 21 | * @param data DataFrame 22 | * 触发机制:当发生计算时,会触发该方法,并且得到要输出的DataFrame 23 | * 实现摘要: 24 | * 1. 
数据写入方式: 25 | * (1)通过SparkSQL内置的数据源写出 26 | * 我们拿到DataFrame之后可以通过SparkSQL内置的数据源将数据写出,如: 27 | * JSON数据源、CSV数据源、Text数据源、Parquet数据源、JDBC数据源等。 28 | * (2)通过自定义SparkSQL的数据源进行写出 29 | * (3)通过foreachPartition 将数据写出 30 | */ 31 | override def addBatch(batchId: Long, data: DataFrame): Unit = ??? 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/custom/CustomDataSource.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource.custom 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.execution.streaming.{Offset, Source} 5 | import org.apache.spark.sql.types.StructType 6 | import org.apache.spark.sql.{DataFrame, SQLContext} 7 | 8 | /** 9 | * @author : shirukai 10 | * @date : 2019-01-25 18:03 11 | * 自定义数据输入源:需要继承Source接口 12 | * 实现思路: 13 | * (1)通过重写schema方法来指定数据输入源的schema,这个schema需要与Provider中指定的schema保持一致 14 | * (2)通过重写getOffset方法来获取数据的偏移量,这个方法会一直被轮询调用,不断的获取偏移量 15 | * (3) 通过重写getBatch方法,来获取数据,这个方法是在偏移量发生改变后被触发 16 | * (4)通过stop方法,来进行一下关闭资源的操作 17 | * 18 | */ 19 | class CustomDataSource(sqlContext: SQLContext, 20 | parameters: Map[String, String], 21 | schemaOption: Option[StructType]) extends Source 22 | with Logging { 23 | 24 | /** 25 | * 指定数据源的schema,需要与Provider中sourceSchema中指定的schema保持一直,否则会报异常 26 | * 触发机制:当创建数据源的时候被触发执行 27 | * 28 | * @return schema 29 | */ 30 | override def schema: StructType = schemaOption.get 31 | 32 | /** 33 | * 获取offset,用来监控数据的变化情况 34 | * 触发机制:不断轮询调用 35 | * 实现要点: 36 | * (1)Offset的实现: 37 | * 由函数返回值可以看出,我们需要提供一个标准的返回值Option[Offset] 38 | * 我们可以通过继承 org.apache.spark.sql.sources.v2.reader.streaming.Offset实现,这里面其实就是保存了个json字符串 39 | * 40 | * (2) JSON转化 41 | * 因为Offset里实现的是一个json字符串,所以我们需要将我们存放offset的集合或者case class转化重json字符串 42 | * spark里是通过org.json4s.jackson这个包来实现case class 集合类(Map、List、Seq、Set等)与json字符串的相互转化 43 | * 44 | * @return Offset 45 | */ 46 | override def getOffset: Option[Offset] = ??? 47 | 48 | /** 49 | * 获取数据 50 | * 51 | * @param start 上一个批次的end offset 52 | * @param end 通过getOffset获取的新的offset 53 | * 触发机制:当不断轮询的getOffset方法,获取的offset发生改变时,会触发该方法 54 | * 55 | * 实现要点: 56 | * (1)DataFrame的创建: 57 | * 可以通过生成RDD,然后使用RDD创建DataFrame 58 | * RDD创建:sqlContext.sparkContext.parallelize(rows.toSeq) 59 | * DataFrame创建:sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) 60 | * @return DataFrame 61 | */ 62 | override def getBatch(start: Option[Offset], end: Offset): DataFrame = ??? 63 | 64 | /** 65 | * 关闭资源 66 | * 将一些需要关闭的资源放到这里来关闭,如MySQL的数据库连接等 67 | */ 68 | override def stop(): Unit = ??? 
69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/custom/CustomDataSourceProvider.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource.custom 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.sql.execution.streaming.{Sink, Source} 6 | import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider, StreamSourceProvider} 7 | import org.apache.spark.sql.streaming.OutputMode 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * @author : shirukai 12 | * @date : 2019-01-25 17:49 13 | * 自定义Structured Streaming数据源 14 | * 15 | * (1)继承DataSourceRegister类 16 | * 需要重写shortName方法,用来向Spark注册该组件 17 | * 18 | * (2)继承StreamSourceProvider类 19 | * 需要重写createSource以及sourceSchema方法,用来创建数据输入源 20 | * 21 | * (3)继承StreamSinkProvider类 22 | * 需要重写createSink方法,用来创建数据输出源 23 | * 24 | * 25 | */ 26 | class CustomDataSourceProvider extends DataSourceRegister 27 | with StreamSourceProvider 28 | with StreamSinkProvider 29 | with Logging { 30 | 31 | 32 | /** 33 | * 数据源的描述名字,如:kafka、socket 34 | * 35 | * @return 字符串shotName 36 | */ 37 | override def shortName(): String = "custom" 38 | 39 | 40 | /** 41 | * 定义数据源的Schema 42 | * 43 | * @param sqlContext Spark SQL 上下文 44 | * @param schema 通过.schema()方法传入的schema 45 | * @param providerName Provider的名称,包名+类名 46 | * @param parameters 通过.option()方法传入的参数 47 | * @return 元组,(shotName,schema) 48 | */ 49 | override def sourceSchema(sqlContext: SQLContext, 50 | schema: Option[StructType], 51 | providerName: String, 52 | parameters: Map[String, String]): (String, StructType) = (shortName(),schema.get) 53 | 54 | /** 55 | * 创建输入源 56 | * 57 | * @param sqlContext Spark SQL 上下文 58 | * @param metadataPath 元数据Path 59 | * @param schema 通过.schema()方法传入的schema 60 | * @param providerName Provider的名称,包名+类名 61 | * @param parameters 通过.option()方法传入的参数 62 | * @return 自定义source,需要继承Source接口实现 63 | **/ 64 | 65 | override def createSource(sqlContext: SQLContext, 66 | metadataPath: String, 67 | schema: Option[StructType], 68 | providerName: String, 69 | parameters: Map[String, String]): Source = new CustomDataSource(sqlContext,parameters,schema) 70 | 71 | 72 | /** 73 | * 创建输出源 74 | * 75 | * @param sqlContext Spark SQL 上下文 76 | * @param parameters 通过.option()方法传入的参数 77 | * @param partitionColumns 分区列名? 
78 | * @param outputMode 输出模式 79 | * @return 80 | */ 81 | override def createSink(sqlContext: SQLContext, 82 | parameters: Map[String, String], 83 | partitionColumns: Seq[String], 84 | outputMode: OutputMode): Sink = new CustomDataSink(sqlContext,parameters,outputMode) 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/example/ConsoleSinkExample.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource.example 2 | 3 | import java.util.UUID 4 | 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** 8 | * @author : shirukai 9 | * @date : 2019-01-26 09:58 10 | * Spark Structured 内置ConsoleSink用例 11 | */ 12 | object ConsoleSinkExample { 13 | def main(args: Array[String]): Unit = { 14 | 15 | val spark = SparkSession 16 | .builder() 17 | .appName(this.getClass.getSimpleName) 18 | .master("local[2]") 19 | .getOrCreate() 20 | 21 | val source = spark.readStream 22 | .format("rate") 23 | // 每秒生成的行数,默认值为1 24 | .option("rowsPerSecond", 10) 25 | .option("numPartitions", 10) 26 | .load() 27 | 28 | val consoleSink = source.writeStream 29 | .format("console") 30 | // 是否压缩显示 31 | .option("truncate", value = false) 32 | // 显示条数 33 | .option("numRows", 30) 34 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString) 35 | .start() 36 | 37 | consoleSink.awaitTermination() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/example/FileSinkExample.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource.example 2 | 3 | import java.util.UUID 4 | 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** 8 | * @author : shirukai 9 | * @date : 2019-01-26 09:58 10 | * Spark Structured 内置Sink用例 11 | */ 12 | object FileSinkExample { 13 | def main(args: Array[String]): Unit = { 14 | 15 | val spark = SparkSession 16 | .builder() 17 | .appName(this.getClass.getSimpleName) 18 | .master("local[2]") 19 | .getOrCreate() 20 | 21 | val source = spark.readStream 22 | .format("rate") 23 | // 每秒生成的行数,默认值为1 24 | .option("rowsPerSecond", 10) 25 | .option("numPartitions", 10) 26 | .load() 27 | 28 | val fileSink = source.writeStream 29 | .format("parquet") 30 | //.format("csv") 31 | //.format("orc") 32 | // .format("json") 33 | .option("path", "data/sink") 34 | .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString) 35 | .start() 36 | 37 | fileSink.awaitTermination() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/structured/datasource/example/FileSourceExample.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.structured.datasource.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types._ 5 | 6 | /** 7 | * @author : shirukai 8 | * @date : 2019-01-25 19:18 9 | * 文件数据源测试 10 | */ 11 | object FileSourceExample { 12 | def main(args: Array[String]): Unit = { 13 | val spark = SparkSession 14 | .builder() 15 | .appName(this.getClass.getSimpleName) 16 | .master("local[2]") 17 | .getOrCreate() 18 | 19 | val source = spark 20 | .readStream 21 | // Schema must be specified when creating a streaming source DataFrame. 
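      // 文件类流式数据源默认不会自动推断 schema(除非开启 spark.sql.streaming.schemaInference),因此这里必须显式指定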
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/FileSourceExample.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource.example

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

/**
  * @author : shirukai
  * @date : 2019-01-25 19:18
  * Example of the built-in file source
  */
object FileSourceExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val source = spark
      .readStream
      // Schema must be specified when creating a streaming source DataFrame.
      .schema(StructType(List(
        StructField("name", StringType),
        StructField("value", IntegerType)
      )))
      // Maximum number of new files per trigger
      .option("maxFilesPerTrigger", 100)
      // Whether to process the newest files first, default false
      .option("latestFirst", value = true)
      // Whether to check only the file name: files with an already-seen name are not treated as new, default false
      .option("fileNameOnly", value = true)
      .csv("*.csv")

    val query = source.writeStream
      .outputMode("update")
      .format("console")
      //.option("checkpointLocation", checkpointLocation)
      .option("truncate", value = false)
      .start()

    query.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/ForeachSinkExample.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource.example

import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}

/**
  * @author : shirukai
  * @date : 2019-01-26 09:58
  * Example of the built-in ForeachSink in Spark Structured Streaming
  */
object ForeachSinkExample {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val source = spark.readStream
      .format("rate")
      // Rows generated per second, default 1
      .option("rowsPerSecond", 10)
      .option("numPartitions", 10)
      .load()

    val foreachSink = source.writeStream
      .foreach(new ForeachWriter[Row] {
        override def open(partitionId: Long, version: Long): Boolean = {
          println(s"partitionId=$partitionId,version=$version")
          true
        }

        override def process(value: Row): Unit = {
          println(value)
        }

        override def close(errorOrNull: Throwable): Unit = {
          println("close")
        }
      })
      .start()

    foreachSink.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/KafkaSinkExample.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource.example

import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StringType

/**
  * @author : shirukai
  * @date : 2019-01-26 09:58
  * Example of the built-in KafkaSink in Spark Structured Streaming
  */
object KafkaSinkExample {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val source = spark.readStream
      .format("rate")
      // Rows generated per second, default 1
      .option("rowsPerSecond", 10)
      .option("numPartitions", 10)
      .load()

    import org.apache.spark.sql.functions._
    import spark.implicits._
    // The Kafka sink looks up columns named "key" and "value", so cast first and alias last.
    val kafkaSink = source.select(
      array(to_json(struct("*"))).cast(StringType).as("value"),
      $"timestamp".cast(StringType).as("key")).writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
      .option("topic", "hiacloud-ts-dev")
      .start()

    kafkaSink.awaitTermination()
  }
}
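The Kafka sink resolves its key and value columns strictly by name, which is why the casts above are applied before the aliases. An equivalent and often more readable formulation uses selectExpr. The sketch below reuses the `source` DataFrame from the example above and the same broker and topic; it is a sketch, not part of the repository code.

```scala
val kafkaSink = source
  .selectExpr("CAST(timestamp AS STRING) AS key", "to_json(struct(*)) AS value")
  .writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("topic", "hiacloud-ts-dev")
  .option("checkpointLocation", "/tmp/temporary-" + java.util.UUID.randomUUID.toString)
  .start()
```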
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/KafkaSourceExample.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource.example

import org.apache.spark.sql.SparkSession

/**
  * @author : shirukai
  * @date : 2019-01-26 09:46
  * Example of the built-in Kafka source
  */
object KafkaSourceExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val source = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "hiacloud-ts-dev")
      //.option("startingOffsets", "earliest")
      .option("failOnDataLoss", "true")
      .load()

    val query = source.writeStream
      .outputMode("update")
      .format("console")
      //.option("checkpointLocation", checkpointLocation)
      .option("truncate", value = false)
      .start()

    query.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/MemorySinkExample.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource.example

import java.util.UUID

import org.apache.spark.sql.SparkSession

/**
  * @author : shirukai
  * @date : 2019-01-26 09:58
  * Example of the built-in MemorySink in Spark Structured Streaming
  */
object MemorySinkExample {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val source = spark.readStream
      .format("rate")
      // Rows generated per second, default 1
      .option("rowsPerSecond", 10)
      .option("numPartitions", 10)
      .load()

    val memorySink = source.writeStream
      .format("memory")
      .queryName("memorySinkTable")
      .option("checkpointLocation", "/tmp/temporary-" + UUID.randomUUID.toString)
      .start()

    // Periodically query the in-memory table registered by queryName().
    new Thread(new Runnable {
      override def run(): Unit = {
        while (true) {
          spark.sql("select * from memorySinkTable").show(false)
          Thread.sleep(1000)
        }
      }
    }).start()

    memorySink.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/RateSourceExample.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource.example

import org.apache.spark.sql.SparkSession

/**
  * @author : shirukai
  * @date : 2019-01-25 20:04
  * Example of the built-in rate source
  */
object RateSourceExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val rate = spark.readStream
      .format("rate")
      // Rows generated per second, default 1
      .option("rowsPerSecond", 10)
      .option("numPartitions", 10)
      .load()

    val query = rate.writeStream
      .outputMode("update")
      .format("console")
      .option("truncate", value = false)
      .start()

    query.awaitTermination()
  }
}
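The examples above all run with the default trigger, which starts a new micro-batch as soon as the previous one finishes. When the rate source is used for benchmarking, it can be useful to pin the batch interval explicitly. A minimal sketch, reusing the `rate` DataFrame from the example above and assuming a 5-second processing-time trigger is acceptable for the test:

```scala
import org.apache.spark.sql.streaming.Trigger

val query = rate.writeStream
  .trigger(Trigger.ProcessingTime("5 seconds"))
  .outputMode("update")
  .format("console")
  .option("truncate", value = false)
  .start()
```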
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/structured/datasource/example/SocketSourceExample.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource.example

import org.apache.spark.sql.SparkSession

/**
  * @author : shirukai
  * @date : 2019-01-25 19:57
  * Example of the built-in socket source; feed it with `nc -lk 9090`
  */
object SocketSourceExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9090)
      .load()

    val query = lines.writeStream
      .outputMode("update")
      .format("console")
      .option("truncate", value = false)
      .start()

    query.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/src/test/java/org/apache/spark/sql/structured/datasource/MySQLSourceTest.scala:
--------------------------------------------------------------------------------
package org.apache.spark.sql.structured.datasource

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}

/**
  * @author : shirukai
  * @date : 2019-01-29 09:57
  * Test for the custom MySQLSource
  */
object MySQLSourceTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[2]")
      .getOrCreate()

    val schema = StructType(List(
      StructField("name", StringType),
      StructField("creatTime", TimestampType),
      StructField("modifyTime", TimestampType)
    ))

    val options = Map[String, String](
      "driverClass" -> "com.mysql.cj.jdbc.Driver",
      "jdbcUrl" -> "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8",
      "user" -> "root",
      "password" -> "hollysys",
      "tableName" -> "model")

    val source = spark
      .readStream
      .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider")
      .options(options)
      .schema(schema)
      .load()

    import org.apache.spark.sql.functions._
    val query = source.groupBy("creatTime").agg(collect_list("name").cast(StringType).as("names")).writeStream
      .outputMode("update")
      .format("org.apache.spark.sql.structured.datasource.MySQLSourceProvider")
      .option("checkpointLocation", "/tmp/MySQLSourceProvider11")
      .option("user", "root")
      .option("password", "hollysys")
      .option("dbtable", "test")
      .option("url", "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8")
      .start()

    query.awaitTermination()
  }
}
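Before debugging the custom streaming source itself, it can be worth confirming the MySQL connection settings with Spark's built-in batch JDBC reader. The sketch below reuses the `spark` session and the same connection details as the test above, and assumes the MySQL JDBC driver is on the classpath; it is only a sanity check, not part of the streaming pipeline.

```scala
val model = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/spark-source?useSSL=false&characterEncoding=utf-8")
  .option("driver", "com.mysql.cj.jdbc.Driver")
  .option("dbtable", "model")
  .option("user", "root")
  .option("password", "hollysys")
  .load()

model.printSchema()
model.show(false)
```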
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=ERROR,console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR


log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender
log4j.appender.flume.Hostname = localhost
log4j.appender.flume.Port = 9999
log4j.appender.flume.UnsafeMode = true

--------------------------------------------------------------------------------