├── README.md ├── flink-coding ├── pom.xml └── src │ └── main │ ├── resources │ └── hive-site.xml │ └── scala │ └── com │ └── anryg │ ├── FlinkDSFromKafka2HDFS.scala │ ├── FlinkTest04.scala │ ├── hive_cdc │ ├── FlinkReadKafka2Hive.scala │ └── FlinkWithHive.scala │ └── window_and_watermark │ ├── FlinkDSFromKafkaWithWatermark.scala │ ├── FlinkSQLFromKafkaWithWatermarkAndWindow.scala │ └── FlinkTBFromKafkaWithWatermark.scala ├── pom.xml ├── redis ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── anryg │ └── bigdata │ ├── IPUtils.java │ ├── IpSearch.java │ ├── RedisClientUtils.java │ └── RedisParam.java └── spark-coding ├── pom.xml ├── spark-coding.iml └── src └── main ├── java └── com │ └── anryg │ └── bigdata │ └── clickhouse │ └── CKSink.java └── scala └── com └── anryg └── bigdata ├── hive ├── ConnectHive.scala └── Spark3ConnectHive3.scala ├── streaming ├── Kafka2CK.scala ├── StreamingProcessHelper.scala ├── StructuredStreamingTest.scala ├── demo │ ├── StructuredStreaming4Kafka2CSV.scala │ ├── StructuredStreamingFromKafka.scala │ ├── StructuredStreamingFromKafka2ES.scala │ ├── StructuredStreamingFromKafka2Hive.scala │ ├── StructuredStreamingReadHive.scala │ └── window_watermark │ │ └── WorldCountWithWatermark.scala ├── dwd │ └── StreamingFromOds2Dwd.scala └── ods │ └── StreamingSource2HiveOds.scala └── test ├── data_skew ├── DataSkew01.scala ├── DataSkew02.scala └── MyPartitioner.scala └── map_pk_mappartition ├── MapPartitionTest.scala └── MapTest.scala
/README.md: -------------------------------------------------------------------------------- 1 | # internet_behavior_project 2 | A big-data project: analyzing users' Internet access behavior 3 | 4 | 5 | Understanding the data source 6 | The data looks like this: it has 9 well-structured fields (already cleaned for you). To make it easy to read, I exported it as a CSV file whose first row is the schema. 7 | 8 | For easy access I uploaded it to a cloud drive. The original file is 12 GB, and even after compression it is still 3 GB. To make sure people really use this data for learning rather than for anything else, you need to add me on WeChat to get the download link. 9 | 10 | Now let me walk you through the data. There are 9 fields in total, and their meanings are as follows: 11 | client_ip: the IP address of the user going online; from this IP you can get the user's approximate location, and there are dedicated APIs for such lookups; 12 | domain: the website the user wants to visit; the nature of the site can be used to judge the user's browsing behavior; 13 | time: the time at which the user went online; 14 | target_ip: the target IP address of the website being visited; 15 | rcode: the status code returned by the site, where 0 is a normal response and 2 is abnormal; 16 | query_type: the query type, almost always 1, i.e. normal browsing behavior; 17 | authority_recode: the domain actually returned by the web server; it may differ from domain, and if it does, that may indicate something like a phishing site, which is worth analyzing; 18 | add_msg: additional information, almost always empty; if it does contain something, take a look at what it actually is; 19 | dns_ip: the DNS server that resolved the site being visited; a DNS server usually serves one region, so requests resolved by the same DNS server suggest the users are in the same broad area; 20 | 21 | That is the field-by-field interpretation of the data; from these explanations you should already have a rough idea of what this data can be used for. 22 | --------------------------------------------------------------------------------
/flink-coding/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | internet_behavior_project 7 | com.anryg.bigdata 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | flink-coding 13 | 14 | flink-coding 15 | 16 | http://www.example.com 17 | 18 | 19 | UTF-8 20 | 1.8 21 | 1.8 22 | 23 | 1.15.2 24 | 3.1.0 25 | 26 | 27 | 28 | 29 | 30 | org.apache.flink 31 | flink-streaming-scala_2.12 32 | ${flink.version} 33 | 34 | 35 | commons-math3 36 | org.apache.commons 37 | 38 | 39 | 40 | 41 | org.apache.flink 42 | flink-clients 43 | ${flink.version} 44 | 45 | 46 | 47 | org.apache.flink 48 | flink-connector-kafka 49 | ${flink.version} 50 | 51 | 52 | 53 | org.apache.flink 54 | flink-connector-hive_2.12 55 | ${flink.version} 56 | 57 | 58 | org.apache.hive 59 | hive-exec 60 | ${hadoop.version} 61 | 62 | 63 | calcite-core 64 | org.apache.calcite 65 | 66 | 67 | calcite-linq4j 68 | org.apache.calcite 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.apache.flink 76 | flink-table-api-scala-bridge_2.12 77 | ${flink.version} 78 | 79 | 80 | org.apache.flink 81 | flink-table-planner_2.12 82 
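<!-- flink-table-planner is marked "provided": the planner jar is expected to come from the Flink distribution's lib/ directory at runtime, so bundling a second copy inside the job's fat jar can cause class-loading conflicts. -->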
| ${flink.version} 83 | provided 84 | 85 | 86 | 87 | 88 | org.apache.flink 89 | flink-connector-elasticsearch7 90 | ${flink.version} 91 | 92 | 93 | 94 | org.apache.hadoop 95 | hadoop-common 96 | ${hadoop.version} 97 | 98 | 99 | commons-compress 100 | org.apache.commons 101 | 102 | 103 | 104 | 105 | org.apache.hadoop 106 | hadoop-client 107 | ${hadoop.version} 108 | 109 | 110 | commons-compress 111 | org.apache.commons 112 | 113 | 114 | 115 | 116 | org.apache.hadoop 117 | hadoop-hdfs 118 | ${hadoop.version} 119 | 120 | 121 | 122 | org.apache.flink 123 | flink-csv 124 | ${flink.version} 125 | 126 | 127 | org.apache.flink 128 | flink-hadoop-compatibility_2.12 129 | ${flink.version} 130 | 131 | 132 | 133 | com.alibaba 134 | fastjson 135 | 1.2.71 136 | 137 | 138 | junit 139 | junit 140 | 4.11 141 | test 142 | 143 | 144 | 145 | 146 | src/main/scala 147 | src/main/test 148 | 149 | 150 | 151 | 152 | org.apache.maven.plugins 153 | maven-shade-plugin 154 | 3.2.0 155 | 156 | true 157 | with-dependencies 158 | 159 | 160 | *:* 161 | 162 | 163 | junit:junit 164 | 165 | 166 | 167 | 168 | 169 | *:* 170 | 171 | META-INF/*.SF 172 | META-INF/*.DSA 173 | META-INF/*.RSA 174 | 175 | 176 | 177 | false 178 | 179 | 184 | 185 | 186 | 187 | package 188 | 189 | shade 190 | 191 | 192 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | org.codehaus.mojo 218 | build-helper-maven-plugin 219 | 3.0.0 220 | 221 | 222 | add-source 223 | generate-sources 224 | 225 | add-source 226 | 227 | 228 | 229 | src/main/java 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | net.alchim31.maven 238 | scala-maven-plugin 239 | 3.2.1 240 | 241 | 242 | 243 | compile 244 | testCompile 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | -------------------------------------------------------------------------------- /flink-coding/src/main/resources/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ambari.hive.db.schema.name 5 | hive 6 | 7 | 8 | 9 | atlas.hook.hive.maxThreads 10 | 1 11 | 12 | 13 | 14 | atlas.hook.hive.minThreads 15 | 1 16 | 17 | 18 | 19 | credentialStoreClassPath 20 | /var/lib/ambari-agent/cred/lib/* 21 | 22 | 23 | 24 | datanucleus.autoCreateSchema 25 | false 26 | 27 | 28 | 29 | datanucleus.cache.level2.type 30 | none 31 | 32 | 33 | 34 | datanucleus.fixedDatastore 35 | true 36 | 37 | 38 | 39 | hadoop.security.credential.provider.path 40 | jceks://file/usr/hdp/current/hive-server2/conf/hive-site.jceks 41 | 42 | 43 | 44 | hive.auto.convert.join 45 | true 46 | 47 | 48 | 49 | hive.auto.convert.join.noconditionaltask 50 | true 51 | 52 | 53 | 54 | hive.auto.convert.join.noconditionaltask.size 55 | 2147483648 56 | 57 | 58 | 59 | hive.auto.convert.sortmerge.join 60 | true 61 | 62 | 63 | 64 | hive.auto.convert.sortmerge.join.to.mapjoin 65 | true 66 | 67 | 68 | 69 | hive.cbo.enable 70 | true 71 | 72 | 73 | 74 | hive.cli.print.header 75 | false 76 | 77 | 78 | 79 | hive.cluster.delegation.token.store.class 80 | org.apache.hadoop.hive.thrift.ZooKeeperTokenStore 81 | 82 | 83 | 84 | hive.cluster.delegation.token.store.zookeeper.connectString 85 | hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181 86 | 87 | 88 | 89 | hive.cluster.delegation.token.store.zookeeper.znode 90 | /hive/cluster/delegation 91 | 92 | 93 | 94 | hive.compactor.abortedtxn.threshold 95 | 1000 96 | 97 | 98 | 99 | hive.compactor.check.interval 100 | 300 101 | 102 | 103 | 104 | hive.compactor.delta.num.threshold 105 | 10 106 | 107 | 108 | 109 | hive.compactor.delta.pct.threshold 
110 | 0.1f 111 | 112 | 113 | 114 | hive.compactor.initiator.on 115 | true 116 | 117 | 118 | 119 | hive.compactor.worker.threads 120 | 7 121 | 122 | 123 | 124 | hive.compactor.worker.timeout 125 | 86400 126 | 127 | 128 | 129 | hive.compute.query.using.stats 130 | true 131 | 132 | 133 | 134 | hive.convert.join.bucket.mapjoin.tez 135 | false 136 | 137 | 138 | 139 | hive.create.as.insert.only 140 | true 141 | 142 | 143 | 144 | hive.default.fileformat 145 | TextFile 146 | 147 | 148 | 149 | hive.default.fileformat.managed 150 | ORC 151 | 152 | 153 | 154 | hive.driver.parallel.compilation 155 | true 156 | 157 | 158 | 159 | hive.enforce.sortmergebucketmapjoin 160 | true 161 | 162 | 163 | 164 | hive.exec.compress.intermediate 165 | false 166 | 167 | 168 | 169 | hive.exec.compress.output 170 | false 171 | 172 | 173 | 174 | hive.exec.dynamic.partition 175 | true 176 | 177 | 178 | 179 | hive.exec.dynamic.partition.mode 180 | nonstrict 181 | 182 | 183 | 184 | hive.exec.failure.hooks 185 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook 186 | 187 | 188 | 189 | hive.exec.max.created.files 190 | 100000 191 | 192 | 193 | 194 | hive.exec.max.dynamic.partitions 195 | 5000 196 | 197 | 198 | 199 | hive.exec.max.dynamic.partitions.pernode 200 | 2000 201 | 202 | 203 | 204 | hive.exec.orc.split.strategy 205 | HYBRID 206 | 207 | 208 | 209 | hive.exec.parallel 210 | false 211 | 212 | 213 | 214 | hive.exec.parallel.thread.number 215 | 8 216 | 217 | 218 | 219 | hive.exec.post.hooks 220 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook 221 | 222 | 223 | 224 | hive.exec.pre.hooks 225 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook 226 | 227 | 228 | 229 | hive.exec.reducers.bytes.per.reducer 230 | 1083179008 231 | 232 | 233 | 234 | hive.exec.reducers.max 235 | 1009 236 | 237 | 238 | 239 | hive.exec.scratchdir 240 | /tmp/hive 241 | 242 | 243 | 244 | hive.exec.submit.local.task.via.child 245 | true 246 | 247 | 248 | 249 | hive.exec.submitviachild 250 | false 251 | 252 | 253 | 254 | hive.execution.engine 255 | tez 256 | 257 | 258 | 259 | hive.execution.mode 260 | container 261 | 262 | 263 | 264 | hive.fetch.task.aggr 265 | false 266 | 267 | 268 | 269 | hive.fetch.task.conversion 270 | more 271 | 272 | 273 | 274 | hive.fetch.task.conversion.threshold 275 | 1073741824 276 | 277 | 278 | 279 | hive.heapsize 280 | 1024 281 | 282 | 283 | 284 | hive.hook.proto.base-directory 285 | /warehouse/tablespace/external/hive/sys.db/query_data/ 286 | 287 | 288 | 289 | hive.limit.optimize.enable 290 | true 291 | 292 | 293 | 294 | hive.limit.pushdown.memory.usage 295 | 0.04 296 | 297 | 298 | 299 | hive.load.data.owner 300 | hive 301 | 302 | 303 | 304 | hive.lock.manager 305 | 306 | 307 | 308 | 309 | hive.map.aggr 310 | true 311 | 312 | 313 | 314 | hive.map.aggr.hash.force.flush.memory.threshold 315 | 0.9 316 | 317 | 318 | 319 | hive.map.aggr.hash.min.reduction 320 | 0.5 321 | 322 | 323 | 324 | hive.map.aggr.hash.percentmemory 325 | 0.5 326 | 327 | 328 | 329 | hive.mapjoin.bucket.cache.size 330 | 10000 331 | 332 | 333 | 334 | hive.mapjoin.hybridgrace.hashtable 335 | false 336 | 337 | 338 | 339 | hive.mapjoin.optimized.hashtable 340 | true 341 | 342 | 343 | 344 | hive.mapred.reduce.tasks.speculative.execution 345 | false 346 | 347 | 348 | 349 | hive.materializedview.rewriting.incremental 350 | false 351 | 352 | 353 | 354 | hive.merge.mapfiles 355 | true 356 | 357 | 358 | 359 | hive.merge.mapredfiles 360 | false 361 | 362 | 363 | 364 | hive.merge.orcfile.stripe.level 365 | true 366 | 367 | 368 | 369 | 
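<!-- Small-file merge settings: when a job's average output file size falls below hive.merge.smallfiles.avgsize (16 MB here), Hive adds a merge step that concatenates outputs up to roughly hive.merge.size.per.task (256 MB); the stripe/block-level flags let ORC and RCFile outputs be merged without re-decoding rows. -->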
hive.merge.rcfile.block.level 370 | true 371 | 372 | 373 | 374 | hive.merge.size.per.task 375 | 256000000 376 | 377 | 378 | 379 | hive.merge.smallfiles.avgsize 380 | 16000000 381 | 382 | 383 | 384 | hive.merge.tezfiles 385 | false 386 | 387 | 388 | 389 | hive.metastore.authorization.storage.checks 390 | false 391 | 392 | 393 | 394 | hive.metastore.cache.pinobjtypes 395 | Table,Database,Type,FieldSchema,Order 396 | 397 | 398 | 399 | hive.metastore.client.connect.retry.delay 400 | 5s 401 | 402 | 403 | 404 | hive.metastore.client.socket.timeout 405 | 1800s 406 | 407 | 408 | 409 | hive.metastore.connect.retries 410 | 24 411 | 412 | 413 | 414 | hive.metastore.db.type 415 | MYSQL 416 | 417 | 418 | 419 | hive.metastore.dml.events 420 | true 421 | 422 | 423 | 424 | hive.metastore.event.listeners 425 | 426 | 427 | 428 | 429 | hive.metastore.execute.setugi 430 | true 431 | 432 | 433 | 434 | hive.metastore.failure.retries 435 | 24 436 | 437 | 438 | 439 | hive.metastore.kerberos.keytab.file 440 | /etc/security/keytabs/hive.service.keytab 441 | 442 | 443 | 444 | hive.metastore.kerberos.principal 445 | hive/_HOST@EXAMPLE.COM 446 | 447 | 448 | 449 | hive.metastore.pre.event.listeners 450 | org.apache.hadoop.hive.ql.security.authorization.AuthorizationPreEventListener 451 | 452 | 453 | 454 | hive.metastore.sasl.enabled 455 | false 456 | 457 | 458 | 459 | hive.metastore.server.max.threads 460 | 100000 461 | 462 | 463 | 464 | hive.metastore.transactional.event.listeners 465 | org.apache.hive.hcatalog.listener.DbNotificationListener 466 | 467 | 468 | 469 | hive.metastore.uris 470 | thrift://hdp01.pcl-test.com:9083 471 | 472 | 473 | 474 | hive.metastore.warehouse.dir 475 | /warehouse/tablespace/managed/hive 476 | 477 | 478 | 479 | hive.metastore.warehouse.external.dir 480 | /warehouse/tablespace/external/hive 481 | 482 | 483 | 484 | hive.optimize.bucketmapjoin 485 | true 486 | 487 | 488 | 489 | hive.optimize.bucketmapjoin.sortedmerge 490 | false 491 | 492 | 493 | 494 | hive.optimize.constant.propagation 495 | true 496 | 497 | 498 | 499 | hive.optimize.dynamic.partition.hashjoin 500 | true 501 | 502 | 503 | 504 | hive.optimize.index.filter 505 | true 506 | 507 | 508 | 509 | hive.optimize.metadataonly 510 | true 511 | 512 | 513 | 514 | hive.optimize.null.scan 515 | true 516 | 517 | 518 | 519 | hive.optimize.reducededuplication 520 | true 521 | 522 | 523 | 524 | hive.optimize.reducededuplication.min.reducer 525 | 4 526 | 527 | 528 | 529 | hive.optimize.sort.dynamic.partition 530 | false 531 | 532 | 533 | 534 | hive.orc.compute.splits.num.threads 535 | 10 536 | 537 | 538 | 539 | hive.orc.splits.include.file.footer 540 | false 541 | 542 | 543 | 544 | hive.prewarm.enabled 545 | false 546 | 547 | 548 | 549 | hive.prewarm.numcontainers 550 | 3 551 | 552 | 553 | 554 | hive.repl.cm.enabled 555 | 556 | 557 | 558 | 559 | hive.repl.cmrootdir 560 | 561 | 562 | 563 | 564 | hive.repl.rootdir 565 | 566 | 567 | 568 | 569 | hive.security.metastore.authenticator.manager 570 | org.apache.hadoop.hive.ql.security.HadoopDefaultMetastoreAuthenticator 571 | 572 | 573 | 574 | hive.security.metastore.authorization.auth.reads 575 | true 576 | 577 | 578 | 579 | hive.security.metastore.authorization.manager 580 | org.apache.hadoop.hive.ql.security.authorization.StorageBasedAuthorizationProvider 581 | 582 | 583 | 584 | hive.server2.allow.user.substitution 585 | true 586 | 587 | 588 | 589 | hive.server2.authentication 590 | NONE 591 | 592 | 593 | 594 | hive.server2.authentication.spnego.keytab 595 | HTTP/_HOST@EXAMPLE.COM 596 | 597 | 598 | 
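<!-- SPNEGO settings are used when HiveServer2 authenticates Kerberos clients over its HTTP transport / web UI: the spnego.keytab property is meant to hold the keytab file path (e.g. /etc/security/keytabs/spnego.service.keytab) and the spnego.principal property the service principal (e.g. HTTP/_HOST@EXAMPLE.COM). -->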
599 | hive.server2.authentication.spnego.principal 600 | /etc/security/keytabs/spnego.service.keytab 601 | 602 | 603 | 604 | hive.server2.enable.doAs 605 | true 606 | 607 | 608 | 609 | hive.server2.idle.operation.timeout 610 | 6h 611 | 612 | 613 | 614 | hive.server2.idle.session.timeout 615 | 1d 616 | 617 | 618 | 619 | hive.server2.logging.operation.enabled 620 | true 621 | 622 | 623 | 624 | hive.server2.logging.operation.log.location 625 | /tmp/hive/operation_logs 626 | 627 | 628 | 629 | hive.server2.max.start.attempts 630 | 5 631 | 632 | 633 | 634 | hive.server2.support.dynamic.service.discovery 635 | true 636 | 637 | 638 | 639 | hive.server2.table.type.mapping 640 | CLASSIC 641 | 642 | 643 | 644 | hive.server2.tez.default.queues 645 | default,llap 646 | 647 | 648 | 649 | hive.server2.tez.initialize.default.sessions 650 | false 651 | 652 | 653 | 654 | hive.server2.tez.sessions.per.default.queue 655 | 1 656 | 657 | 658 | 659 | hive.server2.thrift.http.path 660 | cliservice 661 | 662 | 663 | 664 | hive.server2.thrift.http.port 665 | 10001 666 | 667 | 668 | 669 | hive.server2.thrift.max.worker.threads 670 | 500 671 | 672 | 673 | 674 | hive.server2.thrift.port 675 | 10000 676 | 677 | 678 | 679 | hive.server2.thrift.sasl.qop 680 | auth 681 | 682 | 683 | 684 | hive.server2.transport.mode 685 | binary 686 | 687 | 688 | 689 | hive.server2.use.SSL 690 | false 691 | 692 | 693 | 694 | hive.server2.webui.cors.allowed.headers 695 | X-Requested-With,Content-Type,Accept,Origin,X-Requested-By,x-requested-by 696 | 697 | 698 | 699 | hive.server2.webui.enable.cors 700 | true 701 | 702 | 703 | 704 | hive.server2.webui.port 705 | 10002 706 | 707 | 708 | 709 | hive.server2.webui.use.ssl 710 | false 711 | 712 | 713 | 714 | hive.server2.zookeeper.namespace 715 | hiveserver2 716 | 717 | 718 | 719 | hive.service.metrics.codahale.reporter.classes 720 | org.apache.hadoop.hive.common.metrics.metrics2.JsonFileMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.JmxMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.Metrics2Reporter 721 | 722 | 723 | 724 | hive.smbjoin.cache.rows 725 | 10000 726 | 727 | 728 | 729 | hive.stats.autogather 730 | true 731 | 732 | 733 | 734 | hive.stats.dbclass 735 | fs 736 | 737 | 738 | 739 | hive.stats.fetch.column.stats 740 | true 741 | 742 | 743 | 744 | hive.stats.fetch.partition.stats 745 | true 746 | 747 | 748 | 749 | hive.stats.jdbc.timeout 750 | 0 751 | 752 | 753 | 754 | hive.strict.managed.tables 755 | false 756 | 757 | 758 | 759 | hive.support.concurrency 760 | true 761 | 762 | 763 | 764 | hive.tez.auto.reducer.parallelism 765 | true 766 | 767 | 768 | 769 | hive.tez.bucket.pruning 770 | true 771 | 772 | 773 | 774 | hive.tez.cartesian-product.enabled 775 | true 776 | 777 | 778 | 779 | hive.tez.container.size 780 | 7680 781 | 782 | 783 | 784 | hive.tez.cpu.vcores 785 | -1 786 | 787 | 788 | 789 | hive.tez.dynamic.partition.pruning 790 | true 791 | 792 | 793 | 794 | hive.tez.dynamic.partition.pruning.max.data.size 795 | 104857600 796 | 797 | 798 | 799 | hive.tez.dynamic.partition.pruning.max.event.size 800 | 1048576 801 | 802 | 803 | 804 | hive.tez.exec.print.summary 805 | true 806 | 807 | 808 | 809 | hive.tez.input.format 810 | org.apache.hadoop.hive.ql.io.HiveInputFormat 811 | 812 | 813 | 814 | hive.tez.input.generate.consistent.splits 815 | true 816 | 817 | 818 | 819 | hive.tez.java.opts 820 | -server -Djava.net.preferIPv4Stack=true -XX:NewRatio=8 -XX:+UseNUMA -XX:+UseG1GC -XX:+ResizeTLAB -XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps 821 | 822 | 823 | 
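<!-- hive.tez.java.opts above sets the JVM flags for each Tez task container; since no -Xmx is given, the heap is normally derived from hive.tez.container.size (7680 MB in this file) times tez.container.max.java.heap.fraction (0.8 by default). -->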
824 | hive.tez.log.level 825 | INFO 826 | 827 | 828 | 829 | hive.tez.max.partition.factor 830 | 2.0 831 | 832 | 833 | 834 | hive.tez.min.partition.factor 835 | 0.25 836 | 837 | 838 | 839 | hive.tez.smb.number.waves 840 | 0.5 841 | 842 | 843 | 844 | hive.txn.manager 845 | org.apache.hadoop.hive.ql.lockmgr.DbTxnManager 846 | 847 | 848 | 849 | hive.txn.max.open.batch 850 | 1000 851 | 852 | 853 | 854 | hive.txn.strict.locking.mode 855 | false 856 | 857 | 858 | 859 | hive.txn.timeout 860 | 1000 861 | 862 | 863 | 864 | hive.user.install.directory 865 | /user/ 866 | 867 | 868 | 869 | hive.vectorized.execution.enabled 870 | true 871 | 872 | 873 | 874 | hive.vectorized.execution.mapjoin.minmax.enabled 875 | true 876 | 877 | 878 | 879 | hive.vectorized.execution.mapjoin.native.enabled 880 | true 881 | 882 | 883 | 884 | hive.vectorized.execution.mapjoin.native.fast.hashtable.enabled 885 | true 886 | 887 | 888 | 889 | hive.vectorized.execution.reduce.enabled 890 | true 891 | 892 | 893 | 894 | hive.vectorized.groupby.checkinterval 895 | 4096 896 | 897 | 898 | 899 | hive.vectorized.groupby.flush.percent 900 | 0.1 901 | 902 | 903 | 904 | hive.vectorized.groupby.maxentries 905 | 100000 906 | 907 | 908 | 909 | hive.zookeeper.client.port 910 | 2181 911 | 912 | 913 | 914 | hive.zookeeper.namespace 915 | hive_zookeeper_namespace 916 | 917 | 918 | 919 | hive.zookeeper.quorum 920 | hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181 921 | 922 | 923 | 924 | javax.jdo.option.ConnectionDriverName 925 | com.mysql.jdbc.Driver 926 | 927 | 928 | 929 | javax.jdo.option.ConnectionURL 930 | jdbc:mysql://hdp01.pcl-test.com/hive 931 | 932 | 933 | 934 | javax.jdo.option.ConnectionUserName 935 | hive 936 | 937 | 938 | 939 | metastore.create.as.acid 940 | true 941 | 942 | 943 | 944 | 953 | 954 | -------------------------------------------------------------------------------- /flink-coding/src/main/scala/com/anryg/FlinkDSFromKafka2HDFS.scala: -------------------------------------------------------------------------------- 1 | package com.anryg 2 | 3 | import java.time.Duration 4 | 5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 6 | import org.apache.flink.api.common.serialization.{SimpleStringEncoder, SimpleStringSchema} 7 | import org.apache.flink.configuration.MemorySize 8 | import org.apache.flink.connector.kafka.source.KafkaSource 9 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 10 | import org.apache.flink.core.fs.Path 11 | import org.apache.flink.runtime.state.CheckpointStorage 12 | import org.apache.flink.streaming.api.CheckpointingMode 13 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink 14 | import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy 15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 16 | 17 | 18 | /** 19 | * @DESC: 读取kafka数据,从DataStream到HDFS 20 | * @Auther: Anryg 21 | * @Date: 2022/8/14 19:08 22 | */ 23 | object FlinkDSFromKafka2HDFS { 24 | 25 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020" 26 | 27 | def main(args: Array[String]): Unit = { 28 | //获取流任务的环境变量 29 | val env = StreamExecutionEnvironment.getExecutionEnvironment 30 | .enableCheckpointing(3000, CheckpointingMode.EXACTLY_ONCE) //打开checkpoint功能 31 | 32 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkDSFromKafka2HDFS") //设置checkpoint的hdfs目录 33 | 34 | val kafkaSource = KafkaSource.builder() //获取kafka数据源 35 | 
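// KafkaSource is the newer unified source API (FLIP-27) that replaces the deprecated FlinkKafkaConsumer.
// The starting-offsets setting below only applies to a fresh start: after a failure the source restores
// its offsets from the checkpoint state enabled above, which is what gives exactly-once reads.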
.setBootstrapServers("192.168.211.107:6667") 36 | .setTopics("qianxin") 37 | .setGroupId("FlinkDSFromKafka2HDFS2") 38 | .setStartingOffsets(OffsetsInitializer.latest()) 39 | .setValueOnlyDeserializer(new SimpleStringSchema()) 40 | .build() 41 | 42 | import org.apache.flink.streaming.api.scala._ //引入隐私转换函数 43 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data") //读取数据源生成DataStream对象 44 | 45 | val targetDS = kafkaDS.map(line => { //对数据源做简单的ETL处理 46 | line.split("\\|") 47 | }).filter(_.length == 9).map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) 48 | 49 | /**基于flink1.14之后新的,文件系统的sink策略,跟官网提供的不一致,有坑*/ 50 | val hdfsSink2 = StreamingFileSink.forRowFormat(new Path(hdfsPrefix + "/tmp/flink_sink3"), 51 | new SimpleStringEncoder[(String,String,String,String,String,String,String,String,String)]("UTF-8")) 52 | //.withBucketAssigner(new DateTimeBucketAssigner) /**默认基于时间分配器*/ 53 | .withRollingPolicy( //设置文件的滚动策略,也就是分文件策略,也可以同时设置文件的命名规则,这里暂时用默认 54 | DefaultRollingPolicy.builder() 55 | .withRolloverInterval(Duration.ofSeconds(300)) //文件滚动间隔,设为5分钟,即每5分钟生成一个新文件 56 | .withInactivityInterval(Duration.ofSeconds(20)) //空闲间隔时间,也就是当前文件有多久没有写入数据,则进行滚动 57 | .withMaxPartSize(MemorySize.ofMebiBytes(800)) //单个文件的最大文件大小,设置为500MB 58 | .build()).build() 59 | 60 | targetDS.addSink(hdfsSink2) //目标DataStream添加sink策略 61 | 62 | env.execute("FlinkDSFromKafka2HDFS") //启动任务 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /flink-coding/src/main/scala/com/anryg/FlinkTest04.scala: -------------------------------------------------------------------------------- 1 | package com.anryg 2 | 3 | import com.alibaba.fastjson.JSON 4 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.connector.kafka.source.KafkaSource 7 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 8 | import org.apache.flink.streaming.api.scala._ 9 | import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment 10 | 11 | 12 | /** 13 | * @DESC: 读取kafka数据,从DataStream转为Table 把结果写ES 14 | * @Auther: Anryg 15 | * @Date: 2022/8/14 19:08 16 | */ 17 | object FlinkTest04 { 18 | case class InternetBehavior(id:String, client_ip:String, domain:String, do_time:String, target_ip:String,rcode:String, query_type:String, authority_record:String, add_msg:String, dns_ip:String)//定义当前数据对象 19 | 20 | def main(args: Array[String]): Unit = { 21 | val env = StreamExecutionEnvironment.getExecutionEnvironment 22 | 23 | val tableEnv = StreamTableEnvironment.create(env) 24 | 25 | val kafkaSource = KafkaSource.builder() 26 | .setBootstrapServers("192.168.211.107:6667") 27 | .setTopics("test") 28 | .setGroupId("group01") 29 | .setStartingOffsets(OffsetsInitializer.earliest()) 30 | .setValueOnlyDeserializer(new SimpleStringSchema()) 31 | .build() 32 | 33 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data") 34 | val targetDS = kafkaDS.map(line => { 35 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析 36 | val message = rawJson.getString("message") //获取业务数据部分 37 | val msgArray = message.split(",") //指定分隔符进行字段切分 38 | msgArray 39 | }).filter(_.length == 9).map(array => { 40 | InternetBehavior(array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8)) 41 | }) 42 | 43 | val targetTable = 
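// fromDataStream below derives the table schema (column names and types) from the InternetBehavior case class,
// and executeInsert later requires that schema to be compatible with the InternetBehavior sink table declared in the DDL.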
tableEnv.fromDataStream(targetDS)//转化成为Table类型 44 | //targetTable.execute().print() 45 | 46 | /**定义sink*/ 47 | tableEnv.executeSql("CREATE TABLE InternetBehavior (\n\tid String,\n client_ip STRING,\n domain STRING,\n do_time STRING,\n target_ip STRING,\n rcode int,\n query_type string,\n authority_record string,\n add_msg string,\n dns_ip string,\n PRIMARY KEY (id) NOT ENFORCED\n) WITH (\n 'connector' = 'elasticsearch-7',\n 'hosts' = 'http://192.168.211.106:9201',\n 'index' = 'internet_behavior-flink'\n)") 48 | 49 | targetTable.executeInsert("InternetBehavior") 50 | //targetDS.addSink() 51 | //targetTable.executeInsert() 52 | 53 | //env.execute("FlinkTest03") 54 | 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /flink-coding/src/main/scala/com/anryg/hive_cdc/FlinkReadKafka2Hive.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.hive_cdc 2 | 3 | import org.apache.flink.configuration.Configuration 4 | import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect, TableEnvironment} 5 | import org.apache.flink.table.catalog.hive.HiveCatalog 6 | 7 | /** 8 | * @DESC: Flink读取kafka写hive动态分区表 9 | * @Auther: Anryg 10 | * @Date: 2022/12/19 10:36 11 | */ 12 | object FlinkReadKafka2Hive { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val settings = EnvironmentSettings.newInstance().inStreamingMode() 16 | .withConfiguration(setConf()) 17 | .build() //读设置 18 | val tableEnv = TableEnvironment.create(settings) //获取table env 19 | setHive(tableEnv) 20 | 21 | /**读取kafka source*/ 22 | getDataSource(tableEnv) 23 | 24 | tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE) //设置当前SQL语法为hive方言,该方言可以在整个上下文过程中来回切换 25 | /**创建hive表*/ 26 | createHiveTable(tableEnv) 27 | 28 | tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT) //设置当前SQL语法为flink方言, 29 | /**将数据Sink到*/ 30 | sinkData(tableEnv) 31 | 32 | } 33 | 34 | /** 35 | * @DESC: 设置Flink相关配置 36 | * */ 37 | private def setConf(): Configuration ={ 38 | val config = new Configuration() //设置checkpoint 39 | config.setString("execution.checkpointing.interval","10000") 40 | config.setString("state.backend", "filesystem") 41 | config.setString("state.checkpoints.dir","hdfs://192.168.211.106:8020/tmp/checkpoint/FlinkWithHive") 42 | config 43 | } 44 | 45 | /** 46 | * @DESC: 设置hive catalog 47 | * */ 48 | private def setHive(tableEnv: TableEnvironment): Unit ={ 49 | val name = "hive_test" //取个catalog名字 50 | val database = "test" //指定hive的database 51 | //val hiveConf = "./flink-coding/src/main/resources/" //指定hive-site.xml配置文件所在的地方 52 | 53 | /**读取hive配置,并生成hive的catalog对象*/ 54 | val hive = new HiveCatalog(name,database, null) //hiveConf为null后,程序会自动到classpath下找hive-site.xml 55 | tableEnv.registerCatalog(name, hive) //将该catalog登记到Flink的table env环境中,这样flink就可以直接访问hive中的表 56 | 57 | tableEnv.useCatalog(name) //让当前的flink环境使用该catalog 58 | } 59 | 60 | /** 61 | * @DESC: 读取Kafka数据源 62 | * */ 63 | private def getDataSource(tableEnv: TableEnvironment): Unit ={ 64 | tableEnv.executeSql( 65 | """ 66 | |drop table if exists test.kafkaTable; 67 | """.stripMargin) 68 | 69 | tableEnv.executeSql( 70 | """ 71 | |Create table test.kafkaTable( 72 | |client_ip STRING, 73 | |domain STRING, 74 | |`time` STRING, 75 | |target_ip STRING, 76 | |rcode STRING, 77 | |query_type STRING, 78 | |authority_record STRING, 79 | |add_msg STRING, 80 | |dns_ip STRING 81 | |) 82 | |with( 83 | |'connector' = 'kafka', 84 | |'topic' = 'qianxin', 85 | |'properties.bootstrap.servers' = '192.168.211.107:6667', 86 
| |'properties.group.id' = 'FlinkWithHive', 87 | |'scan.startup.mode' = 'latest-offset', 88 | |'value.format'='csv', //确定数据源为文本格式 89 | |'value.csv.field-delimiter'='|' //确定文本数据源的分隔符 90 | |); 91 | """.stripMargin) 92 | } 93 | 94 | /** 95 | * @DESC: 创建hive目标数据表 96 | * */ 97 | private def createHiveTable(tableEnv: TableEnvironment): Unit ={ 98 | tableEnv.executeSql( 99 | """ 100 | |CREATE TABLE if not exists test.kafka_flink_hive ( 101 | |client_ip STRING, 102 | |domain STRING, 103 | |target_ip STRING, 104 | |rcode STRING, 105 | |query_type STRING, 106 | |authority_record STRING, 107 | |add_msg STRING, 108 | |dns_ip STRING 109 | |) 110 | |PARTITIONED BY (`time` STRING) 111 | |STORED AS textfile TBLPROPERTIES ( 112 | | 'partition.time-extractor.timestamp-pattern'='$time', 113 | | 'sink.partition-commit.trigger'='partition-time', 114 | | 'sink.partition-commit.delay'='1 h', 115 | | 'sink.partition-commit.policy.kind'='metastore,success-file' 116 | |); 117 | """.stripMargin) 118 | } 119 | 120 | /** 121 | * @DESC: 将数据写入到目标表中 122 | * */ 123 | private def sinkData(tableEnv: TableEnvironment): Unit ={ 124 | tableEnv.executeSql( 125 | """ 126 | |INSERT INTO test.kafka_flink_hive 127 | |SELECT client_ip,domain,target_ip,rcode,query_type,authority_record,add_msg,dns_ip,`time` 128 | |FROM test.kafkaTable; 129 | """.stripMargin) 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /flink-coding/src/main/scala/com/anryg/hive_cdc/FlinkWithHive.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.hive_cdc 2 | 3 | import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect, TableEnvironment} 4 | import org.apache.flink.table.catalog.hive.HiveCatalog 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.hive.conf.HiveConf 7 | 8 | /** 9 | * @DESC: Flink连接hive 10 | * @Auther: Anryg 11 | * @Date: 2022/12/19 10:36 12 | */ 13 | object FlinkWithHive { 14 | 15 | def main(args: Array[String]): Unit = { 16 | val settings = EnvironmentSettings.newInstance().inStreamingMode().build() //读取默认设置 17 | val tableEnv = TableEnvironment.create(settings) //获取table env 18 | tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE) //设置当前SQL语法为hive方言,该方言可以在整个上下文过程中来回切换 19 | setHive(tableEnv) 20 | 21 | /**查看当前database有哪些表*/ 22 | tableEnv.executeSql( 23 | """ 24 | |SHOW tables; 25 | """.stripMargin).print() 26 | 27 | 28 | /**将数据Sink到*/ 29 | } 30 | 31 | /** 32 | * @DESC: 设置hive catalog 33 | * */ 34 | private def setHive(tableEnv: TableEnvironment): Unit ={ 35 | val name = "hive_test" //取个catalog名字 36 | val database = "test" //指定hive的database 37 | //val hiveConf = "./flink-coding/src/main/resources/" //指定hive-site.xml配置文件所在的地方 38 | 39 | /**读取hive配置,并生成hive的catalog对象*/ 40 | val hive = new HiveCatalog(name,database, null) //hiveConf为null后,程序会自动到classpath下找hive-site.xml 41 | tableEnv.registerCatalog(name, hive) //将该catalog登记到Flink的table env环境中,这样flink就可以直接访问hive中的表 42 | 43 | tableEnv.useCatalog(name) //让当前的flink环境使用该catalog 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkDSFromKafkaWithWatermark.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.window_and_watermark 2 | 3 | import java.text.SimpleDateFormat 4 | import java.time.Duration 5 | import java.util.Locale 6 | 7 | import 
org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy} 8 | import org.apache.flink.api.common.serialization.{SimpleStringEncoder, SimpleStringSchema} 9 | import org.apache.flink.configuration.MemorySize 10 | import org.apache.flink.connector.kafka.source.KafkaSource 11 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 12 | import org.apache.flink.core.fs.Path 13 | import org.apache.flink.streaming.api.CheckpointingMode 14 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup 15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 16 | import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, SlidingProcessingTimeWindows, TumblingEventTimeWindows, TumblingProcessingTimeWindows} 17 | import org.apache.flink.streaming.api.windowing.time 18 | import org.apache.flink.streaming.api.windowing.time.Time 19 | 20 | 21 | /** 22 | * @DESC: 读取kafka数据,从DataStream到HDFS 23 | * @Auther: Anryg 24 | * @Date: 2022/8/14 19:08 25 | */ 26 | object FlinkDSFromKafkaWithWatermark { 27 | 28 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020" 29 | 30 | def main(args: Array[String]): Unit = { 31 | //获取流任务的环境变量 32 | val env = StreamExecutionEnvironment.getExecutionEnvironment 33 | .enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE) //打开checkpoint功能 34 | 35 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkDSFromKafkaWithWatermark") //设置checkpoint的hdfs目录 36 | env.getCheckpointConfig.setExternalizedCheckpointCleanup(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) //设置checkpoint记录的保留策略 37 | 38 | val kafkaSource = KafkaSource.builder() //获取kafka数据源 39 | .setBootstrapServers("192.168.211.107:6667") 40 | .setTopics("qianxin") 41 | .setGroupId("FlinkDSFromKafkaWithWatermark") 42 | .setStartingOffsets(OffsetsInitializer.latest()) 43 | .setValueOnlyDeserializer(new SimpleStringSchema()) 44 | .build() 45 | 46 | import org.apache.flink.streaming.api.scala._ //引入隐私转换函数 47 | 48 | val kafkaDS = env.fromSource(kafkaSource, 49 | WatermarkStrategy.noWatermarks() 50 | ,"kafka-data") //读取数据源生成DataStream对象 51 | 52 | val targetDS = kafkaDS.map(line => { //对数据源做简单的ETL处理 53 | line.split("\\|") 54 | }).filter(_.length == 9).filter(_(1).endsWith("com")) 55 | .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofHours(10)) //指定watermark 56 | .withTimestampAssigner(new SerializableTimestampAssigner[Array[String]] { 57 | override def extractTimestamp(element: Array[String], recordTimestamp: Long): Long = { 58 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss") 59 | sdf.parse(element(2)).getTime //指定的watermark字段必须是Long类型的时间戳 60 | } 61 | })) 62 | .map(array => (array(0), 1)) 63 | .keyBy(kv => kv._1) //根据client_ip聚合 64 | .window(SlidingProcessingTimeWindows.of(Time.minutes(2), Time.seconds(30))) //指定window,这里的window assigner必须是基于Process Time而不是Event Time,因为数据时间跟当前真实时间相差有点多 65 | .sum(1) 66 | 67 | targetDS.print() //打印结果 68 | 69 | env.execute("FlinkDSFromKafkaWithWatermark") //启动任务 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkSQLFromKafkaWithWatermarkAndWindow.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.window_and_watermark 2 | 3 | import org.apache.flink.configuration.Configuration 4 | import 
org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment} 5 | 6 | 7 | /** 8 | * @DESC: 用SQL API读取kafka数据,并利用watermark和window功能来对数据进行统计 9 | * @Auther: Anryg 10 | * @Date: 2022/8/14 19:08 11 | */ 12 | object FlinkSQLFromKafkaWithWatermarkAndWindow { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val streamingSetting = EnvironmentSettings.newInstance().inStreamingMode().build() 16 | 17 | val config = new Configuration() //设置checkpoint 18 | config.setString("execution.checkpointing.interval","10000") 19 | config.setString("state.backend", "filesystem") 20 | config.setString("state.checkpoints.dir","hdfs://192.168.211.106:8020/tmp/checkpoint/FlinkSQLFromKafkaWithWatermarkAndWindow") 21 | 22 | streamingSetting.getConfiguration.addAll(config) 23 | 24 | val tableEnv = TableEnvironment.create(streamingSetting) 25 | 26 | tableEnv.executeSql( 27 | """ 28 | |Create table kafkaTable( 29 | |client_ip STRING, 30 | |domain STRING, 31 | |`time` STRING, 32 | |target_ip STRING, 33 | |rcode STRING, 34 | |query_type STRING, 35 | |authority_record STRING, 36 | |add_msg STRING, 37 | |dns_ip STRING, 38 | |event_time AS to_timestamp(`time`, 'yyyyMMddHHmmss'), //设置事件时间为实际数据的产生时间,注意time这个字段必须要用``符合括起来 39 | |watermark for event_time as event_time - interval '10' second //设置watermark,确定watermark字段 40 | |) 41 | |with( 42 | |'connector' = 'kafka', 43 | |'topic' = 'qianxin', 44 | |'properties.bootstrap.servers' = '192.168.211.107:6667', 45 | |'properties.group.id' = 'FlinkSQLFromKafkaWithWatermarkAndWindow', 46 | |'scan.startup.mode' = 'latest-offset', 47 | |'value.format'='csv', //确定数据源为文本格式 48 | |'value.csv.field-delimiter'='|' //确定文本数据源的分隔符 49 | |) 50 | """.stripMargin) 51 | 52 | tableEnv.executeSql( 53 | """ 54 | |SELECT 55 | |window_start, 56 | |window_end, 57 | |client_ip, 58 | |count(client_ip) as ip_count 59 | |FROM TABLE( 60 | |HOP( //确定window策略 61 | |TABLE kafkaTable, 62 | |DESCRIPTOR(event_time), 63 | |INTERVAL '30' SECONDS, //确定滑动周期 64 | |INTERVAL '2' MINUTES) //确定窗口时间间隔 65 | |) 66 | |GROUP BY 67 | |window_start, 68 | |window_end, 69 | |client_ip 70 | |ORDER BY ip_count 71 | |DESC 72 | |LIMIT 10 73 | """.stripMargin 74 | ).print() 75 | 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkTBFromKafkaWithWatermark.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.window_and_watermark 2 | 3 | import java.sql.Timestamp 4 | import java.text.SimpleDateFormat 5 | import java.time.Duration 6 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy} 7 | import org.apache.flink.api.common.serialization.SimpleStringSchema 8 | import org.apache.flink.connector.kafka.source.KafkaSource 9 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 10 | import org.apache.flink.streaming.api.CheckpointingMode 11 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup 12 | import org.apache.flink.streaming.api.scala._ 13 | import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment 14 | 15 | 16 | 17 | /** 18 | * @DESC: 读取kafka数据,从DataStream API转为Table API,并利用watermark 19 | * @Auther: Anryg 20 | * @Date: 2022/8/14 19:08 21 | */ 22 | object FlinkTBFromKafkaWithWatermark { 23 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020"//HDFS地址前缀 24 | 25 | def main(args: Array[String]): Unit = { 26 | val env = 
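// Checkpointing every 10s in EXACTLY_ONCE mode (configured just below) is also what allows the Kafka
// source to restore its offsets after a failure instead of falling back to the 'latest' starting offset.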
StreamExecutionEnvironment.getExecutionEnvironment //获取流环境变量 27 | .enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE) //打开checkpoint功能 28 | 29 | val tableEnv = StreamTableEnvironment.create(env) //创建Table环境变量 30 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkTBFromKafkaWithWatermark") //设置checkpoint的hdfs目录 31 | env.getCheckpointConfig.setExternalizedCheckpointCleanup(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) //设置checkpoint记录的保留策略 32 | 33 | val kafkaSource = KafkaSource.builder() 34 | .setBootstrapServers("192.168.211.107:6667") 35 | .setTopics("qianxin") 36 | .setGroupId("FlinkTBFromKafkaWithWatermark") 37 | .setStartingOffsets(OffsetsInitializer.latest()) 38 | .setValueOnlyDeserializer(new SimpleStringSchema()) 39 | .build() 40 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data") 41 | val targetDS = kafkaDS.map(_.split("\\|")) 42 | .filter(_.length == 9) 43 | .filter(_(1).endsWith("com")) 44 | .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(10)) //给业务字段分配watermark 45 | .withTimestampAssigner(new SerializableTimestampAssigner[Array[String]] { 46 | override def extractTimestamp(element: Array[String], recordTimestamp: Long): Long = { //实现watermark字段的分配 47 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss") 48 | sdf.parse(element(2)).getTime 49 | } 50 | })) 51 | .map(array => (array(0), array(2))) 52 | .map(kv => { 53 | val date = kv._2 54 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss").parse(date) 55 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(sdf) 56 | (kv._1, Timestamp.valueOf(time)) //将时间转为要求的Time Attributes 也就是Timestamp类型 57 | }) 58 | 59 | import org.apache.flink.table.api._ //加入隐式转换,否则下面的$无法识别 60 | 61 | val targetTable = tableEnv.fromDataStream(targetDS) 62 | .as("client_ip", "time") //添加schema 63 | .window( 64 | Slide over 1.minute every 30.seconds() on $"time" as $"w" //加入window 65 | ) 66 | .groupBy($"client_ip", $"w") 67 | .select( 68 | $"client_ip", 69 | $"w".start(), //时间窗口的开始时间 70 | $"w".end(), //时间窗口的解释时间 71 | $"client_ip".count() as "count" 72 | ) 73 | .orderBy($"count") 74 | .limit(10) 75 | targetTable.execute().print() 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.anryg.bigdata 8 | internet_behavior_project 9 | pom 10 | 1.0-SNAPSHOT 11 | 12 | spark-coding 13 | flink-coding 14 | redis 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /redis/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | internet_behavior_project 7 | com.anryg.bigdata 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | com.anryg.bigdata 13 | redis 14 | 1.0-SNAPSHOT 15 | jar 16 | redis 17 | 18 | http://www.example.com 19 | 20 | 21 | UTF-8 22 | 1.8 23 | 1.8 24 | 25 | 26 | 27 | 28 | redis.clients 29 | jedis 30 | 3.3.0 31 | 32 | 33 | junit 34 | junit 35 | 4.11 36 | test 37 | 38 | 39 | 40 | 41 | src/main/scala 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-shade-plugin 47 | 3.1.0 48 | 49 | true 50 | with-dependencies 51 | 52 | 53 | *:* 54 | 55 | META-INF/*.SF 56 | META-INF/*.DSA 57 | META-INF/*.RSA 58 | 59 | 60 | 61 | 62 | 63 | 64 | junit:junit 65 | 66 | 67 | 68 | 69 | 70 | package 71 | 72 | shade 73 | 74 | 75 | 76 | 77 | com.google.guava 78 | com.shade2.google.guava 79 | 
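<!-- The relocation above rewrites com.google.guava classes into the com.shade2.google.guava package inside the shaded jar, so this module's Guava cannot clash with the (often older) Guava already present on the Hadoop/Spark classpath. -->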
80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | org.codehaus.mojo 91 | build-helper-maven-plugin 92 | 3.0.0 93 | 94 | 95 | add-source 96 | generate-sources 97 | 98 | add-source 99 | 100 | 101 | 102 | src/main/java 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | net.alchim31.maven 111 | scala-maven-plugin 112 | 3.2.1 113 | 114 | 115 | 116 | compile 117 | testCompile 118 | 119 | 120 | 121 | -make:transitive 122 | -dependencyfile 123 | ${project.build.directory}/.scala_dependencies 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /redis/src/main/java/com/anryg/bigdata/IPUtils.java: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata; 2 | 3 | //import com.googlecode.ipv6.IPv6Network; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | import redis.clients.jedis.Jedis; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileInputStream; 10 | import java.io.InputStreamReader; 11 | import java.math.BigInteger; 12 | import java.net.InetAddress; 13 | import java.net.UnknownHostException; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | 17 | /** 18 | * @DESC: 提供对IP地址数据相关的操作 19 | * @Author Anryg 20 | * */ 21 | 22 | public class IPUtils { 23 | private static Logger logger = LoggerFactory.getLogger(IPUtils.class); 24 | 25 | 26 | /** 27 | * @DESC: 将本地ip.merge.txt文件中的IP地址导入到redis zset中 28 | * @param filePath : IP地址与地理位置关系文件 29 | * @param dbNo : redis的数据库名 30 | * */ 31 | public static void ipCountryImport(String filePath, int dbNo) throws Exception { 32 | FileInputStream inputStream = new FileInputStream(filePath); 33 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); 34 | String line = null; /*读取文件中的每一行*/ 35 | HashMap map = new HashMap(1024,1); //key为数据体,value为ip范围的结束ip地址 36 | int i = 0; 37 | while((line=bufferedReader.readLine()) != null){ 38 | String[] args=line.split("\\|"); 39 | String ipStart=args[0]; 40 | String ipEnd=args[1]; 41 | //Long ipStartLong= IPUtils.ip2Long(ipStart); 42 | Long ipEndLong= IPUtils.ip2Long(ipEnd); //将每条IP地址范围的结束IP地址,转换为long类型的数值 43 | String country = args[2]; //获取国家信息 44 | String province = args[4]; //或者省份信息 45 | String city = args[5]; //获取城市信息 46 | String operator = args[6]; //获取运营商信息 47 | StringBuilder rowBuffer = new StringBuilder(11); //用来存放组装后的IP地址与地理位置信息 48 | rowBuffer.append(ipStart).append("-").append(ipEnd).append("-").append(country).append("-") 49 | .append(province).append("-").append(city).append("-").append(operator); 50 | map.put(rowBuffer.toString(),ipEndLong.doubleValue()); 51 | ++i; 52 | if (i == 1024) {/**每1024个为一批*/ 53 | toRedis(RedisClientUtils.getSingleRedisClient(),map, dbNo,"ipAndAddr"); 54 | map.clear(); 55 | i = 0; 56 | } 57 | } 58 | if (map.size() > 0) toRedis(RedisClientUtils.getSingleRedisClient(),map, dbNo,"ipAndAddr"); 59 | } 60 | 61 | /** 62 | * @DESC: 将IP转为10进制 63 | * */ 64 | public static long ip2Long(String ipstr) { 65 | InetAddress ip = null; 66 | try { 67 | ip = InetAddress.getByName(ipstr); 68 | } catch (UnknownHostException e) { 69 | logger.error("UnknownHost...",e); 70 | } 71 | byte[] octets = ip.getAddress(); 72 | long result = 0; 73 | for (byte octet : octets) { 74 | result <<= 8; 75 | result |= octet & 0xff; 76 | } 77 | return result; 78 | } 79 | 80 | /** 81 | * @DESC: 经10进制转换成为IPV4地址字符串 82 | * */ 83 | public static String Long2Ip(long ten) { 84 | StringBuilder sb = new StringBuilder(); 85 | for (int i = 0; i 
< 4; i++) { 86 | sb.insert(0, Long.toString(ten & 0xff)); 87 | if (i < 3) { 88 | sb.insert(0, '.'); 89 | } 90 | ten = ten >> 8; 91 | } 92 | return sb.toString(); 93 | } 94 | 95 | /** 96 | * 根据IPV4地址和子网掩码计算IPV4地址范围,例如:192.168.1.53/27 --》3232235808,3232235839 97 | * @param ipAndMask 98 | * @return IPV4地址范围 99 | */ 100 | public static long[] getIPLongScope(String ipAndMask) { 101 | String[] ipArr = ipAndMask.split("/"); 102 | if (ipArr.length != 2) { 103 | throw new IllegalArgumentException("invalid ipAndMask with: " 104 | + ipAndMask); 105 | } 106 | int netMask = Integer.valueOf(ipArr[1].trim()); 107 | if (netMask < 0 || netMask > 32) { 108 | throw new IllegalArgumentException("invalid ipAndMask with: " 109 | + ipAndMask); 110 | } 111 | long ipInt = ip2Long(ipArr[0]); 112 | long netIP = ipInt & (0xFFFFFFFF << (32 - netMask)); 113 | long hostScope = (0xFFFFFFFF >>> netMask); 114 | return new long[] { netIP, netIP + hostScope }; 115 | } 116 | 117 | /** 118 | * 根据IPV4地址和子网掩码计算IPV4地址范围,例如:ip:192.168.1.53,子网掩码:255.255.255.224--》3232235808,3232235839 119 | * @param ipaddr,mask IPV4地址,子网掩码 192.168.1.53,255.255.255.224 120 | * @return IPV4地址范围字符串 121 | */ 122 | public static String getIPNetworkAddr(String ipaddr, String mask){ 123 | //IP地址和子网掩码与得到网络地址 124 | Long ipNetworkAddr = ip2Long(ipaddr)&ip2Long(mask); 125 | Long ipBroadcastAddr = ((ipNetworkAddr^ip2Long(mask))^0xffffffffL); 126 | 127 | //System.out.println(Long.toBinaryString(ipBroadcastAddr)); 128 | return Long2Ip(ipNetworkAddr+1)+"-->"+Long2Ip(ipBroadcastAddr-1); 129 | } 130 | 131 | /** 132 | * ipv6字符串转整数 133 | * @param ipv6 134 | * @return 135 | */ 136 | public static BigInteger ipv6ToBigInt(String ipv6) 137 | { 138 | 139 | int compressIndex = ipv6.indexOf("::"); 140 | if (compressIndex != -1) 141 | { 142 | String part1s = ipv6.substring(0, compressIndex); 143 | String part2s = ipv6.substring(compressIndex + 1); 144 | BigInteger part1 = ipv6ToBigInt(part1s); 145 | BigInteger part2 = ipv6ToBigInt(part2s); 146 | int part1hasDot = 0; 147 | char[] ch = part1s.toCharArray(); 148 | for (char c : ch) 149 | { 150 | if (c == ':') 151 | { 152 | part1hasDot++; 153 | } 154 | } 155 | // ipv6 has most 7 dot 156 | return part1.shiftLeft(16 * (7 - part1hasDot )).add(part2); 157 | } 158 | String[] str = ipv6.split(":"); 159 | BigInteger big = BigInteger.ZERO; 160 | for (int i = 0; i < str.length; i++) 161 | { 162 | //::1 163 | if (str[i].isEmpty()) 164 | { 165 | str[i] = "0"; 166 | } 167 | big = big.add(BigInteger.valueOf(Long.valueOf(str[i], 16)) 168 | .shiftLeft(16 * (str.length - i - 1))); 169 | } 170 | return big; 171 | } 172 | 173 | 174 | /** 175 | * @Author liuxh02 176 | * @Description 整数转为ipv6地址字符串 177 | * @Date 2020/8/5 178 | * @Param [big] 179 | * @return java.lang.String 180 | **/ 181 | public static String bigIntToipv6(BigInteger big) 182 | { 183 | String str = ""; 184 | BigInteger ff = BigInteger.valueOf(0xffff); 185 | for (int i = 0; i < 8 ; i++) 186 | { 187 | str = big.and(ff).toString(16) + ":" + str; 188 | 189 | big = big.shiftRight(16); 190 | } 191 | //the last : 192 | str = str.substring(0, str.length() - 1); 193 | 194 | return str.replaceFirst("(^|:)(0+(:|$)){2,8}", "::"); 195 | } 196 | 197 | 198 | /** 199 | * @DESC: 批量方式写入Redis 200 | * */ 201 | private static Long toRedis(Jedis jedis, Map map, int dbno, String key) { 202 | try { 203 | jedis.select(dbno); 204 | return jedis.zadd(key,map); 205 | } finally { 206 | RedisClientUtils.returnResource(jedis); 207 | } 208 | 209 | } 210 | 211 | 212 | /** 213 | * @Author liuxh02 214 | * 
@Description 根据ipv6地址和子网掩码计算IP范围,返回数组 215 | * @Date 2020/8/6 216 | * @Param 【起始IP,结束IP】 217 | * @return java.math.BigInteger[] 218 | **/ 219 | /* public static BigInteger[] getIPV6LongScope(String ipv6AndMask ){ 220 | 221 | IPv6Network network = IPv6Network.fromString(ipv6AndMask); 222 | BigInteger start=network.getFirst().toBigInteger();//起始IP 223 | BigInteger end=network.getLast().toBigInteger();//结束IP 224 | System.out.println(end); 225 | return new BigInteger[]{start,end}; 226 | 227 | }*/ 228 | } 229 | -------------------------------------------------------------------------------- /redis/src/main/java/com/anryg/bigdata/IpSearch.java: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata; 2 | 3 | 4 | import org.slf4j.LoggerFactory; 5 | import redis.clients.jedis.Jedis; 6 | import redis.clients.jedis.Tuple; 7 | 8 | import java.math.BigInteger; 9 | import java.net.UnknownHostException; 10 | import java.util.Set; 11 | 12 | public class IpSearch { 13 | private static org.slf4j.Logger logger = LoggerFactory.getLogger(IpSearch.class); 14 | 15 | /** 16 | * 在redis db1数据库中查找IP所在的地址信息 17 | * @param jedis 18 | * @param ip 19 | * @return 给定IP所在范围 20 | * @throws UnknownHostException 21 | */ 22 | public static String getAddrByIP(Jedis jedis, String ip) { 23 | try { 24 | jedis.select(1); 25 | long ipscore = IPUtils.ip2Long(ip); 26 | Set tuples = jedis.zrangeByScoreWithScores("ipAndAddr", String.valueOf(ipscore),"+inf",0,1); 27 | String value = ""; 28 | for (Tuple tuple : tuples) { 29 | value = tuple.getElement(); 30 | } 31 | String[] valueSplits = value.split("-"); 32 | long begin = IPUtils.ip2Long(valueSplits[0]); 33 | long end = IPUtils.ip2Long(valueSplits[1]); 34 | //String[] scope = value.substring(startpos+1,endpos).split(","); 35 | if(ipscore >= begin && ipscore <= end){ 36 | return value; 37 | } 38 | else return ""; 39 | } finally { 40 | //RedisClientUtils.returnResource(jedis);/**归还到连接池*/ 41 | 42 | } 43 | } 44 | /** 45 | * @Author liuxh02 46 | * @Description 在redis db2数据库中查询ipv4,ipv6地址信息 47 | * @Date 2020/8/6 48 | * @Param [jedis, ip] 49 | * @return java.lang.String 50 | **/ 51 | public static String getAddr(Jedis jedis, String ip) { 52 | jedis.select(2); 53 | //ip地址转整数 54 | BigInteger ipscore=null; 55 | if(ip.contains(":")){ 56 | //ipv6转整数 57 | ipscore=IPUtils.ipv6ToBigInt(ip); 58 | }else{ 59 | //ipv4转整数 60 | ipscore = BigInteger.valueOf(IPUtils.ip2Long(ip)); 61 | } 62 | Set tuples = jedis.zrangeByScoreWithScores("ipAndAddr",ipscore.toString(),"+inf",0,1); 63 | String value = ""; 64 | for (Tuple tuple : tuples) { 65 | value = tuple.getElement(); 66 | } 67 | String[] valueArray = value.split("-"); 68 | //获取IP和子网掩码 69 | String ipAndMask=valueArray[0]; 70 | BigInteger start=null; 71 | BigInteger end=null; 72 | if(ipAndMask.contains(":")){ 73 | //ipv6地址计算 74 | BigInteger[] ipv6AndMask = null; 75 | start=ipv6AndMask[0]; 76 | end=ipv6AndMask[1]; 77 | }else{ 78 | //ipv4地址计算 79 | long[] ipv4AndMask=IPUtils.getIPLongScope(ipAndMask); 80 | start= BigInteger.valueOf(ipv4AndMask[0]); 81 | end= BigInteger.valueOf(ipv4AndMask[1]); 82 | } 83 | if(ipscore.compareTo(start)>0 && ipscore.compareTo(end)<0){ 84 | return value; 85 | } 86 | else return ""; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /redis/src/main/java/com/anryg/bigdata/RedisClientUtils.java: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata; 2 | 3 | import 
redis.clients.jedis.Jedis; 4 | import redis.clients.jedis.JedisPool; 5 | import redis.clients.jedis.JedisPoolConfig; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Set; 10 | 11 | /** 12 | * Created by Anryg on 2018/5/9. 13 | */ 14 | public class RedisClientUtils implements RedisParam { 15 | private static volatile JedisPool jedisPool = null;/**用连接池进行管理,避免多线程情况下连接redis出现的各种问题*/ 16 | private static volatile Jedis jedis = null; 17 | 18 | /** 19 | * @DESC: 初始化连接池 20 | * */ 21 | private static void initPool(){ 22 | JedisPoolConfig config = null; 23 | try { 24 | config = new JedisPoolConfig(); 25 | config.setMaxTotal(MAX_ACTIVE); 26 | config.setMaxIdle(MAX_IDLE); 27 | config.setMaxWaitMillis(MAX_WAIT); 28 | config.setTestOnBorrow(TEST_ON_BORROW);//使用时进行扫描,确保都可用 29 | config.setTestWhileIdle(true);//Idle时进行连接扫描 30 | config.setTestOnReturn(true);//还回线程池时进行扫描 31 | } catch (Exception e) { 32 | throw e; 33 | } 34 | /*表示idle object evitor两次扫描之间要sleep的毫秒数 35 | config.setTimeBetweenEvictionRunsMillis(30000); 36 | 表示idle object evitor每次扫描的最多的对象数 37 | config.setNumTestsPerEvictionRun(10); 38 | 表示一个对象至少停留在idle状态的最短时间,然后才能被idle object evitor扫描并驱逐;这一项只有在timeBetweenEvictionRunsMillis大于0时才有意义 39 | config.setMinEvictableIdleTimeMillis(60000);*/ 40 | jedisPool = new JedisPool(config, HOSTS.split(",")[0], PORT, TIMEOUT, PASSWD); 41 | } 42 | /** 43 | *@DESC: 多线程环境下确保只初始化一个连接池 44 | */ 45 | private static void poolInit() { 46 | if (jedisPool == null){ 47 | synchronized (RedisClientUtils.class){ 48 | if (jedisPool == null) initPool(); 49 | } 50 | } 51 | } 52 | 53 | /** 54 | * @DESC: 获取连接池对象,适用多线程时,利用其获取多个jedis客户端 55 | * */ 56 | public static JedisPool getJedisPool(){ 57 | poolInit(); 58 | return jedisPool; 59 | } 60 | /** 61 | * @DESC: 同步获取Jedis实例,适合单线程 62 | * @return Jedis 63 | */ 64 | public static Jedis getSingleRedisClient() { 65 | poolInit(); 66 | if (jedis == null){ 67 | synchronized (RedisClientUtils.class){ 68 | if (jedis == null) { 69 | jedis = jedisPool.getResource(); 70 | } 71 | } 72 | } 73 | return jedis; 74 | } 75 | /** 76 | * @DESC: 释放jedis资源,将资源放回连接池 77 | * @param jedis 78 | */ 79 | public static void returnResource(final Jedis jedis) { 80 | if (jedis != null && jedisPool != null) jedis.close(); 81 | } 82 | 83 | /** 84 | * @DESC: 删除某个库下的所有数据 85 | * */ 86 | public static void delDataPerDB(Jedis redis, int dbNum){ 87 | redis.select(dbNum); 88 | Set keySet = redis.keys("*"); 89 | for (String key:keySet){ 90 | try { 91 | Set fields = redis.hkeys(key); 92 | redis.hdel(key,fields.toArray(new String[fields.size()]));//Set转Array 93 | } catch (Exception e) { 94 | throw e; 95 | }finally { 96 | //redis.close(); 97 | } 98 | } 99 | } 100 | 101 | /** 102 | * @DESC: 存储set对象 103 | * */ 104 | public static boolean save2RedisBySet(Jedis redis, int redisNo, String key, String[] strArray){ 105 | redis.select(redisNo); 106 | long count = 0; 107 | try { 108 | count = redis.sadd(key,strArray);/**存储不重复的*/ 109 | } catch (Exception e) { 110 | e.printStackTrace(); 111 | } finally { 112 | redis.close(); 113 | } 114 | if (count > 0) return true; 115 | else return false; 116 | } 117 | 118 | /** 119 | * @DESC: 用来批量存储key:value 120 | * @param kvList : 为容器,奇数位的key,偶数位的为value,且总数必须是偶数个 121 | * */ 122 | public static void save2RedisByKVs(Jedis redis, int redisNo, List kvList){ 123 | redis.select(redisNo); 124 | try { 125 | redis.mset(kvList.toArray(new String[kvList.size()])); 126 | } finally { 127 | redis.close(); 128 | } 129 | } 130 | /** 131 | * @DESC: 获取set对象的结果 132 | * */ 133 | public static Set 
getSetResult(Jedis redis, int redisNo, String key){ 134 | redis.select(redisNo); 135 | Set scanResult = null; 136 | try { 137 | scanResult = redis.smembers(key); 138 | } catch (Exception e) { 139 | e.printStackTrace(); 140 | } finally { 141 | redis.close(); 142 | } 143 | return scanResult; 144 | } 145 | 146 | /** 147 | *@DESC: 删除指定的key集合(调用时所在环境指定的数据库) 148 | * */ 149 | public static void deleteKeys(Jedis redis , List keys){ 150 | redis.del(keys.toArray(new String[keys.size()])); 151 | } 152 | 153 | /** 154 | * @DESC: 删除指定key(hash类型)下的字段集(调用时所在环境指定的数据库) 155 | * */ 156 | public static void deleteFieldByKey(Jedis redis, String key, List fields){ 157 | redis.hdel(key,fields.toArray(new String[fields.size()])); 158 | } 159 | 160 | } 161 | -------------------------------------------------------------------------------- /redis/src/main/java/com/anryg/bigdata/RedisParam.java: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata; 2 | 3 | 4 | 5 | /** 6 | * Created by Anryg on 2018/5/9. 7 | * @DESC: 提供Redis的基础属性配置 8 | */ 9 | public interface RedisParam { 10 | String HOSTS = "192.168.211.106";/**redis服务器列表,目前为单点*/ 11 | int PORT = 6379; 12 | String PASSWD = "pcl@2020"; 13 | //可用连接实例的最大数目,默认值为8; 14 | //如果赋值为-1,则表示不限制;如果pool已经分配了maxActive个jedis实例,则此时pool的状态为exhausted(耗尽) 15 | int MAX_ACTIVE = 1500; 16 | //控制一个pool最多有多少个状态为idle(空闲的)的jedis实例,默认值也是8 17 | int MAX_IDLE = 100; 18 | //等待可用连接的最大时间,单位毫秒,默认值为-1,表示永不超时。如果超过等待时间,则直接抛出JedisConnectionException 19 | int MAX_WAIT = 100 * 1000; 20 | int TIMEOUT = 100 * 1000;//超时时间 21 | //在borrow一个jedis实例时,是否提前进行validate操作;如果为true,则得到的jedis实例均是可用的; 22 | boolean TEST_ON_BORROW = true; 23 | } 24 | -------------------------------------------------------------------------------- /spark-coding/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | internet_behavior_project 7 | com.anryg.bigdata 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | spark-coding 12 | spark-coding 13 | 14 | http://www.example.com 15 | 16 | 17 | UTF-8 18 | 1.8 19 | 1.8 20 | 21 | 22 | 23 | 24 | 25 | org.apache.spark 26 | spark-core_2.12 27 | 3.2.0 28 | 29 | 30 | 31 | org.apache.spark 32 | spark-sql_2.12 33 | 3.2.0 34 | 35 | 36 | 37 | org.apache.spark 38 | spark-sql-kafka-0-10_2.12 39 | 3.2.0 40 | 41 | 42 | 43 | com.alibaba 44 | fastjson 45 | 1.2.71 46 | 47 | 48 | 49 | org.apache.spark 50 | spark-hive_2.12 51 | 3.2.0 52 | 53 | 54 | 55 | com.anryg.bigdata 56 | redis 57 | 1.0-SNAPSHOT 58 | 59 | 60 | 61 | org.elasticsearch 62 | elasticsearch-spark-30_2.12 63 | 7.12.0 64 | 65 | 66 | scala-library 67 | org.scala-lang 68 | 69 | 70 | spark-core_2.12 71 | org.apache.spark 72 | 73 | 74 | spark-sql_2.12 75 | org.apache.spark 76 | 77 | 78 | spark-catalyst_2.12 79 | org.apache.spark 80 | 81 | 82 | slf4j-api 83 | org.slf4j 84 | 85 | 86 | 87 | 88 | commons-httpclient 89 | commons-httpclient 90 | 3.1 91 | 92 | 93 | 94 | 95 | com.clickhouse 96 | clickhouse-jdbc 97 | 0.4.6 98 | 99 | 100 | 101 | 102 | 103 | junit 104 | junit 105 | 4.11 106 | test 107 | 108 | 109 | 110 | 111 | 112 | 113 | src/main/scala 114 | src/main/test 115 | 116 | 117 | 118 | 119 | org.apache.maven.plugins 120 | maven-shade-plugin 121 | 3.2.0 122 | 123 | true 124 | with-dependencies 125 | 126 | 127 | *:* 128 | 129 | 130 | junit:junit 131 | 132 | 133 | 134 | 135 | 136 | *:* 137 | 138 | META-INF/*.SF 139 | META-INF/*.DSA 140 | META-INF/*.RSA 141 | 142 | 143 | 144 | false 145 | 146 | 151 | 152 | 153 | 154 | package 155 | 156 | shade 157 | 158 | 159 | 173 | 
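<!-- The shade execution bound to the package phase produces the "with-dependencies" fat jar; the META-INF/*.SF, *.DSA and *.RSA filters above strip jar signatures so the merged jar does not fail with an "Invalid signature file digest" error at runtime. -->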
174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | org.codehaus.mojo 185 | build-helper-maven-plugin 186 | 3.0.0 187 | 188 | 189 | add-source 190 | generate-sources 191 | 192 | add-source 193 | 194 | 195 | 196 | src/main/java 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | net.alchim31.maven 205 | scala-maven-plugin 206 | 3.2.1 207 | 208 | 209 | 210 | compile 211 | testCompile 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /spark-coding/spark-coding.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /spark-coding/src/main/java/com/anryg/bigdata/clickhouse/CKSink.java: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.clickhouse; 2 | 3 | import com.clickhouse.jdbc.ClickHouseConnection; 4 | import com.clickhouse.jdbc.ClickHouseDataSource; 5 | import org.apache.spark.sql.ForeachWriter; 6 | import org.apache.spark.sql.Row; 7 | 8 | import java.sql.PreparedStatement; 9 | import java.sql.SQLException; 10 | 11 | /** 12 | * @DESC: 自定义structured streaming的外部sink,通过jdbc写数据到clickhouse中 13 | * @Auther: Anryg 14 | * @Date: 2023/7/3 20:24 15 | */ 16 | public class CKSink extends ForeachWriter { 17 | private static final String jdbcUrl = "jdbc:ch://192.168.211.107:8123,192.168.211.108:8123,192.168.211.109:8123/local_db"; //为了防止找不到本地表,把整个集群的配置都写上 18 | //private static final Properties properties = new Properties(); 19 | private static volatile ClickHouseDataSource ckDtaSource; 20 | private static volatile ClickHouseConnection connection; 21 | 22 | private static final String user = "default"; //用CK的默认用户 23 | private static final String pwd = ""; //默认用户没有设置密码 24 | private static final String tableName = "dns_logs_from_spark"; //写入的CK目标表 25 | 26 | 27 | /** 28 | * @DESC: 执行数据处理之前的准备工作,创建数据库连接,并确保单例,其中open会以partition为单位执行 29 | * */ 30 | @Override 31 | public boolean open(long partitionId, long epochId){ 32 | if (ckDtaSource == null || connection == null) { 33 | synchronized (CKSink.class){ 34 | if (ckDtaSource == null || connection == null) { 35 | try { 36 | ckDtaSource = new ClickHouseDataSource(jdbcUrl); 37 | connection = ckDtaSource.getConnection(user, pwd); 38 | } catch (SQLException e) { 39 | e.printStackTrace(); 40 | System.exit(-1); //捕获到异常后进程退出 41 | } 42 | } 43 | } 44 | } 45 | 46 | if (connection == null) return false; 47 | 
else return true; 48 | } 49 | 50 | 51 | /** 52 | * @DESC: 当open函数返回为true之后,会针对partition中的每个ROW进行调用 53 | * */ 54 | @Override 55 | public void process(Row value) { 56 | try { 57 | PreparedStatement preparedStatement = connection.prepareStatement("insert into " + tableName + " values(?,?,?,?,?,?,?,?,?)"); 58 | preparedStatement.setString(1,value.getString(0)); 59 | preparedStatement.setString(2,value.getString(1)); 60 | preparedStatement.setString(3,value.getString(2)); 61 | preparedStatement.setString(4,value.getString(3)); 62 | preparedStatement.setString(5,value.getString(4)); 63 | preparedStatement.setString(6,value.getString(5)); 64 | preparedStatement.setString(7,value.getString(6)); 65 | preparedStatement.setString(8,value.getString(7)); 66 | preparedStatement.setString(9,value.getString(8)); 67 | preparedStatement.addBatch(); 68 | preparedStatement.executeBatch(); 69 | } catch (SQLException e) { 70 | e.printStackTrace(); 71 | System.exit(-1); //捕获到异常后进程退出 72 | } 73 | 74 | } 75 | 76 | /** 77 | * @DESC: 上两个函数执行完后,开始调用,一般用于关闭连接 78 | * */ 79 | @Override 80 | public void close(Throwable errorOrNull) { 81 | //长连接,不关闭 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/hive/ConnectHive.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.hive 2 | 3 | import org.apache.spark.SparkConf 4 | //import org.apache.spark.sql.SparkSession 5 | 6 | /** 7 | * @DESC: 8 | * @Auther: Anryg 9 | * @Date: 2022/4/8 16:30 10 | */ 11 | /*object ConnectHive { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val conf = new SparkConf() 15 | conf.setAppName("connect_hive") 16 | val sparkSession = SparkSession.builder().config(conf) 17 | //.config("spark.sql.warehouse.dir","hdfs://192.168.211.106:8020/warehouse/tablespace/managed/hive") 18 | //.config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2") 19 | //.config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083") 20 | .enableHiveSupport() 21 | .getOrCreate() 22 | 23 | val result = sparkSession.sql("select * from xas.as_bgp_bak limit 3") 24 | result.show() 25 | 26 | sparkSession.close() 27 | sparkSession.stop() 28 | } 29 | }*/ 30 | 31 | 32 | /*val hive = HiveWarehouseSession.session(sparkSession).build()//获取HWC对象 33 | val result = hive.executeQuery("select * from doi_data limit 2")//查询hive表数据 34 | result.show()*/ -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/hive/Spark3ConnectHive3.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.hive 2 | 3 | import org.apache.spark.SparkConf 4 | //import org.apache.spark.sql.SparkSession 5 | 6 | /** 7 | * @DESC: 必须是非ACID表,否则读取为空表,但不报错 8 | * @Auther: Anryg 9 | * @Date: 2022/4/8 16:30 10 | */ 11 | /*object Spark3ConnectHive3 { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val conf = new SparkConf() 15 | conf.setAppName("connect_hive") 16 | val sparkSession = SparkSession.builder().config(conf) 17 | //.config("spark.sql.warehouse.dir","hdfs://192.168.211.106:8020/warehouse/tablespace/managed/hive") 18 | 
//.config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2") 19 | //.config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083") 20 | //.config("spark.sql.hive.strict.managed.tables", false) 21 | .enableHiveSupport() 22 | .getOrCreate() 23 | 24 | val result = sparkSession.sql("select * from xas.as_bgp_bak limit 3") 25 | result.show() 26 | 27 | sparkSession.close() 28 | sparkSession.stop() 29 | } 30 | }*/ 31 | 32 | 33 | /*val hive = HiveWarehouseSession.session(sparkSession).build()//获取HWC对象 34 | val result = hive.executeQuery("select * from doi_data limit 2")//查询hive表数据 35 | result.show()*/ -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/Kafka2CK.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.clickhouse 2 | 3 | 4 | import com.anryg.bigdata.clickhouse.CKSink 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 8 | 9 | /** 10 | * @DESC: 通过spark structured streaming消费kafka,并通过自定义的ForeachWriter写数据到clickhouse 11 | * @Auther: Anryg 12 | * @Date: 2023/7/3 10:18 13 | */ 14 | object Kafka2CK { 15 | 16 | def main(args: Array[String]): Unit = { 17 | val conf = new SparkConf().setAppName("Kafka2CK").setMaster("local[*]") 18 | val spark = SparkSession.builder().config(conf).getOrCreate() 19 | 20 | val rawDF = spark.readStream //获取数据源 21 | .format("kafka") //确定数据源的来源格式 22 | .option("kafka.bootstrap.servers", "192.168.211.107:6667,192.168.211.108:6667,192.168.211.109:6667") //指定kafka集群的地址,理论上写一个broker就可以了 23 | .option("subscribe","qianxin") //指定topic 24 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/ 25 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常 26 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数 27 | //.option("maxOffsetsPerTrigger",99000000)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible,但是每次只取最新的数据,不取旧的*/ 28 | .option("startingOffsets","latest") //第一次消费时,读取kafka数据的位置 29 | .load() 30 | 31 | import spark.implicits._ 32 | 33 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型 34 | .map(row => { 35 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field 36 | val msgArray = line.split("\\|") //指定分隔符进行字段切分 37 | msgArray 38 | }).filter(_.length == 9) //只留字段数为9的数据 39 | .map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //将其转化成为元组,为了方便下一步赋予schema 40 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //给裸数据添加字段名 41 | 42 | 43 | val query = ds.writeStream 44 | .outputMode(OutputMode.Append()) //指定数据的写入方式 45 | .foreach(new CKSink) 46 | //.format("console") //指定外部输出介质,注意:不能同时指定2个外部输出,否则只会以最后一个为准 47 | //.trigger(Trigger.ProcessingTime(6,TimeUnit.SECONDS))/**每60秒执行一次,不指定就是as fast as possible*/ 48 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/Kafka2CK2") /**用来保存offset,用该目录来绑定对应的offset,如果该目录发生改变则程序运行的id会发生变化,类比group.id的变化*/ 49 | .start() 50 | 51 | query.awaitTermination() 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- 
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/StreamingProcessHelper.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming 2 | 3 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 4 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter, OutputMode} 5 | 6 | /** 7 | * @DESC: 对数据处理的共性部分进行提取 8 | * @Auther: Anryg 9 | * @Date: 2022/8/31 17:50 10 | */ 11 | trait StreamingProcessHelper[Any] { 12 | 13 | /** 14 | * @DESC: 以流的方式获取数据source 15 | * @param sparkSession: 16 | * @param dataSource: 数据源形式,比如kafka 17 | * @param config: 对流式数据源的配置 18 | * */ 19 | def getStreamingReader(sparkSession:SparkSession, dataSource:String, config:Map[String,String]): DataStreamReader ={ 20 | val streamingReader = sparkSession.readStream 21 | .format(dataSource) 22 | .options(config) 23 | streamingReader 24 | } 25 | 26 | /** 27 | * @DESC: 以流方式对数据进行sink 28 | * @param dataSet: 处理完成的结果数据集 29 | * @param outputMode: sink的类型:Complete、append、update 30 | * @param config: 对sink对象的配置 31 | * */ 32 | def getStreamingWriter(dataSet:DataFrame, outputMode:OutputMode, outputFormat:String, config:Map[String,String]): DataStreamWriter[Row] ={ 33 | val streamingWriter = dataSet.writeStream 34 | .format(outputFormat) 35 | .outputMode(outputMode) 36 | .options(config) 37 | streamingWriter 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/StructuredStreamingTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | package com.anryg.bigdata.streaming 3 | 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.OutputMode 7 | 8 | import scala.collection.mutable 9 | 10 | /** 11 | * @DESC: 测试daemon,注意如果在Windows下测试运行,需要在Windows上设置HADOOP_HOME环境变量,且需要在$HADOOP_HOME/bin目录下放置hadoop.dll和winutils两个文件 12 | * @Auther: Anryg 13 | * @Date: 2021/3/1 11:09 14 | */ 15 | object StructuredStreamingTest { 16 | 17 | 18 | def main(args: Array[String]): Unit = { 19 | val conf = new SparkConf().setMaster("local[2]").setAppName("Structured streaming test") 20 | //val conf = SparkConfFactory.newSparkConf().setMaster("local[2]").setAppName("Structured streaming test") 21 | 22 | val spark = SparkSession.builder().config(conf).getOrCreate() 23 | 24 | val rawDF = spark.readStream.format("socket") /**如果结果输出为complete模式,原始DF不能直接作为结果输出,必须经过聚合处理才可以,否则会有如下报错*/ 25 | /*Exception in thread "main" org.apache.spark.sql.AnalysisException: 26 | Complete output mode not supported when there are no streaming aggregations on streaming DataFrames/Datasets;;*/ 27 | .option("host","192.168.211.106") 28 | .option("port",9998) 29 | .load() 30 | 31 | import spark.implicits._ 32 | 33 | 34 | 35 | val xxx = rawDF.as[String].foreachPartition(iter => { 36 | while (iter.hasNext) println(iter.next()) 37 | }) 38 | /*mapPartitions(iterator => { 39 | val array = new mutable.ArrayBuffer[String] 40 | while (iterator.hasNext){ 41 | val next = iterator.next() 42 | array.+=(next) 43 | } 44 | array.toIterator 45 | })*/ 46 | 47 | val query = rawDF.writeStream 48 | .outputMode(OutputMode.Append()) 49 | .format("console") 50 | .start() 51 | 52 | query.awaitTermination() 53 | 54 | 55 | //rawDF.take(10).foreach(println(_)) 56 | 57 | 58 | } 59 | 60 | } 61 | */ 62 | -------------------------------------------------------------------------------- 
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreaming4Kafka2CSV.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.demo; 2 | import java.util.concurrent.TimeUnit 3 | 4 | import com.alibaba.fastjson.JSON 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 8 | /** 9 | * @DESC: 对接实时上网的数据源到HDFS的CSV文件中 10 | * @Auther: Anryg 11 | * @Date: 2020/12/17 09:56 12 | */ 13 | object StructuredStreaming4Kafka2CSV { 14 | 15 | def main(args: Array[String]): Unit = { 16 | val conf = new SparkConf().setAppName("StructuredStreaming4Kafka2CSV").setMaster("local[*]") 17 | val spark = SparkSession.builder().config(conf).getOrCreate() 18 | 19 | val rawDF = spark.readStream 20 | .format("kafka") 21 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") 22 | .option("subscribe","qianxin") 23 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/ 24 | .option("failOnDataLoss",false) 25 | .option("fetchOffset.numRetries",3) 26 | .option("maxOffsetsPerTrigger",90000000)/**用于限流,限定每个批次取的数据条数,确定写入HDFS单个文件的条数*/ 27 | .option("startingOffsets","earliest") 28 | .load() 29 | 30 | import spark.implicits._ 31 | val ds = rawDF.selectExpr("CAST(value AS STRING)") 32 | .map(row => { 33 | val line = row.getAs[String]("value") 34 | val fieldArray:Array[String] = line.split("\\|") 35 | fieldArray 36 | }).filter(_.length == 9).map(array =>(array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) 37 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") 38 | 39 | ds.printSchema() 40 | 41 | //val ds1 = ds.select($"client_ip") 42 | val query = ds.writeStream 43 | .outputMode(OutputMode.Append()).trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**每60秒写文件一次*/ 44 | .option("format", "append") /**会在同一个目录下追加新文件,否则只能在特定目录下写一个批次的的数据后就报错*/ 45 | .option("header", "true") /**添加文件的scheme*/ 46 | .format("csv").option("path","hdfs://192.168.211.106:8020/DATA/qianxin/3/") 47 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-03") /**用来保存offset,用该目录来绑定对应的offset,如果该目录发生改变则程序运行的id会发生变化,类比group.id的变化*/ 48 | .start() 49 | 50 | query.awaitTermination() 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.demo 2 | 3 | import com.alibaba.fastjson.JSON 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.OutputMode 7 | ; 8 | 9 | /** 10 | * @DESC: 从kafka读取上网上网数据 11 | * @Auther: Anryg 12 | * @Date: 2020/12/17 09:56 13 | */ 14 | object StructuredStreamingFromKafka { 15 | 16 | def main(args: Array[String]): Unit = { 17 | val conf = new SparkConf().setAppName("StructuredStreamingFromKafka").setMaster("local[*]") 18 | val spark = SparkSession.builder().config(conf).getOrCreate() 19 | 20 | val rawDF = spark.readStream //获取数据源 21 | .format("kafka") //确定数据源的来源格式 22 | .option("kafka.bootstrap.servers", "192.168.211.108:6667") //指定kafka集群的地址,理论上写一个broker就可以了 23 | .option("subscribe","test") //指定topic 24 | 
//.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/ 25 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常 26 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数 27 | .option("maxOffsetsPerTrigger",10)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible,但是每次只取最新的数据,不取旧的*/ 28 | .option("startingOffsets","latest") //第一次消费时,读取kafka数据的位置 29 | //.option("startingOffsets","""{"test":{"0":-2,"1":-2,"2":-2,"3":-2}}""") 30 | .load() 31 | 32 | import spark.implicits._ 33 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型 34 | .map(row => { 35 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field 36 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析 37 | val message = rawJson.getString("message") //获取业务数据部分 38 | val msgArray = message.split(",") //指定分隔符进行字段切分 39 | msgArray 40 | }).filter(_.length == 9) //只留字段数为9的数据 41 | .map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //将其转化成为元组,为了方便下一步赋予schema 42 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //给裸数据添加字段名 43 | 44 | ds.printSchema() //打印schema,确认没有问题 45 | 46 | val query = ds.writeStream 47 | .outputMode(OutputMode.Append()) //指定数据的写入方式 48 | .format("console") //指定外部输出介质 49 | //.trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**每60秒执行一次,不指定就是as fast as possible*/ 50 | .option("format", "append") /**会在同一个目录下追加新文件,否则只能在特定目录下写一个批次的的数据后就报错*/ 51 | //.option("header", "true") /**添加文件的scheme*/ 52 | // .format("csv").option("path","hdfs://192.168.211.106:8020/DATA/qianxin/3/") 53 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-08") /**用来保存offset,用该目录来绑定对应的offset,如果该目录发生改变则程序运行的id会发生变化,类比group.id的变化*/ 54 | .start() 55 | 56 | query.awaitTermination() 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka2ES.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.demo; 2 | import com.alibaba.fastjson.JSON 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.OutputMode 6 | /** 7 | * @DESC: 从kafka读取上网上网数据,写入ES 8 | * @Auther: Anryg 9 | * @Date: 2020/12/17 09:56 10 | */ 11 | object StructuredStreamingFromKafka2ES { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val conf = new SparkConf().setAppName("StructuredStreamingFromKafka").setMaster("local[*]") 15 | val spark = SparkSession.builder().config(conf).getOrCreate() 16 | 17 | val rawDF = spark.readStream 18 | .format("kafka") //确定数据源的来源格式 19 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") //指定kafka集群的地址,理论上写一个broker就可以了 20 | .option("subscribe","test") //指定topic 21 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/ 22 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常 23 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数 24 | //.option("maxOffsetsPerTrigger",100)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible*/ 25 | .option("startingOffsets","earliest") //第一次消费时,读取kafka数据的位置 26 | .load() 27 | 28 | import spark.implicits._ 29 | val ds = rawDF.selectExpr("CAST(value AS STRING)") 
//将kafka中的数据的value转为为string,原始为binary类型 30 | .map(row => { 31 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field 32 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析 33 | val message = rawJson.getString("message") //获取业务数据部分 34 | val msgArray = message.split(",") //指定分隔符进行字段切分 35 | msgArray 36 | }).filter(_.length == 9) //只留字段数为9的数据 37 | .map(array => (array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //将其转化成为元组,为了方便下一步赋予schema 38 | .toDF("id","client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //给裸数据添加字段名 39 | 40 | ds.printSchema() //打印schema,确认没有问题 41 | 42 | val query = ds.writeStream 43 | .outputMode(OutputMode.Append()) //指定数据的写入方式 44 | .format("org.elasticsearch.spark.sql") //指定外部输出为ES 45 | .option("es.nodes","192.168.211.106") 46 | .option("es.port","9201") 47 | .option("es.write.operation","upsert") 48 | .option("es.mapping.id","id") 49 | //.option("es.mapping.exclude","id") 50 | //.trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**每60秒执行一次,不指定就是as fast as possible*/ 51 | .option("format", "append") /**追加写入*/ 52 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-05") /**用来保存offset,用该目录来绑定对应的offset,如果该目录发生改变则程序运行的id会发生变化,类比group.id的变化*/ 53 | .start("internet_behavior-flink") 54 | 55 | query.awaitTermination() 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka2Hive.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.demo 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import com.alibaba.fastjson.JSON 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 9 | ; 10 | 11 | /** 12 | * @DESC: 从kafka读取上网数据,写入hive动态分区表 13 | * @Auther: Anryg 14 | * @Date: 2020/12/17 09:56 15 | */ 16 | object StructuredStreamingFromKafka2Hive { 17 | 18 | def main(args: Array[String]): Unit = { 19 | val conf = new SparkConf() 20 | .setAppName("StructuredStreamingFromKafka2Hive") 21 | .setMaster("local[*]")//本地运行模式,如果提交集群,注释掉这行 22 | val spark = SparkSession.builder().config(conf) 23 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2") 24 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083") 25 | .enableHiveSupport() //打开hive支持功能,可以与hive共享catalog 26 | .getOrCreate() 27 | 28 | val rawDF = spark.readStream 29 | .format("kafka") //确定数据源的来源格式 30 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") //指定kafka集群的地址,理论上写一个broker就可以了 31 | .option("subscribe","test") //指定topic 32 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/ 33 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常 34 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数 35 | .option("maxOffsetsPerTrigger",500)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible*/ 36 | .option("startingOffsets","earliest") //第一次消费时,读取kafka数据的位置 37 | .load() 38 | 39 | import spark.implicits._ 40 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型 41 | 
.map(row => { 42 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field 43 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析 44 | val message = rawJson.getString("message") //获取业务数据部分 45 | val msgArray = message.split(",") //指定分隔符进行字段切分 46 | msgArray 47 | }).filter(_.length == 9) //只留字段数为9的数据 48 | .filter(array => array(2).length >= 8)//确保日期字段符合规范 49 | .map(array => (array(0)+array(1)+array(2),array(0),array(1),array(2),array(3), 50 | if(array(4).isInstanceOf[Int]) array(4).toInt else 99,array(5),array(6),array(7),array(8),array(2).substring(0,4),array(2).substring(4,6),array(2).substring(6,8))) //将其转化成为元组,为了方便下一步赋予schema 51 | .toDF("id","client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day") //给裸数据添加字段名 52 | 53 | ds.printSchema() //打印schema,确认没有问题 54 | spark.sql("show databases;").show() 55 | 56 | val query = ds.writeStream 57 | .outputMode(OutputMode.Append()) //指定数据的写入方式 58 | .format("orc") //指定外部输出的文件存储格式 59 | .option("format", "append") 60 | .trigger(Trigger.ProcessingTime(10,TimeUnit.SECONDS))/**每60秒执行一次,不指定就是as fast as possible*/ 61 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StructuredStreamingFromKafka2Hive01") /**用来保存offset,用该目录来绑定对应的offset,如果该目录发生改变则程序运行的id会发生变化,类比group.id的变化,写hive的时候一定不要轻易变动*/ 62 | .partitionBy("year","month","day")//提供分区字段 63 | .toTable("test.test")//写入hive表 64 | query.awaitTermination() 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingReadHive.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.demo; 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** 8 | * @DESC: 读取通过streaming写入hive动态分区表的数据 9 | * @Auther: Anryg 10 | * @Date: 2022/08/31 09:56 11 | */ 12 | object StructuredStreamingReadHive { 13 | 14 | def main(args: Array[String]): Unit = { 15 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 16 | val conf = new SparkConf() 17 | .setAppName("StructuredStreamingReadHive") 18 | .setMaster("local[*]")//本地运行模式,如果提交集群,注释掉这行 19 | val spark = SparkSession.builder().config(conf) 20 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2") 21 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083") 22 | .enableHiveSupport() //打开hive支持功能,可以与hive共享catalog 23 | .getOrCreate() 24 | 25 | spark.readStream 26 | .table("ods.ods_kafka_internetlog1") 27 | .select("client_ip") 28 | .writeStream 29 | .format("console") 30 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StructuredStreamingReadHive1") 31 | .start().awaitTermination() 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/window_watermark/WorldCountWithWatermark.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.demo.window_watermark 2 | 3 | import java.sql.Timestamp 4 | import java.text.SimpleDateFormat 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkConf 7 | 
import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.streaming.OutputMode 9 | 10 | /** 11 | * @DESC: 用时间窗口和watermark来进行client_ip的worldcount统计 12 | * @Auther: Anryg 13 | * @Date: 2022/11/30 10:04 14 | */ 15 | object WorldCountWithWatermark { 16 | 17 | def main(args: Array[String]): Unit = { 18 | val conf = new SparkConf().setAppName("WorldCountWithWatermark").setMaster("local") 19 | val spark = SparkSession.builder() 20 | .config(conf) 21 | .getOrCreate() 22 | Logger.getLogger("org.apache").setLevel(Level.WARN) //减少INFO日志的输出 23 | 24 | val rawDF = spark.readStream 25 | .format("kafka") 26 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") 27 | .option("subscribe", "qianxin") 28 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/ 29 | .option("failOnDataLoss",false) 30 | .option("fetchOffset.numRetries",3) 31 | //.option("maxOffsetsPerTrigger",Integer.MAX_VALUE)/**用于限流,限定每个批次取的数据条数,确定写入HDFS单个文件的条数*/ 32 | .option("startingOffsets","latest") 33 | .load() 34 | 35 | import spark.implicits._ 36 | val df1 = rawDF.selectExpr("CAST(value AS string)") 37 | .map(row =>{ 38 | val line = row.getAs[String]("value") 39 | val fieldArray:Array[String] = line.split("\\|") 40 | fieldArray 41 | }) 42 | .filter(_.length == 9) //确定字段数必须为9个 43 | .filter(_(1).endsWith("com")) //防止数量太大,对访问的网站做的一点限制 44 | .map(array =>{ 45 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss").parse(array(2)) 46 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(sdf) 47 | (array(0), Timestamp.valueOf(time)) //因为time这个字段要作为watermark字段,它必须是yyyy-MM-dd HH:mm:ss样式的Timestamp类型 48 | }) 49 | .toDF("client_ip", "time") //添加schema 50 | 51 | import org.apache.spark.sql.functions._ /**引入spark内置函数*/ 52 | 53 | val df2 = df1.withWatermark("time", "10 seconds") //一般需要跟window一起配合使用 54 | .groupBy(window($"time","2 minutes","30 seconds"), $"client_ip") //确定具体字段,以及对应的聚合时间窗口,和滑动窗口 55 | .count() 56 | .orderBy($"count".desc) 57 | .limit(10) 58 | 59 | val query = df2.writeStream 60 | .format("console") //打印到控制台 61 | .option("truncate", false) //将结果的内容完整输出,默认会砍掉内容过长的部分 62 | .option("numRows",30) //一次最多打印多少行,默认20行 63 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/WorldCountWithWatermark") //确定checkpoint目录 64 | //.outputMode(OutputMode.Update())//不支持排序的结果 65 | .outputMode(OutputMode.Complete()) //确定输出模式,默认为Append 66 | .start() 67 | 68 | query.awaitTermination() 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/dwd/StreamingFromOds2Dwd.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.dwd 2 | 3 | import com.anryg.bigdata.{IpSearch, RedisClientUtils} 4 | import com.anryg.bigdata.streaming.StreamingProcessHelper 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} 7 | import org.apache.spark.sql.{DataFrame, SparkSession} 8 | 9 | 10 | /** 11 | * @DESC: 读取ods层数据,并加工写入到dwd 12 | * @Auther: Anryg 13 | * @Date: 2022/9/1 09:53 14 | */ 15 | object StreamingFromOds2Dwd extends StreamingProcessHelper[Any]{ 16 | 17 | def main(args: Array[String]): Unit = { 18 | val conf = new SparkConf().setAppName("StreamingFromOds2Dwd").setMaster("local") 19 | val spark = SparkSession.builder().config(conf) 20 | 
.config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2") 21 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083") 22 | .enableHiveSupport() //打开hive支持功能,可以与hive共享catalog 23 | .getOrCreate() 24 | 25 | clickProcess(spark,"ods.ods_kafka_internetlog","dwd.dwd_internetlog_detail") 26 | } 27 | 28 | /** 29 | *@DESC: 流方式读取hive数据源 30 | * */ 31 | def readHive2DF(sparkSession: SparkSession, sourceTable:String): DataFrame ={ 32 | sparkSession.readStream.table(sourceTable) 33 | } 34 | 35 | /** 36 | *@DESC: 对ods数据进行字段补齐等处理 37 | * */ 38 | def handleData(sparkSession: SparkSession, dataFrame: DataFrame, tableName:String): DataFrame ={ 39 | import sparkSession.implicits._ 40 | dataFrame.printSchema() 41 | dataFrame.map(row => { 42 | val clientIP = row.getAs[String]("client_ip") 43 | val ipAndAddr = IpSearch.getAddrByIP(RedisClientUtils.getSingleRedisClient,clientIP).split("-") 44 | val country = ipAndAddr(2) 45 | val province = ipAndAddr(3) 46 | val city = ipAndAddr(4) 47 | val operator = ipAndAddr(5) 48 | val domain = row.getAs[String]("domain").toLowerCase//将域名转成小写 49 | val time = row.getAs[String]("time") 50 | val targetIP = row.getAs[String]("target_ip") 51 | val rcode = row.getAs[String]("rcode") 52 | val queryType = row.getAs[String]("query_type") 53 | val authRecord = row.getAs[String]("authority_record").toLowerCase 54 | val addMsg = row.getAs[String]("add_msg") 55 | val dnsIP = row.getAs[String]("dns_ip") 56 | val year = row.getAs[String]("year") 57 | val month = row.getAs[String]("month") 58 | val day = row.getAs[String]("day") 59 | (clientIP,country,province,city,operator,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP,year,month,day) 60 | }).toDF("client_ip","country","province","city","operator","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day") 61 | } 62 | 63 | /** 64 | *@DESC: 将处理好的数据sink到dwd表中 65 | * */ 66 | def sinkData(targetDS:DataFrame, tableName:String): StreamingQuery ={ 67 | val config = Map(("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StreamingFromOds2Dwd"), 68 | ("format","append")) 69 | getStreamingWriter(targetDS,OutputMode.Append(),"orc",config) 70 | .partitionBy("year","month","day") 71 | .toTable(tableName) 72 | } 73 | 74 | /** 75 | * @DESC: 将所有数据步骤串起来 76 | * */ 77 | def clickProcess(sparkSession: SparkSession,sourceTable:String, sinkTable:String): Unit ={ 78 | val rawDF = readHive2DF(sparkSession, sourceTable) 79 | val targetDS = handleData(sparkSession, rawDF, sourceTable) 80 | sinkData(targetDS, sinkTable).awaitTermination() 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/streaming/ods/StreamingSource2HiveOds.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.streaming.ods 2 | 3 | import com.alibaba.fastjson.JSON 4 | import com.anryg.bigdata.streaming.StreamingProcessHelper 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} 7 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 8 | 9 | 10 | /** 11 | * @DESC: 12 | * @Auther: Anryg 13 | * @Date: 2022/8/31 19:03 14 | */ 15 | object StreamingSource2HiveOds extends StreamingProcessHelper[Any]{ 16 | 17 | 18 | /** 19 | * @DESC: 
主函数,应用运行入口 20 | * */ 21 | def main(args: Array[String]): Unit = { 22 | val conf = new SparkConf().setAppName("StreamingSource2HiveOds").setMaster("local[*]") 23 | val spark = SparkSession.builder().config(conf) 24 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2") 25 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083") 26 | .enableHiveSupport() //打开hive支持功能,可以与hive共享catalog 27 | .getOrCreate() 28 | 29 | clickProcess(spark) 30 | 31 | } 32 | 33 | /** 34 | *@DESC: 将kafka数据源数据读取出来成为DataFrame 35 | * */ 36 | def readKafka2DF(sparkSession: SparkSession): DataFrame ={ 37 | val config = Map(("kafka.bootstrap.servers", "192.168.211.107:6667"),("subscribe","test"), 38 | ("failOnDataLoss","false"),("fetchOffset.numRetries","3"),("startingOffsets","earliest")) 39 | 40 | getStreamingReader(sparkSession,"kafka",config).load() 41 | } 42 | 43 | /** 44 | *@DESC: 将原始DF进行业务逻辑处理 45 | * */ 46 | 47 | def handleData(sparkSession: SparkSession, rawDF:DataFrame): DataFrame ={ 48 | import sparkSession.implicits._ 49 | val targetDS = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型 50 | .map(row => { 51 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field 52 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析 53 | val message = rawJson.getString("message") //获取业务数据部分 54 | val msgArray = message.split(",") //指定分隔符进行字段切分 55 | msgArray 56 | }).filter(_.length == 9).filter(array => array(2).length >= 8)//确保日期字段符合规范 57 | .map(array =>(array(0),array(1),array(2),array(3), array(4),array(5),array(6),array(7),array(8), 58 | array(2).substring(0,4),array(2).substring(4,6),array(2).substring(6,8))) 59 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day") //给裸数据添加字段名 60 | 61 | targetDS 62 | } 63 | 64 | /** 65 | *@DESC: 将目标数据集进行写入hive的ODS 66 | * */ 67 | def sinkData(targetDS:DataFrame): StreamingQuery ={ 68 | val config = Map(("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StreamingSource2HiveOds"), 69 | ("format","append")) 70 | getStreamingWriter(targetDS, OutputMode.Append(),"orc",config) 71 | .partitionBy("year","month","day") 72 | .toTable("ods.ods_kafka_internetlog") 73 | } 74 | 75 | /** 76 | * @DESC: 将所有数据步骤串起来 77 | * */ 78 | def clickProcess(sparkSession: SparkSession): Unit ={ 79 | val rawDF = readKafka2DF(sparkSession) 80 | val targetDS = handleData(sparkSession, rawDF) 81 | sinkData(targetDS).awaitTermination() 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/DataSkew01.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.test.data_skew 2 | 3 | import java.util 4 | 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * @DESC: 一个数据倾斜的例子 9 | * @Auther: Anryg 10 | * @Date: 2022/10/10 17:00 11 | */ 12 | object DataSkew01 { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val conf = new SparkConf().setAppName("DataSkewTest01")/*.setMaster("local[*]")*/ 16 | val spark = new SparkContext(conf) 17 | 18 | val rawRDD = spark.textFile(args(0))//读取数据源 19 | 20 | val filteredRDD = rawRDD.filter(line => { /**筛选满足需要的数据,已到达数据倾斜的目的*/ 21 | val array = line.split(",") 22 | val target_ip = array(3) 
23 | target_ip.equals("106.38.176.185") || target_ip.equals("106.38.176.117") || target_ip.equals("106.38.176.118") || target_ip.equals("106.38.176.116") 24 | }) 25 | 26 | val reducedRDD = filteredRDD.map(line => {/**根据目的ip进行汇总,将访问同一个目的ip的所有客户端ip进行汇总*/ 27 | val array = line.split(",") 28 | val target_ip = array(3) 29 | val client_ip = array(0) 30 | val index = client_ip.lastIndexOf(".") 31 | val subClientIP = client_ip.substring(0, index) //为了让后续聚合后的value数据量尽可能的少,只取ip的前段部分 32 | (target_ip,Array(subClientIP)) 33 | }).reduceByKey(new MyPartitioner(4), _++_)//将Array中的元素进行合并 34 | 35 | val targetRDD = reducedRDD.map(kv => {/**将访问同一个目的ip的客户端,再次根据客户端ip进行进一步统计*/ 36 | val map = new util.HashMap[String,Int]() 37 | val target_ip = kv._1 38 | val clientIPArray = kv._2 39 | clientIPArray.foreach(clientIP => { 40 | if (map.containsKey(clientIP)) { 41 | val sum = map.get(clientIP) + 1 42 | map.put(clientIP,sum) 43 | } 44 | else map.put(clientIP,1) 45 | }) 46 | (target_ip,map) 47 | }) 48 | 49 | targetRDD.saveAsTextFile("/tmp/DataSkew01") //结果数据保存目录 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/DataSkew02.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.test.data_skew 2 | 3 | import java.util 4 | 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | import scala.util.Random 8 | 9 | /** 10 | * @DESC: 一个数据倾斜的例子 11 | * @Auther: Anryg 12 | * @Date: 2022/10/10 17:00 13 | */ 14 | object DataSkew02 { 15 | 16 | def main(args: Array[String]): Unit = { 17 | val conf = new SparkConf().setAppName("DataSkewTest02")/*.setMaster("local[*]")*/ 18 | val spark = new SparkContext(conf) 19 | 20 | val rawRDD = spark.textFile(args(0)) //读取原始数据源 21 | 22 | val filteredRDD = rawRDD.filter(line => { /**筛选满足需要的数据,已到达数据倾斜的目的*/ 23 | val array = line.split(",") 24 | val target_ip = array(3) 25 | target_ip.equals("106.38.176.185") || target_ip.equals("106.38.176.117") || target_ip.equals("106.38.176.118") || target_ip.equals("106.38.176.116") 26 | }) 27 | 28 | val reducedRDD_01 = filteredRDD.map(line => {/**解决倾斜第一步:加盐操作将原本1个分区的数据扩大到100个分区*/ 29 | val array = line.split(",") 30 | val target_ip = array(3) 31 | val client_ip = array(0) 32 | val index = client_ip.lastIndexOf(".") 33 | val subClientIP = client_ip.substring(0, index)//为了让后续聚合后的value数据量尽可能的少,只取ip的前段部分 34 | if (target_ip.equals("106.38.176.185")){/**针对特定倾斜的key进行加盐操作*/ 35 | val saltNum = 99 //将原来的1个key增加到100个key 36 | val salt = new Random().nextInt(saltNum) 37 | (target_ip + "-" + salt,Array(subClientIP)) 38 | } 39 | else (target_ip,Array(subClientIP)) 40 | }).reduceByKey(_++_,103)//将Array中的元素进行合并,并确定分区数量 41 | 42 | val targetRDD_01 = reducedRDD_01.map(kv => {/**第二步:将各个分区中的数据进行初步统计,减少单个分区中value的大小*/ 43 | val map = new util.HashMap[String,Int]() 44 | val target_ip = kv._1 45 | val clientIPArray = kv._2 46 | clientIPArray.foreach(clientIP => {//对clientIP进行统计 47 | if (map.containsKey(clientIP)) { 48 | val sum = map.get(clientIP) + 1 49 | map.put(clientIP,sum) 50 | } 51 | else map.put(clientIP,1) 52 | }) 53 | (target_ip,map) 54 | }) 55 | 56 | val reducedRDD_02 = targetRDD_01.map(kv => {/**第3步:对倾斜的数据进行减盐操作,将分区数从100减到10*/ 57 | val targetIPWithSalt01 = kv._1 58 | val clientIPMap = kv._2 59 | if (targetIPWithSalt01.startsWith("106.38.176.185")){ 60 | val targetIP = targetIPWithSalt01.split("-")(0) 61 | val saltNum = 9 //将原来的100个分区减少到10个分区 62 | val salt = new Random().nextInt(saltNum) 63 | (targetIP 
+ "-" + salt,clientIPMap) 64 | } 65 | else kv 66 | }).reduceByKey((map1,map2) => { /**合并2个map中的元素,key相同则value值相加*/ 67 | val map3 = new util.HashMap[String,Int](map1) 68 | map2.forEach((key,value) => { 69 | map3.merge(key, value, (v1,v2) => v1 + v2) //将map1和map2中的结果merge到map3中,相同的key,则value相加 70 | }) 71 | map3 72 | },13)//调整分区数量 73 | 74 | val finalRDD = reducedRDD_02.map(kv => {/**第4步:继续减盐,将原本10个分区数的数据恢复到1个*/ 75 | val targetIPWithSalt01 = kv._1 76 | val clientIPMap = kv._2 77 | if (targetIPWithSalt01.startsWith("106.38.176.185")){ 78 | val targetIP = targetIPWithSalt01.split("-")(0) 79 | (targetIP,clientIPMap)//彻底将盐去掉 80 | } 81 | else kv 82 | }).reduceByKey(new MyPartitioner(4), (map1,map2) => { /**合并2个map中的元素,key相同则value值相加*/ 83 | val map3 = new util.HashMap[String,Int](map1) 84 | map2.forEach((key,value) => { 85 | map3.merge(key, value, (v1,v2) => v1 + v2) 86 | }) 87 | map3 88 | })//调整分区数量 89 | 90 | finalRDD.saveAsTextFile(args(1)) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/MyPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.test.data_skew 2 | 3 | import org.apache.spark.Partitioner 4 | 5 | /** 6 | * @DESC: 实现自定义的分区策略 7 | * @Auther: Anryg 8 | * @Date: 2022/10/13 09:52 9 | */ 10 | class MyPartitioner(partitionNum: Int) extends Partitioner{ 11 | override def numPartitions: Int = partitionNum //确定总分区数量 12 | 13 | override def getPartition(key: Any): Int = {//确定数据进入分区的具体策略 14 | val keyStr = key.toString 15 | val keyTag = keyStr.substring(keyStr.length - 1, keyStr.length) 16 | keyTag.toInt % partitionNum 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/test/map_pk_mappartition/MapPartitionTest.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.test.map_pk_mappartition 2 | 3 | import java.util 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.SparkSession 7 | 8 | import scala.collection.mutable 9 | 10 | /** 11 | * @DESC: 12 | * @Auther: Anryg 13 | * @Date: 2022/9/20 10:10 14 | */ 15 | object MapPartitionTest { 16 | 17 | def main(args: Array[String]): Unit = { 18 | val conf = new SparkConf().setAppName("MapPartitionTest")/*.setMaster("local")*/ 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | val rawDF = spark.read/*.option("header",true)*/.csv(args(0)) 21 | 22 | import spark.implicits._ 23 | rawDF.printSchema() 24 | rawDF.show() 25 | val resultDF = rawDF.mapPartitions(iterator => { 26 | //val array = new mutable.ArrayBuffer[(String,String,String,String,String,String,String,String,String)] 27 | //val seq = mutable.Seq[(String,String,String,String,String,String,String,String,String)] 28 | //val list = new util.LinkedList[(String,String,String,String,String,String,String,String,String)] 29 | val set = new mutable.LinkedHashSet[(String,String,String,String,String,String,String,String,String)] 30 | while (iterator.hasNext){ 31 | val next = iterator.next() 32 | val clientIP = next.getAs[String]("_c0") 33 | val domain = next.getAs[String]("_c1").toLowerCase//将域名转成小写 34 | val time = next.getAs[String]("_c2") 35 | val targetIP = next.getAs[String]("_c3") 36 | val rcode = next.getAs[String]("_c4") 37 | val queryType = next.getAs[String]("_c5") 38 | val authRecord = if (next.getAs[String]("_c6") == null 
) "" else next.getAs[String]("_c6").toLowerCase 39 | val addMsg = if (next.getAs[String]("_c7") == null ) "" else next.getAs[String]("_c7") 40 | val dnsIP = next.getAs[String]("_c8") 41 | 42 | set.+=((clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP)) 43 | //array.+=((clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP)) 44 | } 45 | //array.toIterator 46 | set.toIterator 47 | }).toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") 48 | 49 | resultDF.write.csv(args(1)) 50 | } 51 | 52 | 53 | 54 | } 55 | -------------------------------------------------------------------------------- /spark-coding/src/main/scala/com/anryg/bigdata/test/map_pk_mappartition/MapTest.scala: -------------------------------------------------------------------------------- 1 | package com.anryg.bigdata.test.map_pk_mappartition 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | /** 7 | * @DESC: 8 | * @Auther: Anryg 9 | * @Date: 2022/9/20 10:10 10 | */ 11 | object MapTest { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val conf = new SparkConf().setAppName("MapTest")/*.setMaster("local")*/ 15 | val spark = SparkSession.builder().config(conf).getOrCreate() 16 | val rawDF = spark.read/*.option("header",true)*/.csv(args(0))//读取HDFS的数据源 17 | 18 | import spark.implicits._ 19 | rawDF.printSchema() //spark job 1 20 | rawDF.show() //spark job 2 21 | val resultDF = rawDF.map(row => { 22 | val clientIP = row.getAs[String]("_c0") 23 | val domain = row.getAs[String]("_c1").toLowerCase//将域名转成小写 24 | val time = row.getAs[String]("_c2") 25 | val targetIP = row.getAs[String]("_c3") 26 | val rcode = row.getAs[String]("_c4") 27 | val queryType = row.getAs[String]("_c5") 28 | val authRecord = if (row.getAs[String]("_c6") == null ) "" else row.getAs[String]("_c6").toLowerCase 29 | val addMsg = if (row.getAs[String]("_c7") == null ) "" else row.getAs[String]("_c7") 30 | val dnsIP = row.getAs[String]("_c8") 31 | (clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP) 32 | }).toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") 33 | 34 | 35 | /**将转换后的数据写入HDFS*/ 36 | resultDF.write.csv(args(1))//spark job 3 37 | } 38 | } 39 | --------------------------------------------------------------------------------