├── README.md
├── flink-coding
│   ├── pom.xml
│   └── src
│       └── main
│           ├── resources
│           │   └── hive-site.xml
│           └── scala
│               └── com
│                   └── anryg
│                       ├── FlinkDSFromKafka2HDFS.scala
│                       ├── FlinkTest04.scala
│                       ├── hive_cdc
│                       │   ├── FlinkReadKafka2Hive.scala
│                       │   └── FlinkWithHive.scala
│                       └── window_and_watermark
│                           ├── FlinkDSFromKafkaWithWatermark.scala
│                           ├── FlinkSQLFromKafkaWithWatermarkAndWindow.scala
│                           └── FlinkTBFromKafkaWithWatermark.scala
├── pom.xml
├── redis
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── anryg
│                       └── bigdata
│                           ├── IPUtils.java
│                           ├── IpSearch.java
│                           ├── RedisClientUtils.java
│                           └── RedisParam.java
└── spark-coding
    ├── pom.xml
    ├── spark-coding.iml
    └── src
        └── main
            ├── java
            │   └── com
            │       └── anryg
            │           └── bigdata
            │               └── clickhouse
            │                   └── CKSink.java
            └── scala
                └── com
                    └── anryg
                        └── bigdata
                            ├── hive
                            │   ├── ConnectHive.scala
                            │   └── Spark3ConnectHive3.scala
                            ├── streaming
                            │   ├── Kafka2CK.scala
                            │   ├── StreamingProcessHelper.scala
                            │   ├── StructuredStreamingTest.scala
                            │   ├── demo
                            │   │   ├── StructuredStreaming4Kafka2CSV.scala
                            │   │   ├── StructuredStreamingFromKafka.scala
                            │   │   ├── StructuredStreamingFromKafka2ES.scala
                            │   │   ├── StructuredStreamingFromKafka2Hive.scala
                            │   │   ├── StructuredStreamingReadHive.scala
                            │   │   └── window_watermark
                            │   │       └── WorldCountWithWatermark.scala
                            │   ├── dwd
                            │   │   └── StreamingFromOds2Dwd.scala
                            │   └── ods
                            │       └── StreamingSource2HiveOds.scala
                            └── test
                                ├── data_skew
                                │   ├── DataSkew01.scala
                                │   ├── DataSkew02.scala
                                │   └── MyPartitioner.scala
                                └── map_pk_mappartition
                                    ├── MapPartitionTest.scala
                                    └── MapTest.scala
/README.md:
--------------------------------------------------------------------------------
1 | # internet_behavior_project
2 | A big-data project for analyzing users' internet access behavior.
3 |
4 |
5 | About the data source
6 | The data has a very regular layout of 9 fields (already cleansed for you). To make it easy to read, it has been exported as a CSV file whose first row is the schema.
7 |
8 | For easy access, the file is hosted on a cloud drive. The original file is 12 GB, and even after compression it is still about 3 GB. To make sure it is really being used for learning and nothing else, the download link is only shared after you add me on WeChat.
9 |
10 | Here is a walkthrough of the 9 fields and what each of them means:
11 | client_ip: the IP address of the user going online; you can derive the user's approximate location from it, and there are dedicated APIs for that lookup;
12 | domain: the website the user visited; the nature of the site tells you something about the user's browsing behavior;
13 | time: when the user went online;
14 | target_ip: the IP address of the website the user visited;
15 | rcode: the response status code returned for the site, where 0 means a normal response and 2 means abnormal;
16 | query_type: the query type, almost always 1, i.e. normal browsing;
17 | authority_record: the domain actually returned by the web server, which may differ from domain; if it does, the site may be something like a phishing site and is worth analyzing;
18 | add_msg: additional information, almost always empty; if it does contain something, it is worth checking what it actually is;
19 | dns_ip: the DNS server that resolved the site being visited; a DNS server usually serves one region, so records resolved by the same DNS server are likely from the same broad area;
20 |
21 | That covers the field-by-field breakdown; from these explanations you should already have a rough idea of what this dataset can be used for. A minimal sketch of how a record can be modeled in code follows below.
22 |
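23 | Below is a minimal, hypothetical Scala sketch (not part of this repo's source code) of how one record of this dataset could be modeled and parsed. The field names follow the schema above; the default delimiter is an assumption, since the CSV export is comma-separated while the Kafka feed consumed by the Flink jobs in this repo is split on "|".
24 |
25 | ```scala
26 | /** One record of the internet-behavior dataset; field names follow the schema above. */
27 | case class InternetBehaviorRecord(
28 |   clientIp: String,        // client_ip: IP address of the user going online
29 |   domain: String,          // domain: the website the user visited
30 |   time: String,            // time: when the visit happened
31 |   targetIp: String,        // target_ip: resolved IP of the visited website
32 |   rcode: String,           // rcode: 0 = normal response, 2 = abnormal
33 |   queryType: String,       // query_type: almost always 1, i.e. normal browsing
34 |   authorityRecord: String, // authority_record: domain actually returned by the server
35 |   addMsg: String,          // add_msg: additional information, usually empty
36 |   dnsIp: String            // dns_ip: DNS server that resolved the query
37 | )
38 |
39 | object InternetBehaviorRecord {
40 |   /** Parse one delimited line; returns None unless the line has exactly 9 fields. */
41 |   def parse(line: String, delimiter: Char = ','): Option[InternetBehaviorRecord] = {
42 |     val f = line.split(delimiter).map(_.trim)
43 |     if (f.length != 9) None
44 |     else Some(InternetBehaviorRecord(f(0), f(1), f(2), f(3), f(4), f(5), f(6), f(7), f(8)))
45 |   }
46 | }
47 | ```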
--------------------------------------------------------------------------------
/flink-coding/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | internet_behavior_project
7 | com.anryg.bigdata
8 | 1.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | flink-coding
13 |
14 | flink-coding
15 |
16 | http://www.example.com
17 |
18 |
19 | UTF-8
20 | 1.8
21 | 1.8
22 |
23 | 1.15.2
24 | 3.1.0
25 |
26 |
27 |
28 |
29 |
30 | org.apache.flink
31 | flink-streaming-scala_2.12
32 | ${flink.version}
33 |
34 |
35 | commons-math3
36 | org.apache.commons
37 |
38 |
39 |
40 |
41 | org.apache.flink
42 | flink-clients
43 | ${flink.version}
44 |
45 |
46 |
47 | org.apache.flink
48 | flink-connector-kafka
49 | ${flink.version}
50 |
51 |
52 |
53 | org.apache.flink
54 | flink-connector-hive_2.12
55 | ${flink.version}
56 |
57 |
58 | org.apache.hive
59 | hive-exec
60 | ${hadoop.version}
61 |
62 |
63 | calcite-core
64 | org.apache.calcite
65 |
66 |
67 | calcite-linq4j
68 | org.apache.calcite
69 |
70 |
71 |
72 |
73 |
74 |
75 | org.apache.flink
76 | flink-table-api-scala-bridge_2.12
77 | ${flink.version}
78 |
79 |
80 | org.apache.flink
81 | flink-table-planner_2.12
82 | ${flink.version}
83 | provided
84 |
85 |
86 |
87 |
88 | org.apache.flink
89 | flink-connector-elasticsearch7
90 | ${flink.version}
91 |
92 |
93 |
94 | org.apache.hadoop
95 | hadoop-common
96 | ${hadoop.version}
97 |
98 |
99 | commons-compress
100 | org.apache.commons
101 |
102 |
103 |
104 |
105 | org.apache.hadoop
106 | hadoop-client
107 | ${hadoop.version}
108 |
109 |
110 | commons-compress
111 | org.apache.commons
112 |
113 |
114 |
115 |
116 | org.apache.hadoop
117 | hadoop-hdfs
118 | ${hadoop.version}
119 |
120 |
121 |
122 | org.apache.flink
123 | flink-csv
124 | ${flink.version}
125 |
126 |
127 | org.apache.flink
128 | flink-hadoop-compatibility_2.12
129 | ${flink.version}
130 |
131 |
132 |
133 | com.alibaba
134 | fastjson
135 | 1.2.71
136 |
137 |
138 | junit
139 | junit
140 | 4.11
141 | test
142 |
143 |
144 |
145 |
146 | src/main/scala
147 | src/main/test
148 |
149 |
150 |
151 |
152 | org.apache.maven.plugins
153 | maven-shade-plugin
154 | 3.2.0
155 |
156 | true
157 | with-dependencies
158 |
159 |
160 | *:*
161 |
162 |
163 | junit:junit
164 |
165 |
166 |
167 |
168 |
169 | *:*
170 |
171 | META-INF/*.SF
172 | META-INF/*.DSA
173 | META-INF/*.RSA
174 |
175 |
176 |
177 | false
178 |
179 |
184 |
185 |
186 |
187 | package
188 |
189 | shade
190 |
191 |
192 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 | org.codehaus.mojo
218 | build-helper-maven-plugin
219 | 3.0.0
220 |
221 |
222 | add-source
223 | generate-sources
224 |
225 | add-source
226 |
227 |
228 |
229 | src/main/java
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 | net.alchim31.maven
238 | scala-maven-plugin
239 | 3.2.1
240 |
241 |
242 |
243 | compile
244 | testCompile
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
--------------------------------------------------------------------------------
/flink-coding/src/main/resources/hive-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ambari.hive.db.schema.name
5 | hive
6 |
7 |
8 |
9 | atlas.hook.hive.maxThreads
10 | 1
11 |
12 |
13 |
14 | atlas.hook.hive.minThreads
15 | 1
16 |
17 |
18 |
19 | credentialStoreClassPath
20 | /var/lib/ambari-agent/cred/lib/*
21 |
22 |
23 |
24 | datanucleus.autoCreateSchema
25 | false
26 |
27 |
28 |
29 | datanucleus.cache.level2.type
30 | none
31 |
32 |
33 |
34 | datanucleus.fixedDatastore
35 | true
36 |
37 |
38 |
39 | hadoop.security.credential.provider.path
40 | jceks://file/usr/hdp/current/hive-server2/conf/hive-site.jceks
41 |
42 |
43 |
44 | hive.auto.convert.join
45 | true
46 |
47 |
48 |
49 | hive.auto.convert.join.noconditionaltask
50 | true
51 |
52 |
53 |
54 | hive.auto.convert.join.noconditionaltask.size
55 | 2147483648
56 |
57 |
58 |
59 | hive.auto.convert.sortmerge.join
60 | true
61 |
62 |
63 |
64 | hive.auto.convert.sortmerge.join.to.mapjoin
65 | true
66 |
67 |
68 |
69 | hive.cbo.enable
70 | true
71 |
72 |
73 |
74 | hive.cli.print.header
75 | false
76 |
77 |
78 |
79 | hive.cluster.delegation.token.store.class
80 | org.apache.hadoop.hive.thrift.ZooKeeperTokenStore
81 |
82 |
83 |
84 | hive.cluster.delegation.token.store.zookeeper.connectString
85 | hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181
86 |
87 |
88 |
89 | hive.cluster.delegation.token.store.zookeeper.znode
90 | /hive/cluster/delegation
91 |
92 |
93 |
94 | hive.compactor.abortedtxn.threshold
95 | 1000
96 |
97 |
98 |
99 | hive.compactor.check.interval
100 | 300
101 |
102 |
103 |
104 | hive.compactor.delta.num.threshold
105 | 10
106 |
107 |
108 |
109 | hive.compactor.delta.pct.threshold
110 | 0.1f
111 |
112 |
113 |
114 | hive.compactor.initiator.on
115 | true
116 |
117 |
118 |
119 | hive.compactor.worker.threads
120 | 7
121 |
122 |
123 |
124 | hive.compactor.worker.timeout
125 | 86400
126 |
127 |
128 |
129 | hive.compute.query.using.stats
130 | true
131 |
132 |
133 |
134 | hive.convert.join.bucket.mapjoin.tez
135 | false
136 |
137 |
138 |
139 | hive.create.as.insert.only
140 | true
141 |
142 |
143 |
144 | hive.default.fileformat
145 | TextFile
146 |
147 |
148 |
149 | hive.default.fileformat.managed
150 | ORC
151 |
152 |
153 |
154 | hive.driver.parallel.compilation
155 | true
156 |
157 |
158 |
159 | hive.enforce.sortmergebucketmapjoin
160 | true
161 |
162 |
163 |
164 | hive.exec.compress.intermediate
165 | false
166 |
167 |
168 |
169 | hive.exec.compress.output
170 | false
171 |
172 |
173 |
174 | hive.exec.dynamic.partition
175 | true
176 |
177 |
178 |
179 | hive.exec.dynamic.partition.mode
180 | nonstrict
181 |
182 |
183 |
184 | hive.exec.failure.hooks
185 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook
186 |
187 |
188 |
189 | hive.exec.max.created.files
190 | 100000
191 |
192 |
193 |
194 | hive.exec.max.dynamic.partitions
195 | 5000
196 |
197 |
198 |
199 | hive.exec.max.dynamic.partitions.pernode
200 | 2000
201 |
202 |
203 |
204 | hive.exec.orc.split.strategy
205 | HYBRID
206 |
207 |
208 |
209 | hive.exec.parallel
210 | false
211 |
212 |
213 |
214 | hive.exec.parallel.thread.number
215 | 8
216 |
217 |
218 |
219 | hive.exec.post.hooks
220 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook
221 |
222 |
223 |
224 | hive.exec.pre.hooks
225 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook
226 |
227 |
228 |
229 | hive.exec.reducers.bytes.per.reducer
230 | 1083179008
231 |
232 |
233 |
234 | hive.exec.reducers.max
235 | 1009
236 |
237 |
238 |
239 | hive.exec.scratchdir
240 | /tmp/hive
241 |
242 |
243 |
244 | hive.exec.submit.local.task.via.child
245 | true
246 |
247 |
248 |
249 | hive.exec.submitviachild
250 | false
251 |
252 |
253 |
254 | hive.execution.engine
255 | tez
256 |
257 |
258 |
259 | hive.execution.mode
260 | container
261 |
262 |
263 |
264 | hive.fetch.task.aggr
265 | false
266 |
267 |
268 |
269 | hive.fetch.task.conversion
270 | more
271 |
272 |
273 |
274 | hive.fetch.task.conversion.threshold
275 | 1073741824
276 |
277 |
278 |
279 | hive.heapsize
280 | 1024
281 |
282 |
283 |
284 | hive.hook.proto.base-directory
285 | /warehouse/tablespace/external/hive/sys.db/query_data/
286 |
287 |
288 |
289 | hive.limit.optimize.enable
290 | true
291 |
292 |
293 |
294 | hive.limit.pushdown.memory.usage
295 | 0.04
296 |
297 |
298 |
299 | hive.load.data.owner
300 | hive
301 |
302 |
303 |
304 | hive.lock.manager
305 |
306 |
307 |
308 |
309 | hive.map.aggr
310 | true
311 |
312 |
313 |
314 | hive.map.aggr.hash.force.flush.memory.threshold
315 | 0.9
316 |
317 |
318 |
319 | hive.map.aggr.hash.min.reduction
320 | 0.5
321 |
322 |
323 |
324 | hive.map.aggr.hash.percentmemory
325 | 0.5
326 |
327 |
328 |
329 | hive.mapjoin.bucket.cache.size
330 | 10000
331 |
332 |
333 |
334 | hive.mapjoin.hybridgrace.hashtable
335 | false
336 |
337 |
338 |
339 | hive.mapjoin.optimized.hashtable
340 | true
341 |
342 |
343 |
344 | hive.mapred.reduce.tasks.speculative.execution
345 | false
346 |
347 |
348 |
349 | hive.materializedview.rewriting.incremental
350 | false
351 |
352 |
353 |
354 | hive.merge.mapfiles
355 | true
356 |
357 |
358 |
359 | hive.merge.mapredfiles
360 | false
361 |
362 |
363 |
364 | hive.merge.orcfile.stripe.level
365 | true
366 |
367 |
368 |
369 | hive.merge.rcfile.block.level
370 | true
371 |
372 |
373 |
374 | hive.merge.size.per.task
375 | 256000000
376 |
377 |
378 |
379 | hive.merge.smallfiles.avgsize
380 | 16000000
381 |
382 |
383 |
384 | hive.merge.tezfiles
385 | false
386 |
387 |
388 |
389 | hive.metastore.authorization.storage.checks
390 | false
391 |
392 |
393 |
394 | hive.metastore.cache.pinobjtypes
395 | Table,Database,Type,FieldSchema,Order
396 |
397 |
398 |
399 | hive.metastore.client.connect.retry.delay
400 | 5s
401 |
402 |
403 |
404 | hive.metastore.client.socket.timeout
405 | 1800s
406 |
407 |
408 |
409 | hive.metastore.connect.retries
410 | 24
411 |
412 |
413 |
414 | hive.metastore.db.type
415 | MYSQL
416 |
417 |
418 |
419 | hive.metastore.dml.events
420 | true
421 |
422 |
423 |
424 | hive.metastore.event.listeners
425 |
426 |
427 |
428 |
429 | hive.metastore.execute.setugi
430 | true
431 |
432 |
433 |
434 | hive.metastore.failure.retries
435 | 24
436 |
437 |
438 |
439 | hive.metastore.kerberos.keytab.file
440 | /etc/security/keytabs/hive.service.keytab
441 |
442 |
443 |
444 | hive.metastore.kerberos.principal
445 | hive/_HOST@EXAMPLE.COM
446 |
447 |
448 |
449 | hive.metastore.pre.event.listeners
450 | org.apache.hadoop.hive.ql.security.authorization.AuthorizationPreEventListener
451 |
452 |
453 |
454 | hive.metastore.sasl.enabled
455 | false
456 |
457 |
458 |
459 | hive.metastore.server.max.threads
460 | 100000
461 |
462 |
463 |
464 | hive.metastore.transactional.event.listeners
465 | org.apache.hive.hcatalog.listener.DbNotificationListener
466 |
467 |
468 |
469 | hive.metastore.uris
470 | thrift://hdp01.pcl-test.com:9083
471 |
472 |
473 |
474 | hive.metastore.warehouse.dir
475 | /warehouse/tablespace/managed/hive
476 |
477 |
478 |
479 | hive.metastore.warehouse.external.dir
480 | /warehouse/tablespace/external/hive
481 |
482 |
483 |
484 | hive.optimize.bucketmapjoin
485 | true
486 |
487 |
488 |
489 | hive.optimize.bucketmapjoin.sortedmerge
490 | false
491 |
492 |
493 |
494 | hive.optimize.constant.propagation
495 | true
496 |
497 |
498 |
499 | hive.optimize.dynamic.partition.hashjoin
500 | true
501 |
502 |
503 |
504 | hive.optimize.index.filter
505 | true
506 |
507 |
508 |
509 | hive.optimize.metadataonly
510 | true
511 |
512 |
513 |
514 | hive.optimize.null.scan
515 | true
516 |
517 |
518 |
519 | hive.optimize.reducededuplication
520 | true
521 |
522 |
523 |
524 | hive.optimize.reducededuplication.min.reducer
525 | 4
526 |
527 |
528 |
529 | hive.optimize.sort.dynamic.partition
530 | false
531 |
532 |
533 |
534 | hive.orc.compute.splits.num.threads
535 | 10
536 |
537 |
538 |
539 | hive.orc.splits.include.file.footer
540 | false
541 |
542 |
543 |
544 | hive.prewarm.enabled
545 | false
546 |
547 |
548 |
549 | hive.prewarm.numcontainers
550 | 3
551 |
552 |
553 |
554 | hive.repl.cm.enabled
555 |
556 |
557 |
558 |
559 | hive.repl.cmrootdir
560 |
561 |
562 |
563 |
564 | hive.repl.rootdir
565 |
566 |
567 |
568 |
569 | hive.security.metastore.authenticator.manager
570 | org.apache.hadoop.hive.ql.security.HadoopDefaultMetastoreAuthenticator
571 |
572 |
573 |
574 | hive.security.metastore.authorization.auth.reads
575 | true
576 |
577 |
578 |
579 | hive.security.metastore.authorization.manager
580 | org.apache.hadoop.hive.ql.security.authorization.StorageBasedAuthorizationProvider
581 |
582 |
583 |
584 | hive.server2.allow.user.substitution
585 | true
586 |
587 |
588 |
589 | hive.server2.authentication
590 | NONE
591 |
592 |
593 |
594 | hive.server2.authentication.spnego.keytab
595 | HTTP/_HOST@EXAMPLE.COM
596 |
597 |
598 |
599 | hive.server2.authentication.spnego.principal
600 | /etc/security/keytabs/spnego.service.keytab
601 |
602 |
603 |
604 | hive.server2.enable.doAs
605 | true
606 |
607 |
608 |
609 | hive.server2.idle.operation.timeout
610 | 6h
611 |
612 |
613 |
614 | hive.server2.idle.session.timeout
615 | 1d
616 |
617 |
618 |
619 | hive.server2.logging.operation.enabled
620 | true
621 |
622 |
623 |
624 | hive.server2.logging.operation.log.location
625 | /tmp/hive/operation_logs
626 |
627 |
628 |
629 | hive.server2.max.start.attempts
630 | 5
631 |
632 |
633 |
634 | hive.server2.support.dynamic.service.discovery
635 | true
636 |
637 |
638 |
639 | hive.server2.table.type.mapping
640 | CLASSIC
641 |
642 |
643 |
644 | hive.server2.tez.default.queues
645 | default,llap
646 |
647 |
648 |
649 | hive.server2.tez.initialize.default.sessions
650 | false
651 |
652 |
653 |
654 | hive.server2.tez.sessions.per.default.queue
655 | 1
656 |
657 |
658 |
659 | hive.server2.thrift.http.path
660 | cliservice
661 |
662 |
663 |
664 | hive.server2.thrift.http.port
665 | 10001
666 |
667 |
668 |
669 | hive.server2.thrift.max.worker.threads
670 | 500
671 |
672 |
673 |
674 | hive.server2.thrift.port
675 | 10000
676 |
677 |
678 |
679 | hive.server2.thrift.sasl.qop
680 | auth
681 |
682 |
683 |
684 | hive.server2.transport.mode
685 | binary
686 |
687 |
688 |
689 | hive.server2.use.SSL
690 | false
691 |
692 |
693 |
694 | hive.server2.webui.cors.allowed.headers
695 | X-Requested-With,Content-Type,Accept,Origin,X-Requested-By,x-requested-by
696 |
697 |
698 |
699 | hive.server2.webui.enable.cors
700 | true
701 |
702 |
703 |
704 | hive.server2.webui.port
705 | 10002
706 |
707 |
708 |
709 | hive.server2.webui.use.ssl
710 | false
711 |
712 |
713 |
714 | hive.server2.zookeeper.namespace
715 | hiveserver2
716 |
717 |
718 |
719 | hive.service.metrics.codahale.reporter.classes
720 | org.apache.hadoop.hive.common.metrics.metrics2.JsonFileMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.JmxMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.Metrics2Reporter
721 |
722 |
723 |
724 | hive.smbjoin.cache.rows
725 | 10000
726 |
727 |
728 |
729 | hive.stats.autogather
730 | true
731 |
732 |
733 |
734 | hive.stats.dbclass
735 | fs
736 |
737 |
738 |
739 | hive.stats.fetch.column.stats
740 | true
741 |
742 |
743 |
744 | hive.stats.fetch.partition.stats
745 | true
746 |
747 |
748 |
749 | hive.stats.jdbc.timeout
750 | 0
751 |
752 |
753 |
754 | hive.strict.managed.tables
755 | false
756 |
757 |
758 |
759 | hive.support.concurrency
760 | true
761 |
762 |
763 |
764 | hive.tez.auto.reducer.parallelism
765 | true
766 |
767 |
768 |
769 | hive.tez.bucket.pruning
770 | true
771 |
772 |
773 |
774 | hive.tez.cartesian-product.enabled
775 | true
776 |
777 |
778 |
779 | hive.tez.container.size
780 | 7680
781 |
782 |
783 |
784 | hive.tez.cpu.vcores
785 | -1
786 |
787 |
788 |
789 | hive.tez.dynamic.partition.pruning
790 | true
791 |
792 |
793 |
794 | hive.tez.dynamic.partition.pruning.max.data.size
795 | 104857600
796 |
797 |
798 |
799 | hive.tez.dynamic.partition.pruning.max.event.size
800 | 1048576
801 |
802 |
803 |
804 | hive.tez.exec.print.summary
805 | true
806 |
807 |
808 |
809 | hive.tez.input.format
810 | org.apache.hadoop.hive.ql.io.HiveInputFormat
811 |
812 |
813 |
814 | hive.tez.input.generate.consistent.splits
815 | true
816 |
817 |
818 |
819 | hive.tez.java.opts
820 | -server -Djava.net.preferIPv4Stack=true -XX:NewRatio=8 -XX:+UseNUMA -XX:+UseG1GC -XX:+ResizeTLAB -XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps
821 |
822 |
823 |
824 | hive.tez.log.level
825 | INFO
826 |
827 |
828 |
829 | hive.tez.max.partition.factor
830 | 2.0
831 |
832 |
833 |
834 | hive.tez.min.partition.factor
835 | 0.25
836 |
837 |
838 |
839 | hive.tez.smb.number.waves
840 | 0.5
841 |
842 |
843 |
844 | hive.txn.manager
845 | org.apache.hadoop.hive.ql.lockmgr.DbTxnManager
846 |
847 |
848 |
849 | hive.txn.max.open.batch
850 | 1000
851 |
852 |
853 |
854 | hive.txn.strict.locking.mode
855 | false
856 |
857 |
858 |
859 | hive.txn.timeout
860 | 1000
861 |
862 |
863 |
864 | hive.user.install.directory
865 | /user/
866 |
867 |
868 |
869 | hive.vectorized.execution.enabled
870 | true
871 |
872 |
873 |
874 | hive.vectorized.execution.mapjoin.minmax.enabled
875 | true
876 |
877 |
878 |
879 | hive.vectorized.execution.mapjoin.native.enabled
880 | true
881 |
882 |
883 |
884 | hive.vectorized.execution.mapjoin.native.fast.hashtable.enabled
885 | true
886 |
887 |
888 |
889 | hive.vectorized.execution.reduce.enabled
890 | true
891 |
892 |
893 |
894 | hive.vectorized.groupby.checkinterval
895 | 4096
896 |
897 |
898 |
899 | hive.vectorized.groupby.flush.percent
900 | 0.1
901 |
902 |
903 |
904 | hive.vectorized.groupby.maxentries
905 | 100000
906 |
907 |
908 |
909 | hive.zookeeper.client.port
910 | 2181
911 |
912 |
913 |
914 | hive.zookeeper.namespace
915 | hive_zookeeper_namespace
916 |
917 |
918 |
919 | hive.zookeeper.quorum
920 | hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181
921 |
922 |
923 |
924 | javax.jdo.option.ConnectionDriverName
925 | com.mysql.jdbc.Driver
926 |
927 |
928 |
929 | javax.jdo.option.ConnectionURL
930 | jdbc:mysql://hdp01.pcl-test.com/hive
931 |
932 |
933 |
934 | javax.jdo.option.ConnectionUserName
935 | hive
936 |
937 |
938 |
939 | metastore.create.as.acid
940 | true
941 |
942 |
943 |
944 |
953 |
954 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/FlinkDSFromKafka2HDFS.scala:
--------------------------------------------------------------------------------
1 | package com.anryg
2 |
3 | import java.time.Duration
4 |
5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
6 | import org.apache.flink.api.common.serialization.{SimpleStringEncoder, SimpleStringSchema}
7 | import org.apache.flink.configuration.MemorySize
8 | import org.apache.flink.connector.kafka.source.KafkaSource
9 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
10 | import org.apache.flink.core.fs.Path
11 | import org.apache.flink.runtime.state.CheckpointStorage
12 | import org.apache.flink.streaming.api.CheckpointingMode
13 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
14 | import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy
15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
16 |
17 |
18 | /**
19 | * @DESC: Read Kafka data and write it to HDFS via the DataStream API
20 | * @Author: Anryg
21 | * @Date: 2022/8/14 19:08
22 | */
23 | object FlinkDSFromKafka2HDFS {
24 |
25 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020"
26 |
27 | def main(args: Array[String]): Unit = {
28 | //get the execution environment for the streaming job
29 | val env = StreamExecutionEnvironment.getExecutionEnvironment
30 | .enableCheckpointing(3000, CheckpointingMode.EXACTLY_ONCE) //enable checkpointing
31 |
32 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkDSFromKafka2HDFS") //set the HDFS directory for checkpoints
33 |
34 | val kafkaSource = KafkaSource.builder() //build the Kafka source
35 | .setBootstrapServers("192.168.211.107:6667")
36 | .setTopics("qianxin")
37 | .setGroupId("FlinkDSFromKafka2HDFS2")
38 | .setStartingOffsets(OffsetsInitializer.latest())
39 | .setValueOnlyDeserializer(new SimpleStringSchema())
40 | .build()
41 |
42 | import org.apache.flink.streaming.api.scala._ //import implicit conversions
43 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data") //read the source into a DataStream
44 |
45 | val targetDS = kafkaDS.map(line => { //simple ETL on the source data
46 | line.split("\\|")
47 | }).filter(_.length == 9).map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8)))
48 |
49 | /**Filesystem sink strategy introduced after Flink 1.14; it differs from the official docs, so watch out for pitfalls*/
50 | val hdfsSink2 = StreamingFileSink.forRowFormat(new Path(hdfsPrefix + "/tmp/flink_sink3"),
51 | new SimpleStringEncoder[(String,String,String,String,String,String,String,String,String)]("UTF-8"))
52 | //.withBucketAssigner(new DateTimeBucketAssigner) /**the default time-based bucket assigner*/
53 | .withRollingPolicy( //set the rolling (file-splitting) policy; the file naming rule could also be set here, defaults are used for now
54 | DefaultRollingPolicy.builder()
55 | .withRolloverInterval(Duration.ofSeconds(300)) //rollover interval of 5 minutes, i.e. a new file every 5 minutes
56 | .withInactivityInterval(Duration.ofSeconds(20)) //inactivity interval: roll the current file once it has gone this long without new data
57 | .withMaxPartSize(MemorySize.ofMebiBytes(800)) //maximum size of a single part file, set to 800 MB
58 | .build()).build()
59 |
60 | targetDS.addSink(hdfsSink2) //attach the sink to the target DataStream
61 |
62 | env.execute("FlinkDSFromKafka2HDFS") //launch the job
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/FlinkTest04.scala:
--------------------------------------------------------------------------------
1 | package com.anryg
2 |
3 | import com.alibaba.fastjson.JSON
4 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
5 | import org.apache.flink.api.common.serialization.SimpleStringSchema
6 | import org.apache.flink.connector.kafka.source.KafkaSource
7 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
8 | import org.apache.flink.streaming.api.scala._
9 | import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
10 |
11 |
12 | /**
13 | * @DESC: Read Kafka data, convert the DataStream to a Table, and write the result to ES
14 | * @Author: Anryg
15 | * @Date: 2022/8/14 19:08
16 | */
17 | object FlinkTest04 {
18 | case class InternetBehavior(id:String, client_ip:String, domain:String, do_time:String, target_ip:String,rcode:String, query_type:String, authority_record:String, add_msg:String, dns_ip:String)//data class for the current records
19 |
20 | def main(args: Array[String]): Unit = {
21 | val env = StreamExecutionEnvironment.getExecutionEnvironment
22 |
23 | val tableEnv = StreamTableEnvironment.create(env)
24 |
25 | val kafkaSource = KafkaSource.builder()
26 | .setBootstrapServers("192.168.211.107:6667")
27 | .setTopics("test")
28 | .setGroupId("group01")
29 | .setStartingOffsets(OffsetsInitializer.earliest())
30 | .setValueOnlyDeserializer(new SimpleStringSchema())
31 | .build()
32 |
33 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data")
34 | val targetDS = kafkaDS.map(line => {
35 | val rawJson = JSON.parseObject(line) //the raw string is JSON, parse it
36 | val message = rawJson.getString("message") //extract the business payload
37 | val msgArray = message.split(",") //split the fields on the given delimiter
38 | msgArray
39 | }).filter(_.length == 9).map(array => {
40 | InternetBehavior(array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))
41 | })
42 |
43 | val targetTable = tableEnv.fromDataStream(targetDS)//convert to a Table
44 | //targetTable.execute().print()
45 |
46 | /**define the sink*/
47 | tableEnv.executeSql("CREATE TABLE InternetBehavior (\n\tid String,\n client_ip STRING,\n domain STRING,\n do_time STRING,\n target_ip STRING,\n rcode int,\n query_type string,\n authority_record string,\n add_msg string,\n dns_ip string,\n PRIMARY KEY (id) NOT ENFORCED\n) WITH (\n 'connector' = 'elasticsearch-7',\n 'hosts' = 'http://192.168.211.106:9201',\n 'index' = 'internet_behavior-flink'\n)")
48 |
49 | targetTable.executeInsert("InternetBehavior")
50 | //targetDS.addSink()
51 | //targetTable.executeInsert()
52 |
53 | //env.execute("FlinkTest03")
54 |
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/hive_cdc/FlinkReadKafka2Hive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.hive_cdc
2 |
3 | import org.apache.flink.configuration.Configuration
4 | import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect, TableEnvironment}
5 | import org.apache.flink.table.catalog.hive.HiveCatalog
6 |
7 | /**
8 | * @DESC: Flink reads Kafka data and writes it to a dynamically partitioned Hive table
9 | * @Author: Anryg
10 | * @Date: 2022/12/19 10:36
11 | */
12 | object FlinkReadKafka2Hive {
13 |
14 | def main(args: Array[String]): Unit = {
15 | val settings = EnvironmentSettings.newInstance().inStreamingMode()
16 | .withConfiguration(setConf())
17 | .build() //build the settings
18 | val tableEnv = TableEnvironment.create(settings) //create the table environment
19 | setHive(tableEnv)
20 |
21 | /**register the Kafka source*/
22 | getDataSource(tableEnv)
23 |
24 | tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE) //set the SQL dialect to Hive; the dialect can be switched back and forth throughout the session
25 | /**create the Hive table*/
26 | createHiveTable(tableEnv)
27 |
28 | tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT) //switch the SQL dialect back to the default Flink dialect
29 | /**sink the data into the Hive table*/
30 | sinkData(tableEnv)
31 |
32 | }
33 |
34 | /**
35 | * @DESC: Configure the Flink settings
36 | * */
37 | private def setConf(): Configuration ={
38 | val config = new Configuration() //checkpoint-related settings
39 | config.setString("execution.checkpointing.interval","10000")
40 | config.setString("state.backend", "filesystem")
41 | config.setString("state.checkpoints.dir","hdfs://192.168.211.106:8020/tmp/checkpoint/FlinkWithHive")
42 | config
43 | }
44 |
45 | /**
46 | * @DESC: Set up the Hive catalog
47 | * */
48 | private def setHive(tableEnv: TableEnvironment): Unit ={
49 | val name = "hive_test" //pick a name for the catalog
50 | val database = "test" //specify the Hive database
51 | //val hiveConf = "./flink-coding/src/main/resources/" //location of the hive-site.xml configuration file
52 |
53 | /**load the Hive configuration and build the Hive catalog object*/
54 | val hive = new HiveCatalog(name,database, null) //with hiveConf set to null, the program looks for hive-site.xml on the classpath
55 | tableEnv.registerCatalog(name, hive) //register the catalog with Flink's table environment so Flink can access Hive tables directly
56 |
57 | tableEnv.useCatalog(name) //make the current Flink environment use this catalog
58 | }
59 |
60 | /**
61 | * @DESC: Register the Kafka data source
62 | * */
63 | private def getDataSource(tableEnv: TableEnvironment): Unit ={
64 | tableEnv.executeSql(
65 | """
66 | |drop table if exists test.kafkaTable;
67 | """.stripMargin)
68 |
69 | tableEnv.executeSql(
70 | """
71 | |Create table test.kafkaTable(
72 | |client_ip STRING,
73 | |domain STRING,
74 | |`time` STRING,
75 | |target_ip STRING,
76 | |rcode STRING,
77 | |query_type STRING,
78 | |authority_record STRING,
79 | |add_msg STRING,
80 | |dns_ip STRING
81 | |)
82 | |with(
83 | |'connector' = 'kafka',
84 | |'topic' = 'qianxin',
85 | |'properties.bootstrap.servers' = '192.168.211.107:6667',
86 | |'properties.group.id' = 'FlinkWithHive',
87 | |'scan.startup.mode' = 'latest-offset',
88 | |'value.format'='csv', --the source is plain text in CSV format
89 | |'value.csv.field-delimiter'='|' --field delimiter of the text source
90 | |);
91 | """.stripMargin)
92 | }
93 |
94 | /**
95 | * @DESC: 创建hive目标数据表
96 | * */
97 | private def createHiveTable(tableEnv: TableEnvironment): Unit ={
98 | tableEnv.executeSql(
99 | """
100 | |CREATE TABLE if not exists test.kafka_flink_hive (
101 | |client_ip STRING,
102 | |domain STRING,
103 | |target_ip STRING,
104 | |rcode STRING,
105 | |query_type STRING,
106 | |authority_record STRING,
107 | |add_msg STRING,
108 | |dns_ip STRING
109 | |)
110 | |PARTITIONED BY (`time` STRING)
111 | |STORED AS textfile TBLPROPERTIES (
112 | | 'partition.time-extractor.timestamp-pattern'='$time',
113 | | 'sink.partition-commit.trigger'='partition-time',
114 | | 'sink.partition-commit.delay'='1 h',
115 | | 'sink.partition-commit.policy.kind'='metastore,success-file'
116 | |);
117 | """.stripMargin)
118 | }
119 |
120 | /**
121 | * @DESC: Write the data into the target table
122 | * */
123 | private def sinkData(tableEnv: TableEnvironment): Unit ={
124 | tableEnv.executeSql(
125 | """
126 | |INSERT INTO test.kafka_flink_hive
127 | |SELECT client_ip,domain,target_ip,rcode,query_type,authority_record,add_msg,dns_ip,`time`
128 | |FROM test.kafkaTable;
129 | """.stripMargin)
130 | }
131 |
132 | }
133 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/hive_cdc/FlinkWithHive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.hive_cdc
2 |
3 | import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect, TableEnvironment}
4 | import org.apache.flink.table.catalog.hive.HiveCatalog
5 | import org.apache.hadoop.conf.Configuration
6 | import org.apache.hadoop.hive.conf.HiveConf
7 |
8 | /**
9 | * @DESC: Connect Flink to Hive
10 | * @Author: Anryg
11 | * @Date: 2022/12/19 10:36
12 | */
13 | object FlinkWithHive {
14 |
15 | def main(args: Array[String]): Unit = {
16 | val settings = EnvironmentSettings.newInstance().inStreamingMode().build() //use the default settings
17 | val tableEnv = TableEnvironment.create(settings) //create the table environment
18 | tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE) //set the SQL dialect to Hive; the dialect can be switched back and forth throughout the session
19 | setHive(tableEnv)
20 |
21 | /**list the tables in the current database*/
22 | tableEnv.executeSql(
23 | """
24 | |SHOW tables;
25 | """.stripMargin).print()
26 |
27 |
28 | /**sink the data*/
29 | }
30 |
31 | /**
32 | * @DESC: Set up the Hive catalog
33 | * */
34 | private def setHive(tableEnv: TableEnvironment): Unit ={
35 | val name = "hive_test" //pick a name for the catalog
36 | val database = "test" //specify the Hive database
37 | //val hiveConf = "./flink-coding/src/main/resources/" //location of the hive-site.xml configuration file
38 |
39 | /**load the Hive configuration and build the Hive catalog object*/
40 | val hive = new HiveCatalog(name,database, null) //with hiveConf set to null, the program looks for hive-site.xml on the classpath
41 | tableEnv.registerCatalog(name, hive) //register the catalog with Flink's table environment so Flink can access Hive tables directly
42 |
43 | tableEnv.useCatalog(name) //make the current Flink environment use this catalog
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkDSFromKafkaWithWatermark.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.window_and_watermark
2 |
3 | import java.text.SimpleDateFormat
4 | import java.time.Duration
5 | import java.util.Locale
6 |
7 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
8 | import org.apache.flink.api.common.serialization.{SimpleStringEncoder, SimpleStringSchema}
9 | import org.apache.flink.configuration.MemorySize
10 | import org.apache.flink.connector.kafka.source.KafkaSource
11 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
12 | import org.apache.flink.core.fs.Path
13 | import org.apache.flink.streaming.api.CheckpointingMode
14 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
16 | import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, SlidingProcessingTimeWindows, TumblingEventTimeWindows, TumblingProcessingTimeWindows}
17 | import org.apache.flink.streaming.api.windowing.time
18 | import org.apache.flink.streaming.api.windowing.time.Time
19 |
20 |
21 | /**
22 | * @DESC: Read Kafka data with the DataStream API and aggregate it with watermarks and windows
23 | * @Author: Anryg
24 | * @Date: 2022/8/14 19:08
25 | */
26 | object FlinkDSFromKafkaWithWatermark {
27 |
28 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020"
29 |
30 | def main(args: Array[String]): Unit = {
31 | //get the execution environment for the streaming job
32 | val env = StreamExecutionEnvironment.getExecutionEnvironment
33 | .enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE) //enable checkpointing
34 |
35 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkDSFromKafkaWithWatermark") //set the HDFS directory for checkpoints
36 | env.getCheckpointConfig.setExternalizedCheckpointCleanup(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) //retention policy for externalized checkpoints
37 |
38 | val kafkaSource = KafkaSource.builder() //build the Kafka source
39 | .setBootstrapServers("192.168.211.107:6667")
40 | .setTopics("qianxin")
41 | .setGroupId("FlinkDSFromKafkaWithWatermark")
42 | .setStartingOffsets(OffsetsInitializer.latest())
43 | .setValueOnlyDeserializer(new SimpleStringSchema())
44 | .build()
45 |
46 | import org.apache.flink.streaming.api.scala._ //import implicit conversions
47 |
48 | val kafkaDS = env.fromSource(kafkaSource,
49 | WatermarkStrategy.noWatermarks()
50 | ,"kafka-data") //read the source into a DataStream
51 |
52 | val targetDS = kafkaDS.map(line => { //simple ETL on the source data
53 | line.split("\\|")
54 | }).filter(_.length == 9).filter(_(1).endsWith("com"))
55 | .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofHours(10)) //specify the watermark strategy
56 | .withTimestampAssigner(new SerializableTimestampAssigner[Array[String]] {
57 | override def extractTimestamp(element: Array[String], recordTimestamp: Long): Long = {
58 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss")
59 | sdf.parse(element(2)).getTime //the assigned watermark field must be a Long timestamp
60 | }
61 | }))
62 | .map(array => (array(0), 1))
63 | .keyBy(kv => kv._1) //aggregate by client_ip
64 | .window(SlidingProcessingTimeWindows.of(Time.minutes(2), Time.seconds(30))) //specify the window; the assigner here must be based on processing time rather than event time, because the data timestamps lag far behind the current wall-clock time
65 | .sum(1)
66 |
67 | targetDS.print() //print the results
68 |
69 | env.execute("FlinkDSFromKafkaWithWatermark") //launch the job
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkSQLFromKafkaWithWatermarkAndWindow.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.window_and_watermark
2 |
3 | import org.apache.flink.configuration.Configuration
4 | import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment}
5 |
6 |
7 | /**
8 | * @DESC: Read Kafka data with the SQL API and use watermarks and windows to aggregate it
9 | * @Author: Anryg
10 | * @Date: 2022/8/14 19:08
11 | */
12 | object FlinkSQLFromKafkaWithWatermarkAndWindow {
13 |
14 | def main(args: Array[String]): Unit = {
15 | val streamingSetting = EnvironmentSettings.newInstance().inStreamingMode().build()
16 |
17 | val config = new Configuration() //checkpoint-related settings
18 | config.setString("execution.checkpointing.interval","10000")
19 | config.setString("state.backend", "filesystem")
20 | config.setString("state.checkpoints.dir","hdfs://192.168.211.106:8020/tmp/checkpoint/FlinkSQLFromKafkaWithWatermarkAndWindow")
21 |
22 | streamingSetting.getConfiguration.addAll(config)
23 |
24 | val tableEnv = TableEnvironment.create(streamingSetting)
25 |
26 | tableEnv.executeSql(
27 | """
28 | |Create table kafkaTable(
29 | |client_ip STRING,
30 | |domain STRING,
31 | |`time` STRING,
32 | |target_ip STRING,
33 | |rcode STRING,
34 | |query_type STRING,
35 | |authority_record STRING,
36 | |add_msg STRING,
37 | |dns_ip STRING,
38 | |event_time AS to_timestamp(`time`, 'yyyyMMddHHmmss'), --define the event time as the time the data was actually produced; note the time field must be wrapped in backticks
39 | |watermark for event_time as event_time - interval '10' second --declare the watermark on the event_time column
40 | |)
41 | |with(
42 | |'connector' = 'kafka',
43 | |'topic' = 'qianxin',
44 | |'properties.bootstrap.servers' = '192.168.211.107:6667',
45 | |'properties.group.id' = 'FlinkSQLFromKafkaWithWatermarkAndWindow',
46 | |'scan.startup.mode' = 'latest-offset',
47 | |'value.format'='csv', --the source is plain text in CSV format
48 | |'value.csv.field-delimiter'='|' --field delimiter of the text source
49 | |)
50 | """.stripMargin)
51 |
52 | tableEnv.executeSql(
53 | """
54 | |SELECT
55 | |window_start,
56 | |window_end,
57 | |client_ip,
58 | |count(client_ip) as ip_count
59 | |FROM TABLE(
60 | |HOP( --choose the window strategy (hopping window)
61 | |TABLE kafkaTable,
62 | |DESCRIPTOR(event_time),
63 | |INTERVAL '30' SECONDS, --the slide interval
64 | |INTERVAL '2' MINUTES) --the window size
65 | |)
66 | |GROUP BY
67 | |window_start,
68 | |window_end,
69 | |client_ip
70 | |ORDER BY ip_count
71 | |DESC
72 | |LIMIT 10
73 | """.stripMargin
74 | ).print()
75 |
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkTBFromKafkaWithWatermark.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.window_and_watermark
2 |
3 | import java.sql.Timestamp
4 | import java.text.SimpleDateFormat
5 | import java.time.Duration
6 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
7 | import org.apache.flink.api.common.serialization.SimpleStringSchema
8 | import org.apache.flink.connector.kafka.source.KafkaSource
9 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
10 | import org.apache.flink.streaming.api.CheckpointingMode
11 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
12 | import org.apache.flink.streaming.api.scala._
13 | import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
14 |
15 |
16 |
17 | /**
18 | * @DESC: Read Kafka data, go from the DataStream API to the Table API, and use watermarks
19 | * @Author: Anryg
20 | * @Date: 2022/8/14 19:08
21 | */
22 | object FlinkTBFromKafkaWithWatermark {
23 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020"//HDFS address prefix
24 |
25 | def main(args: Array[String]): Unit = {
26 | val env = StreamExecutionEnvironment.getExecutionEnvironment //get the streaming execution environment
27 | .enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE) //enable checkpointing
28 |
29 | val tableEnv = StreamTableEnvironment.create(env) //create the table environment
30 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkTBFromKafkaWithWatermark") //set the HDFS directory for checkpoints
31 | env.getCheckpointConfig.setExternalizedCheckpointCleanup(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) //retention policy for externalized checkpoints
32 |
33 | val kafkaSource = KafkaSource.builder()
34 | .setBootstrapServers("192.168.211.107:6667")
35 | .setTopics("qianxin")
36 | .setGroupId("FlinkTBFromKafkaWithWatermark")
37 | .setStartingOffsets(OffsetsInitializer.latest())
38 | .setValueOnlyDeserializer(new SimpleStringSchema())
39 | .build()
40 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data")
41 | val targetDS = kafkaDS.map(_.split("\\|"))
42 | .filter(_.length == 9)
43 | .filter(_(1).endsWith("com"))
44 | .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(10)) //assign the watermark based on a business field
45 | .withTimestampAssigner(new SerializableTimestampAssigner[Array[String]] {
46 | override def extractTimestamp(element: Array[String], recordTimestamp: Long): Long = { //implement the timestamp extraction for the watermark
47 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss")
48 | sdf.parse(element(2)).getTime
49 | }
50 | }))
51 | .map(array => (array(0), array(2)))
52 | .map(kv => {
53 | val date = kv._2
54 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss").parse(date)
55 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(sdf)
56 | (kv._1, Timestamp.valueOf(time)) //convert the time into the required time attribute, i.e. a Timestamp
57 | })
58 |
59 | import org.apache.flink.table.api._ //import implicit conversions; otherwise the $ syntax below is not recognized
60 |
61 | val targetTable = tableEnv.fromDataStream(targetDS)
62 | .as("client_ip", "time") //assign the schema (column names)
63 | .window(
64 | Slide over 1.minute every 30.seconds() on $"time" as $"w" //add the sliding window
65 | )
66 | .groupBy($"client_ip", $"w")
67 | .select(
68 | $"client_ip",
69 | $"w".start(), //时间窗口的开始时间
70 | $"w".end(), //时间窗口的解释时间
71 | $"client_ip".count() as "count"
72 | )
73 | .orderBy($"count")
74 | .limit(10)
75 | targetTable.execute().print()
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.anryg.bigdata
8 | internet_behavior_project
9 | pom
10 | 1.0-SNAPSHOT
11 |
12 | spark-coding
13 | flink-coding
14 | redis
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/redis/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | internet_behavior_project
7 | com.anryg.bigdata
8 | 1.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | com.anryg.bigdata
13 | redis
14 | 1.0-SNAPSHOT
15 | jar
16 | redis
17 |
18 | http://www.example.com
19 |
20 |
21 | UTF-8
22 | 1.8
23 | 1.8
24 |
25 |
26 |
27 |
28 | redis.clients
29 | jedis
30 | 3.3.0
31 |
32 |
33 | junit
34 | junit
35 | 4.11
36 | test
37 |
38 |
39 |
40 |
41 | src/main/scala
42 |
43 |
44 |
45 | org.apache.maven.plugins
46 | maven-shade-plugin
47 | 3.1.0
48 |
49 | true
50 | with-dependencies
51 |
52 |
53 | *:*
54 |
55 | META-INF/*.SF
56 | META-INF/*.DSA
57 | META-INF/*.RSA
58 |
59 |
60 |
61 |
62 |
63 |
64 | junit:junit
65 |
66 |
67 |
68 |
69 |
70 | package
71 |
72 | shade
73 |
74 |
75 |
76 |
77 | com.google.guava
78 | com.shade2.google.guava
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 | org.codehaus.mojo
91 | build-helper-maven-plugin
92 | 3.0.0
93 |
94 |
95 | add-source
96 | generate-sources
97 |
98 | add-source
99 |
100 |
101 |
102 | src/main/java
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 | net.alchim31.maven
111 | scala-maven-plugin
112 | 3.2.1
113 |
114 |
115 |
116 | compile
117 | testCompile
118 |
119 |
120 |
121 | -make:transitive
122 | -dependencyfile
123 | ${project.build.directory}/.scala_dependencies
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/IPUtils.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 | //import com.googlecode.ipv6.IPv6Network;
4 | import org.slf4j.Logger;
5 | import org.slf4j.LoggerFactory;
6 | import redis.clients.jedis.Jedis;
7 |
8 | import java.io.BufferedReader;
9 | import java.io.FileInputStream;
10 | import java.io.InputStreamReader;
11 | import java.math.BigInteger;
12 | import java.net.InetAddress;
13 | import java.net.UnknownHostException;
14 | import java.util.HashMap;
15 | import java.util.Map;
16 |
17 | /**
18 | * @DESC: Utility operations on IP address data
19 | * @Author Anryg
20 | * */
21 |
22 | public class IPUtils {
23 | private static Logger logger = LoggerFactory.getLogger(IPUtils.class);
24 |
25 |
26 | /**
27 | * @DESC: Import the IP addresses from the local ip.merge.txt file into a Redis zset
28 | * @param filePath : file mapping IP address ranges to geographic locations
29 | * @param dbNo : the Redis database number
30 | * */
31 | public static void ipCountryImport(String filePath, int dbNo) throws Exception {
32 | FileInputStream inputStream = new FileInputStream(filePath);
33 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
34 | String line = null; /*each line read from the file*/
35 | HashMap map = new HashMap(1024,1); //key is the assembled record, value is the end IP of the range
36 | int i = 0;
37 | while((line=bufferedReader.readLine()) != null){
38 | String[] args=line.split("\\|");
39 | String ipStart=args[0];
40 | String ipEnd=args[1];
41 | //Long ipStartLong= IPUtils.ip2Long(ipStart);
42 | Long ipEndLong= IPUtils.ip2Long(ipEnd); //convert the end IP of each range into a long value
43 | String country = args[2]; //country
44 | String province = args[4]; //province
45 | String city = args[5]; //city
46 | String operator = args[6]; //carrier/ISP
47 | StringBuilder rowBuffer = new StringBuilder(11); //holds the assembled IP range and location information
48 | rowBuffer.append(ipStart).append("-").append(ipEnd).append("-").append(country).append("-")
49 | .append(province).append("-").append(city).append("-").append(operator);
50 | map.put(rowBuffer.toString(),ipEndLong.doubleValue());
51 | ++i;
52 | if (i == 1024) {/**flush to Redis in batches of 1024*/
53 | toRedis(RedisClientUtils.getSingleRedisClient(),map, dbNo,"ipAndAddr");
54 | map.clear();
55 | i = 0;
56 | }
57 | }
58 | if (map.size() > 0) toRedis(RedisClientUtils.getSingleRedisClient(),map, dbNo,"ipAndAddr");
59 | }
60 |
61 | /**
62 | * @DESC: Convert an IP address to its decimal (long) value
63 | * */
64 | public static long ip2Long(String ipstr) {
65 | InetAddress ip = null;
66 | try {
67 | ip = InetAddress.getByName(ipstr);
68 | } catch (UnknownHostException e) {
69 | logger.error("UnknownHost...",e);
70 | }
71 | byte[] octets = ip.getAddress();
72 | long result = 0;
73 | for (byte octet : octets) {
74 | result <<= 8;
75 | result |= octet & 0xff;
76 | }
77 | return result;
78 | }
79 |
80 | /**
81 | * @DESC: Convert a decimal value back into an IPv4 address string
82 | * */
83 | public static String Long2Ip(long ten) {
84 | StringBuilder sb = new StringBuilder();
85 | for (int i = 0; i < 4; i++) {
86 | sb.insert(0, Long.toString(ten & 0xff));
87 | if (i < 3) {
88 | sb.insert(0, '.');
89 | }
90 | ten = ten >> 8;
91 | }
92 | return sb.toString();
93 | }
94 |
95 | /**
96 | * Compute an IPv4 address range from an IPv4 address and prefix length, e.g. 192.168.1.53/27 --> 3232235808,3232235839
97 | * @param ipAndMask
98 | * @return the IPv4 address range
99 | */
100 | public static long[] getIPLongScope(String ipAndMask) {
101 | String[] ipArr = ipAndMask.split("/");
102 | if (ipArr.length != 2) {
103 | throw new IllegalArgumentException("invalid ipAndMask with: "
104 | + ipAndMask);
105 | }
106 | int netMask = Integer.valueOf(ipArr[1].trim());
107 | if (netMask < 0 || netMask > 32) {
108 | throw new IllegalArgumentException("invalid ipAndMask with: "
109 | + ipAndMask);
110 | }
111 | long ipInt = ip2Long(ipArr[0]);
112 | long netIP = ipInt & (0xFFFFFFFF << (32 - netMask));
113 | long hostScope = (0xFFFFFFFF >>> netMask);
114 | return new long[] { netIP, netIP + hostScope };
115 | }
116 |
117 | /**
118 | * Compute an IPv4 address range from an IPv4 address and subnet mask, e.g. ip: 192.168.1.53, mask: 255.255.255.224 --> 3232235808,3232235839
119 | * @param ipaddr,mask IPv4 address and subnet mask, e.g. 192.168.1.53,255.255.255.224
120 | * @return the IPv4 address range as a string
121 | */
122 | public static String getIPNetworkAddr(String ipaddr, String mask){
123 | //AND the IP address with the subnet mask to get the network address
124 | Long ipNetworkAddr = ip2Long(ipaddr)&ip2Long(mask);
125 | Long ipBroadcastAddr = ((ipNetworkAddr^ip2Long(mask))^0xffffffffL);
126 |
127 | //System.out.println(Long.toBinaryString(ipBroadcastAddr));
128 | return Long2Ip(ipNetworkAddr+1)+"-->"+Long2Ip(ipBroadcastAddr-1);
129 | }
130 |
131 | /**
132 | * Convert an IPv6 address string to an integer
133 | * @param ipv6
134 | * @return
135 | */
136 | public static BigInteger ipv6ToBigInt(String ipv6)
137 | {
138 |
139 | int compressIndex = ipv6.indexOf("::");
140 | if (compressIndex != -1)
141 | {
142 | String part1s = ipv6.substring(0, compressIndex);
143 | String part2s = ipv6.substring(compressIndex + 1);
144 | BigInteger part1 = ipv6ToBigInt(part1s);
145 | BigInteger part2 = ipv6ToBigInt(part2s);
146 | int part1hasDot = 0;
147 | char[] ch = part1s.toCharArray();
148 | for (char c : ch)
149 | {
150 | if (c == ':')
151 | {
152 | part1hasDot++;
153 | }
154 | }
155 | // ipv6 has most 7 dot
156 | return part1.shiftLeft(16 * (7 - part1hasDot )).add(part2);
157 | }
158 | String[] str = ipv6.split(":");
159 | BigInteger big = BigInteger.ZERO;
160 | for (int i = 0; i < str.length; i++)
161 | {
162 | //::1
163 | if (str[i].isEmpty())
164 | {
165 | str[i] = "0";
166 | }
167 | big = big.add(BigInteger.valueOf(Long.valueOf(str[i], 16))
168 | .shiftLeft(16 * (str.length - i - 1)));
169 | }
170 | return big;
171 | }
172 |
173 |
174 | /**
175 | * @Author liuxh02
176 | * @Description Convert an integer to an IPv6 address string
177 | * @Date 2020/8/5
178 | * @Param [big]
179 | * @return java.lang.String
180 | **/
181 | public static String bigIntToipv6(BigInteger big)
182 | {
183 | String str = "";
184 | BigInteger ff = BigInteger.valueOf(0xffff);
185 | for (int i = 0; i < 8 ; i++)
186 | {
187 | str = big.and(ff).toString(16) + ":" + str;
188 |
189 | big = big.shiftRight(16);
190 | }
191 | //the last :
192 | str = str.substring(0, str.length() - 1);
193 |
194 | return str.replaceFirst("(^|:)(0+(:|$)){2,8}", "::");
195 | }
196 |
197 |
198 | /**
199 | * @DESC: Write to Redis in batches
200 | * */
201 | private static Long toRedis(Jedis jedis, Map map, int dbno, String key) {
202 | try {
203 | jedis.select(dbno);
204 | return jedis.zadd(key,map);
205 | } finally {
206 | RedisClientUtils.returnResource(jedis);
207 | }
208 |
209 | }
210 |
211 |
212 | /**
213 | * @Author liuxh02
214 | * @Description Compute the IP range from an IPv6 address and subnet mask, returning an array
215 | * @Date 2020/8/6
216 | * @Param [start IP, end IP]
217 | * @return java.math.BigInteger[]
218 | **/
219 | /* public static BigInteger[] getIPV6LongScope(String ipv6AndMask ){
220 |
221 | IPv6Network network = IPv6Network.fromString(ipv6AndMask);
222 | BigInteger start=network.getFirst().toBigInteger();//start IP
223 | BigInteger end=network.getLast().toBigInteger();//end IP
224 | System.out.println(end);
225 | return new BigInteger[]{start,end};
226 |
227 | }*/
228 | }
229 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/IpSearch.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 |
4 | import org.slf4j.LoggerFactory;
5 | import redis.clients.jedis.Jedis;
6 | import redis.clients.jedis.Tuple;
7 |
8 | import java.math.BigInteger;
9 | import java.net.UnknownHostException;
10 | import java.util.Set;
11 |
12 | public class IpSearch {
13 | private static org.slf4j.Logger logger = LoggerFactory.getLogger(IpSearch.class);
14 |
15 | /**
16 | * Look up the location information for an IP in Redis database db1
17 | * @param jedis
18 | * @param ip
19 | * @return the range the given IP falls into
20 | * @throws UnknownHostException
21 | */
22 | public static String getAddrByIP(Jedis jedis, String ip) {
23 | try {
24 | jedis.select(1);
25 | long ipscore = IPUtils.ip2Long(ip);
26 | Set tuples = jedis.zrangeByScoreWithScores("ipAndAddr", String.valueOf(ipscore),"+inf",0,1);
27 | String value = "";
28 | for (Tuple tuple : tuples) {
29 | value = tuple.getElement();
30 | }
31 | String[] valueSplits = value.split("-");
32 | long begin = IPUtils.ip2Long(valueSplits[0]);
33 | long end = IPUtils.ip2Long(valueSplits[1]);
34 | //String[] scope = value.substring(startpos+1,endpos).split(",");
35 | if(ipscore >= begin && ipscore <= end){
36 | return value;
37 | }
38 | else return "";
39 | } finally {
40 | //RedisClientUtils.returnResource(jedis);/**return the connection to the pool*/
41 |
42 | }
43 | }
44 | /**
45 | * @Author liuxh02
46 | * @Description Look up IPv4/IPv6 address information in Redis database db2
47 | * @Date 2020/8/6
48 | * @Param [jedis, ip]
49 | * @return java.lang.String
50 | **/
51 | public static String getAddr(Jedis jedis, String ip) {
52 | jedis.select(2);
53 | //convert the IP address to an integer
54 | BigInteger ipscore=null;
55 | if(ip.contains(":")){
56 | //IPv6 to integer
57 | ipscore=IPUtils.ipv6ToBigInt(ip);
58 | }else{
59 | //IPv4 to integer
60 | ipscore = BigInteger.valueOf(IPUtils.ip2Long(ip));
61 | }
62 | Set tuples = jedis.zrangeByScoreWithScores("ipAndAddr",ipscore.toString(),"+inf",0,1);
63 | String value = "";
64 | for (Tuple tuple : tuples) {
65 | value = tuple.getElement();
66 | }
67 | String[] valueArray = value.split("-");
68 | //extract the IP and subnet mask
69 | String ipAndMask=valueArray[0];
70 | BigInteger start=null;
71 | BigInteger end=null;
72 | if(ipAndMask.contains(":")){
73 | //IPv6 range calculation (would rely on IPUtils.getIPV6LongScope, which is currently commented out, so ipv6AndMask stays null here)
74 | BigInteger[] ipv6AndMask = null;
75 | start=ipv6AndMask[0];
76 | end=ipv6AndMask[1];
77 | }else{
78 | //IPv4 range calculation
79 | long[] ipv4AndMask=IPUtils.getIPLongScope(ipAndMask);
80 | start= BigInteger.valueOf(ipv4AndMask[0]);
81 | end= BigInteger.valueOf(ipv4AndMask[1]);
82 | }
83 | if(ipscore.compareTo(start)>0 && ipscore.compareTo(end)<0){
84 | return value;
85 | }
86 | else return "";
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/RedisClientUtils.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 | import redis.clients.jedis.Jedis;
4 | import redis.clients.jedis.JedisPool;
5 | import redis.clients.jedis.JedisPoolConfig;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | import java.util.Set;
10 |
11 | /**
12 | * Created by Anryg on 2018/5/9.
13 | */
14 | public class RedisClientUtils implements RedisParam {
15 | private static volatile JedisPool jedisPool = null;/**managed with a connection pool to avoid the various issues of connecting to Redis from multiple threads*/
16 | private static volatile Jedis jedis = null;
17 |
18 | /**
19 | * @DESC: Initialize the connection pool
20 | * */
21 | private static void initPool(){
22 | JedisPoolConfig config = null;
23 | try {
24 | config = new JedisPoolConfig();
25 | config.setMaxTotal(MAX_ACTIVE);
26 | config.setMaxIdle(MAX_IDLE);
27 | config.setMaxWaitMillis(MAX_WAIT);
28 | config.setTestOnBorrow(TEST_ON_BORROW);//validate connections on borrow so every one handed out is usable
29 | config.setTestWhileIdle(true);//validate connections while they sit idle
30 | config.setTestOnReturn(true);//validate connections when they are returned to the pool
31 | } catch (Exception e) {
32 | throw e;
33 | }
34 | /*milliseconds the idle object evictor sleeps between two scans
35 | config.setTimeBetweenEvictionRunsMillis(30000);
36 | maximum number of objects examined per evictor scan
37 | config.setNumTestsPerEvictionRun(10);
38 | minimum time an object must stay idle before the idle object evictor may evict it; only meaningful when timeBetweenEvictionRunsMillis is greater than 0
39 | config.setMinEvictableIdleTimeMillis(60000);*/
40 | jedisPool = new JedisPool(config, HOSTS.split(",")[0], PORT, TIMEOUT, PASSWD);
41 | }
42 | /**
43 | *@DESC: Ensure only one connection pool is initialized in a multi-threaded environment
44 | */
45 | private static void poolInit() {
46 | if (jedisPool == null){
47 | synchronized (RedisClientUtils.class){
48 | if (jedisPool == null) initPool();
49 | }
50 | }
51 | }
52 |
53 | /**
54 | * @DESC: Get the connection pool object; with multiple threads, use it to obtain multiple Jedis clients
55 | * */
56 | public static JedisPool getJedisPool(){
57 | poolInit();
58 | return jedisPool;
59 | }
60 | /**
61 | * @DESC: Get a Jedis instance synchronously; suitable for single-threaded use
62 | * @return Jedis
63 | */
64 | public static Jedis getSingleRedisClient() {
65 | poolInit();
66 | if (jedis == null){
67 | synchronized (RedisClientUtils.class){
68 | if (jedis == null) {
69 | jedis = jedisPool.getResource();
70 | }
71 | }
72 | }
73 | return jedis;
74 | }
75 | /**
76 | * @DESC: Release the Jedis resource by returning it to the connection pool
77 | * @param jedis
78 | */
79 | public static void returnResource(final Jedis jedis) {
80 | if (jedis != null && jedisPool != null) jedis.close();
81 | }
82 |
83 | /**
84 | * @DESC: Delete all data in a given database
85 | * */
86 | public static void delDataPerDB(Jedis redis, int dbNum){
87 | redis.select(dbNum);
88 | Set keySet = redis.keys("*");
89 | for (String key:keySet){
90 | try {
91 | Set fields = redis.hkeys(key);
92 | redis.hdel(key,fields.toArray(new String[fields.size()]));//convert the Set to an Array
93 | } catch (Exception e) {
94 | throw e;
95 | }finally {
96 | //redis.close();
97 | }
98 | }
99 | }
100 |
101 | /**
102 | * @DESC: Store a set object
103 | * */
104 | public static boolean save2RedisBySet(Jedis redis, int redisNo, String key, String[] strArray){
105 | redis.select(redisNo);
106 | long count = 0;
107 | try {
108 | count = redis.sadd(key,strArray);/**only distinct members are stored*/
109 | } catch (Exception e) {
110 | e.printStackTrace();
111 | } finally {
112 | redis.close();
113 | }
114 | if (count > 0) return true;
115 | else return false;
116 | }
117 |
118 | /**
119 | * @DESC: Store key:value pairs in bulk
120 | * @param kvList : a container where odd positions are keys and even positions are values; the total count must be even
121 | * */
122 | public static void save2RedisByKVs(Jedis redis, int redisNo, List kvList){
123 | redis.select(redisNo);
124 | try {
125 | redis.mset(kvList.toArray(new String[kvList.size()]));
126 | } finally {
127 | redis.close();
128 | }
129 | }
130 | /**
131 | * @DESC: Get the members of a set object
132 | * */
133 | public static Set getSetResult(Jedis redis, int redisNo, String key){
134 | redis.select(redisNo);
135 | Set scanResult = null;
136 | try {
137 | scanResult = redis.smembers(key);
138 | } catch (Exception e) {
139 | e.printStackTrace();
140 | } finally {
141 | redis.close();
142 | }
143 | return scanResult;
144 | }
145 |
146 | /**
147 | *@DESC: delete the given keys (in whichever database the caller has currently selected)
148 | * */
149 | public static void deleteKeys(Jedis redis , List<String> keys){
150 | redis.del(keys.toArray(new String[keys.size()]));
151 | }
152 |
153 | /**
154 | * @DESC: delete the given fields under a hash key (in whichever database the caller has currently selected)
155 | * */
156 | public static void deleteFieldByKey(Jedis redis, String key, List<String> fields){
157 | redis.hdel(key,fields.toArray(new String[fields.size()]));
158 | }
159 |
160 | }
161 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/RedisParam.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 |
4 |
5 | /**
6 | * Created by Anryg on 2018/5/9.
7 | * @DESC: provides the basic Redis connection settings
8 | */
9 | public interface RedisParam {
10 | String HOSTS = "192.168.211.106";/**Redis server list; currently a single node*/
11 | int PORT = 6379;
12 | String PASSWD = "pcl@2020";
13 | //maximum number of connection instances the pool may hold; the default is 8;
14 | //a value of -1 means no limit; once the pool has handed out maxActive jedis instances its state becomes exhausted
15 | int MAX_ACTIVE = 1500;
16 | //maximum number of idle jedis instances the pool may keep; the default is also 8
17 | int MAX_IDLE = 100;
18 | //maximum time (in milliseconds) to wait for an available connection; the default of -1 means wait forever; if exceeded, a JedisConnectionException is thrown
19 | int MAX_WAIT = 100 * 1000;
20 | int TIMEOUT = 100 * 1000;//connection/read timeout in milliseconds
21 | //whether to validate a jedis instance before it is borrowed; if true, every instance handed out is usable
22 | boolean TEST_ON_BORROW = true;
23 | }
24 |
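
A minimal usage sketch (not part of the repository) showing how the two classes above work together from Scala; the database number 1 and the key "black_domain" are illustrative only, and the Redis instance configured in RedisParam is assumed to be reachable:

    import com.anryg.bigdata.RedisClientUtils

    object RedisQuickCheck {
      def main(args: Array[String]): Unit = {
        val pool = RedisClientUtils.getJedisPool                 //lazily initializes the pool on first use
        //both helpers select the database and return the client to the pool in their own finally blocks
        val added = RedisClientUtils.save2RedisBySet(pool.getResource, 1, "black_domain", Array("a.com", "b.com"))
        val members = RedisClientUtils.getSetResult(pool.getResource, 1, "black_domain")
        println(s"new members added: $added, current set: $members")
      }
    }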
--------------------------------------------------------------------------------
/spark-coding/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | internet_behavior_project
7 | com.anryg.bigdata
8 | 1.0-SNAPSHOT
9 |
10 | 4.0.0
11 | spark-coding
12 | spark-coding
13 |
14 | http://www.example.com
15 |
16 |
17 | UTF-8
18 | 1.8
19 | 1.8
20 |
21 |
22 |
23 |
24 |
25 | org.apache.spark
26 | spark-core_2.12
27 | 3.2.0
28 |
29 |
30 |
31 | org.apache.spark
32 | spark-sql_2.12
33 | 3.2.0
34 |
35 |
36 |
37 | org.apache.spark
38 | spark-sql-kafka-0-10_2.12
39 | 3.2.0
40 |
41 |
42 |
43 | com.alibaba
44 | fastjson
45 | 1.2.71
46 |
47 |
48 |
49 | org.apache.spark
50 | spark-hive_2.12
51 | 3.2.0
52 |
53 |
54 |
55 | com.anryg.bigdata
56 | redis
57 | 1.0-SNAPSHOT
58 |
59 |
60 |
61 | org.elasticsearch
62 | elasticsearch-spark-30_2.12
63 | 7.12.0
64 |
65 |
66 | scala-library
67 | org.scala-lang
68 |
69 |
70 | spark-core_2.12
71 | org.apache.spark
72 |
73 |
74 | spark-sql_2.12
75 | org.apache.spark
76 |
77 |
78 | spark-catalyst_2.12
79 | org.apache.spark
80 |
81 |
82 | slf4j-api
83 | org.slf4j
84 |
85 |
86 |
87 |
88 | commons-httpclient
89 | commons-httpclient
90 | 3.1
91 |
92 |
93 |
94 |
95 | com.clickhouse
96 | clickhouse-jdbc
97 | 0.4.6
98 |
99 |
100 |
101 |
102 |
103 | junit
104 | junit
105 | 4.11
106 | test
107 |
108 |
109 |
110 |
111 |
112 |
113 | src/main/scala
114 | src/main/test
115 |
116 |
117 |
118 |
119 | org.apache.maven.plugins
120 | maven-shade-plugin
121 | 3.2.0
122 |
123 | true
124 | with-dependencies
125 |
126 |
127 | *:*
128 |
129 |
130 | junit:junit
131 |
132 |
133 |
134 |
135 |
136 | *:*
137 |
138 | META-INF/*.SF
139 | META-INF/*.DSA
140 | META-INF/*.RSA
141 |
142 |
143 |
144 | false
145 |
146 |
151 |
152 |
153 |
154 | package
155 |
156 | shade
157 |
158 |
159 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 | org.codehaus.mojo
185 | build-helper-maven-plugin
186 | 3.0.0
187 |
188 |
189 | add-source
190 | generate-sources
191 |
192 | add-source
193 |
194 |
195 |
196 | src/main/java
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 | net.alchim31.maven
205 | scala-maven-plugin
206 | 3.2.1
207 |
208 |
209 |
210 | compile
211 | testCompile
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
--------------------------------------------------------------------------------
/spark-coding/spark-coding.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/spark-coding/src/main/java/com/anryg/bigdata/clickhouse/CKSink.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.clickhouse;
2 |
3 | import com.clickhouse.jdbc.ClickHouseConnection;
4 | import com.clickhouse.jdbc.ClickHouseDataSource;
5 | import org.apache.spark.sql.ForeachWriter;
6 | import org.apache.spark.sql.Row;
7 |
8 | import java.sql.PreparedStatement;
9 | import java.sql.SQLException;
10 |
11 | /**
12 | * @DESC: custom Structured Streaming sink that writes data to ClickHouse over JDBC
13 | * @Author: Anryg
14 | * @Date: 2023/7/3 20:24
15 | */
16 | public class CKSink extends ForeachWriter<Row> {
17 | private static final String jdbcUrl = "jdbc:ch://192.168.211.107:8123,192.168.211.108:8123,192.168.211.109:8123/local_db"; //list every node of the cluster so the local table can always be found
18 | //private static final Properties properties = new Properties();
19 | private static volatile ClickHouseDataSource ckDataSource;
20 | private static volatile ClickHouseConnection connection;
21 |
22 | private static final String user = "default"; //use ClickHouse's default user
23 | private static final String pwd = ""; //the default user has no password set
24 | private static final String tableName = "dns_logs_from_spark"; //target ClickHouse table to write into
25 |
26 |
27 | /**
28 | * @DESC: preparation before any data is processed: create the database connection and keep it a singleton; open() is invoked once per partition
29 | * */
30 | @Override
31 | public boolean open(long partitionId, long epochId){
32 | if (ckDataSource == null || connection == null) {
33 | synchronized (CKSink.class){
34 | if (ckDataSource == null || connection == null) {
35 | try {
36 | ckDataSource = new ClickHouseDataSource(jdbcUrl);
37 | connection = ckDataSource.getConnection(user, pwd);
38 | } catch (SQLException e) {
39 | e.printStackTrace();
40 | System.exit(-1); //exit the process if the connection cannot be established
41 | }
42 | }
43 | }
44 | }
45 |
46 | if (connection == null) return false;
47 | else return true;
48 | }
49 |
50 |
51 | /**
52 | * @DESC: called for every Row of the partition once open() has returned true
53 | * */
54 | @Override
55 | public void process(Row value) {
56 | try (PreparedStatement preparedStatement = connection.prepareStatement(
57 | "insert into " + tableName + " values(?,?,?,?,?,?,?,?,?)")) { //try-with-resources so the statement is closed after every row
58 | preparedStatement.setString(1,value.getString(0));
59 | preparedStatement.setString(2,value.getString(1));
60 | preparedStatement.setString(3,value.getString(2));
61 | preparedStatement.setString(4,value.getString(3));
62 | preparedStatement.setString(5,value.getString(4));
63 | preparedStatement.setString(6,value.getString(5));
64 | preparedStatement.setString(7,value.getString(6));
65 | preparedStatement.setString(8,value.getString(7));
66 | preparedStatement.setString(9,value.getString(8));
67 | preparedStatement.addBatch(); //a single-row batch: each Row is inserted on its own
68 | preparedStatement.executeBatch();
69 | } catch (SQLException e) {
70 | e.printStackTrace();
71 | System.exit(-1); //exit the process if the insert fails
72 | }
73 |
74 | }
75 |
76 | /**
77 | * @DESC: called after the two methods above have finished; normally used to close the connection
78 | * */
79 | @Override
80 | public void close(Throwable errorOrNull) {
81 | //the connection is long-lived and deliberately left open
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/hive/ConnectHive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.hive
2 |
3 | import org.apache.spark.SparkConf
4 | //import org.apache.spark.sql.SparkSession
5 |
6 | /**
7 | * @DESC: read a Hive table through SparkSession (kept commented out for reference)
8 | * @Author: Anryg
9 | * @Date: 2022/4/8 16:30
10 | */
11 | /*object ConnectHive {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf()
15 | conf.setAppName("connect_hive")
16 | val sparkSession = SparkSession.builder().config(conf)
17 | //.config("spark.sql.warehouse.dir","hdfs://192.168.211.106:8020/warehouse/tablespace/managed/hive")
18 | //.config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
19 | //.config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
20 | .enableHiveSupport()
21 | .getOrCreate()
22 |
23 | val result = sparkSession.sql("select * from xas.as_bgp_bak limit 3")
24 | result.show()
25 |
26 | sparkSession.close()
27 | sparkSession.stop()
28 | }
29 | }*/
30 |
31 |
32 | /*val hive = HiveWarehouseSession.session(sparkSession).build()//obtain the HiveWarehouseSession (HWC) object
33 | val result = hive.executeQuery("select * from doi_data limit 2")//query the Hive table
34 | result.show()*/
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/hive/Spark3ConnectHive3.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.hive
2 |
3 | import org.apache.spark.SparkConf
4 | //import org.apache.spark.sql.SparkSession
5 |
6 | /**
7 | * @DESC: the table must be a non-ACID table, otherwise the read silently returns an empty result instead of failing
8 | * @Author: Anryg
9 | * @Date: 2022/4/8 16:30
10 | */
11 | /*object Spark3ConnectHive3 {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf()
15 | conf.setAppName("connect_hive")
16 | val sparkSession = SparkSession.builder().config(conf)
17 | //.config("spark.sql.warehouse.dir","hdfs://192.168.211.106:8020/warehouse/tablespace/managed/hive")
18 | //.config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
19 | //.config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
20 | //.config("spark.sql.hive.strict.managed.tables", false)
21 | .enableHiveSupport()
22 | .getOrCreate()
23 |
24 | val result = sparkSession.sql("select * from xas.as_bgp_bak limit 3")
25 | result.show()
26 |
27 | sparkSession.close()
28 | sparkSession.stop()
29 | }
30 | }*/
31 |
32 |
33 | /*val hive = HiveWarehouseSession.session(sparkSession).build()//obtain the HiveWarehouseSession (HWC) object
34 | val result = hive.executeQuery("select * from doi_data limit 2")//query the Hive table
35 | result.show()*/
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/Kafka2CK.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.clickhouse
2 |
3 |
4 | import com.anryg.bigdata.clickhouse.CKSink
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
8 |
9 | /**
10 | * @DESC: consume Kafka with Spark Structured Streaming and write the data into ClickHouse through the custom ForeachWriter (CKSink)
11 | * @Author: Anryg
12 | * @Date: 2023/7/3 10:18
13 | */
14 | object Kafka2CK {
15 |
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setAppName("Kafka2CK").setMaster("local[*]")
18 | val spark = SparkSession.builder().config(conf).getOrCreate()
19 |
20 | val rawDF = spark.readStream //obtain the data source
21 | .format("kafka") //source format
22 | .option("kafka.bootstrap.servers", "192.168.211.107:6667,192.168.211.108:6667,192.168.211.109:6667") //kafka cluster address; in theory a single broker would be enough
23 | .option("subscribe","qianxin") //topic to subscribe to
24 | //.option("group.id","test9999") /**offsets are no longer bound this way: each application has a unique id tied to checkpointLocation; group.id still exists at runtime but is no longer bound to the offsets*/
25 | .option("failOnDataLoss",false) //whether to fail immediately if source data suddenly goes missing, e.g. was deleted
26 | .option("fetchOffset.numRetries",3) //maximum retries when fetching offsets
27 | //.option("maxOffsetsPerTrigger",99000000)/**rate limiting: caps the number of records read per trigger; if unset the job reads as fast as possible, but only the latest data, not the old*/
28 | .option("startingOffsets","latest") //where to start reading kafka on the first run
29 | .load()
30 |
31 | import spark.implicits._
32 |
33 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //cast the kafka value to string; it is binary by default
34 | .map(row => {
35 | val line = row.getAs[String]("value") //fetch the field from the Row; there is only one field anyway
36 | val msgArray = line.split("\\|") //split into fields on the separator
37 | msgArray
38 | }).filter(_.length == 9) //keep only records with exactly 9 fields
39 | .map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //turn the array into a tuple so a schema can be attached in the next step
40 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //assign column names to the raw data
41 |
42 |
43 | val query = ds.writeStream
44 | .outputMode(OutputMode.Append()) //how the data is written out
45 | .foreach(new CKSink)
46 | //.format("console") //external sink; note that two sinks cannot be set at the same time, only the last one would take effect
47 | //.trigger(Trigger.ProcessingTime(6,TimeUnit.SECONDS))/**run on a fixed 6-second trigger; without a trigger it runs as fast as possible*/
48 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/Kafka2CK2") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id, analogous to changing group.id*/
49 | .start()
50 |
51 | query.awaitTermination()
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/StreamingProcessHelper.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming
2 |
3 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
4 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter, OutputMode}
5 |
6 | /**
7 | * @DESC: extracts the parts of stream processing that are common to every job
8 | * @Author: Anryg
9 | * @Date: 2022/8/31 17:50
10 | */
11 | trait StreamingProcessHelper[Any] {
12 |
13 | /**
14 | * @DESC: obtain the data source as a stream
15 | * @param sparkSession:
16 | * @param dataSource: type of source, e.g. kafka
17 | * @param config: configuration for the streaming source
18 | * */
19 | def getStreamingReader(sparkSession:SparkSession, dataSource:String, config:Map[String,String]): DataStreamReader ={
20 | val streamingReader = sparkSession.readStream
21 | .format(dataSource)
22 | .options(config)
23 | streamingReader
24 | }
25 |
26 | /**
27 | * @DESC: sink the data as a stream
28 | * @param dataSet: the processed result set
29 | * @param outputMode: output mode of the sink: Complete, Append or Update
30 | * @param config: configuration for the sink
31 | * */
32 | def getStreamingWriter(dataSet:DataFrame, outputMode:OutputMode, outputFormat:String, config:Map[String,String]): DataStreamWriter[Row] ={
33 | val streamingWriter = dataSet.writeStream
34 | .format(outputFormat)
35 | .outputMode(outputMode)
36 | .options(config)
37 | streamingWriter
38 | }
39 |
40 | }
41 |
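
A minimal sketch (not part of the repository) of how a job can mix in this trait; the broker address, topic and checkpoint path are placeholders, and StreamingSource2HiveOds / StreamingFromOds2Dwd further below show the real usage:

    import com.anryg.bigdata.streaming.StreamingProcessHelper
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.streaming.OutputMode

    object MyStreamingJob extends StreamingProcessHelper[Any] {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("MyStreamingJob").getOrCreate()
        val source = getStreamingReader(spark, "kafka",
          Map("kafka.bootstrap.servers" -> "broker:6667", "subscribe" -> "some_topic")).load()   //streaming DataFrame from Kafka
        getStreamingWriter(source.selectExpr("CAST(value AS STRING)"), OutputMode.Append(), "console",
          Map("checkpointLocation" -> "/tmp/offset/MyStreamingJob"))                              //console sink in Append mode
          .start()
          .awaitTermination()
      }
    }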
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/StructuredStreamingTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | package com.anryg.bigdata.streaming
3 |
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.streaming.OutputMode
7 |
8 | import scala.collection.mutable
9 |
10 | /**
11 | * @DESC: test daemon; note that to run it on Windows you must set the HADOOP_HOME environment variable and put hadoop.dll and winutils under $HADOOP_HOME/bin
12 | * @Author: Anryg
13 | * @Date: 2021/3/1 11:09
14 | */
15 | object StructuredStreamingTest {
16 |
17 |
18 | def main(args: Array[String]): Unit = {
19 | val conf = new SparkConf().setMaster("local[2]").setAppName("Structured streaming test")
20 | //val conf = SparkConfFactory.newSparkConf().setMaster("local[2]").setAppName("Structured streaming test")
21 |
22 | val spark = SparkSession.builder().config(conf).getOrCreate()
23 |
24 | val rawDF = spark.readStream.format("socket") /**with Complete output mode the raw DF cannot be written out directly; it has to go through an aggregation first, otherwise the following error is thrown*/
25 | /*Exception in thread "main" org.apache.spark.sql.AnalysisException:
26 | Complete output mode not supported when there are no streaming aggregations on streaming DataFrames/Datasets;;*/
27 | .option("host","192.168.211.106")
28 | .option("port",9998)
29 | .load()
30 |
31 | import spark.implicits._
32 |
33 |
34 |
35 | val xxx = rawDF.as[String].foreachPartition(iter => {
36 | while (iter.hasNext) println(iter.next())
37 | })
38 | /*mapPartitions(iterator => {
39 | val array = new mutable.ArrayBuffer[String]
40 | while (iterator.hasNext){
41 | val next = iterator.next()
42 | array.+=(next)
43 | }
44 | array.toIterator
45 | })*/
46 |
47 | val query = rawDF.writeStream
48 | .outputMode(OutputMode.Append())
49 | .format("console")
50 | .start()
51 |
52 | query.awaitTermination()
53 |
54 |
55 | //rawDF.take(10).foreach(println(_))
56 |
57 |
58 | }
59 |
60 | }
61 | */
62 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreaming4Kafka2CSV.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo;
2 | import java.util.concurrent.TimeUnit
3 |
4 | import com.alibaba.fastjson.JSON
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
8 | /**
9 | * @DESC: land the real-time internet-access data from Kafka into CSV files on HDFS
10 | * @Author: Anryg
11 | * @Date: 2020/12/17 09:56
12 | */
13 | object StructuredStreaming4Kafka2CSV {
14 |
15 | def main(args: Array[String]): Unit = {
16 | val conf = new SparkConf().setAppName("StructuredStreaming4Kafka2CSV").setMaster("local[*]")
17 | val spark = SparkSession.builder().config(conf).getOrCreate()
18 |
19 | val rawDF = spark.readStream
20 | .format("kafka")
21 | .option("kafka.bootstrap.servers", "192.168.211.107:6667")
22 | .option("subscribe","qianxin")
23 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
24 | .option("failOnDataLoss",false)
25 | .option("fetchOffset.numRetries",3)
26 | .option("maxOffsetsPerTrigger",90000000)/**用于限流,限定每个批次取的数据条数,确定写入HDFS单个文件的条数*/
27 | .option("startingOffsets","earliest")
28 | .load()
29 |
30 | import spark.implicits._
31 | val ds = rawDF.selectExpr("CAST(value AS STRING)")
32 | .map(row => {
33 | val line = row.getAs[String]("value")
34 | val fieldArray:Array[String] = line.split("\\|")
35 | fieldArray
36 | }).filter(_.length == 9).map(array =>(array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8)))
37 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip")
38 |
39 | ds.printSchema()
40 |
41 | //val ds1 = ds.select($"client_ip")
42 | val query = ds.writeStream
43 | .outputMode(OutputMode.Append()).trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**write a file every 60 seconds*/
44 | .option("format", "append") /**append new files into the same directory; otherwise the job errors out after writing a single batch*/
45 | .option("header", "true") /**write the schema as a CSV header line*/
46 | .format("csv").option("path","hdfs://192.168.211.106:8020/DATA/qianxin/3/")
47 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-03") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id, analogous to changing group.id*/
48 | .start()
49 |
50 | query.awaitTermination()
51 | }
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo
2 |
3 | import com.alibaba.fastjson.JSON
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.streaming.OutputMode
7 |
8 |
9 | /**
10 | * @DESC: read the internet-access data from Kafka
11 | * @Author: Anryg
12 | * @Date: 2020/12/17 09:56
13 | */
14 | object StructuredStreamingFromKafka {
15 |
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setAppName("StructuredStreamingFromKafka").setMaster("local[*]")
18 | val spark = SparkSession.builder().config(conf).getOrCreate()
19 |
20 | val rawDF = spark.readStream //获取数据源
21 | .format("kafka") //确定数据源的来源格式
22 | .option("kafka.bootstrap.servers", "192.168.211.108:6667") //指定kafka集群的地址,理论上写一个broker就可以了
23 | .option("subscribe","test") //指定topic
24 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
25 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常
26 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数
27 | .option("maxOffsetsPerTrigger",10)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible,但是每次只取最新的数据,不取旧的*/
28 | .option("startingOffsets","latest") //第一次消费时,读取kafka数据的位置
29 | //.option("startingOffsets","""{"test":{"0":-2,"1":-2,"2":-2,"3":-2}}""")
30 | .load()
31 |
32 | import spark.implicits._
33 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型
34 | .map(row => {
35 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field
36 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析
37 | val message = rawJson.getString("message") //获取业务数据部分
38 | val msgArray = message.split(",") //指定分隔符进行字段切分
39 | msgArray
40 | }).filter(_.length == 9) //只留字段数为9的数据
41 | .map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //将其转化成为元组,为了方便下一步赋予schema
42 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //给裸数据添加字段名
43 |
44 | ds.printSchema() //打印schema,确认没有问题
45 |
46 | val query = ds.writeStream
47 | .outputMode(OutputMode.Append()) //how the data is written out
48 | .format("console") //external sink
49 | //.trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**run every 60 seconds; without a trigger it runs as fast as possible*/
50 | .option("format", "append") /**append new files into the same directory; otherwise the job errors out after writing a single batch*/
51 | //.option("header", "true") /**write the schema as a CSV header line*/
52 | // .format("csv").option("path","hdfs://192.168.211.106:8020/DATA/qianxin/3/")
53 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-08") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id, analogous to changing group.id*/
54 | .start()
55 |
56 | query.awaitTermination()
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka2ES.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo;
2 | import com.alibaba.fastjson.JSON
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.streaming.OutputMode
6 | /**
7 | * @DESC: read the internet-access data from Kafka and write it into ES
8 | * @Author: Anryg
9 | * @Date: 2020/12/17 09:56
10 | */
11 | object StructuredStreamingFromKafka2ES {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf().setAppName("StructuredStreamingFromKafka").setMaster("local[*]")
15 | val spark = SparkSession.builder().config(conf).getOrCreate()
16 |
17 | val rawDF = spark.readStream
18 | .format("kafka") //确定数据源的来源格式
19 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") //指定kafka集群的地址,理论上写一个broker就可以了
20 | .option("subscribe","test") //指定topic
21 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
22 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常
23 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数
24 | //.option("maxOffsetsPerTrigger",100)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible*/
25 | .option("startingOffsets","earliest") //第一次消费时,读取kafka数据的位置
26 | .load()
27 |
28 | import spark.implicits._
29 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型
30 | .map(row => {
31 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field
32 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析
33 | val message = rawJson.getString("message") //获取业务数据部分
34 | val msgArray = message.split(",") //指定分隔符进行字段切分
35 | msgArray
36 | }).filter(_.length == 9) //只留字段数为9的数据
37 | .map(array => (array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //将其转化成为元组,为了方便下一步赋予schema
38 | .toDF("id","client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //给裸数据添加字段名
39 |
40 | ds.printSchema() //打印schema,确认没有问题
41 |
42 | val query = ds.writeStream
43 | .outputMode(OutputMode.Append()) //指定数据的写入方式
44 | .format("org.elasticsearch.spark.sql") //指定外部输出为ES
45 | .option("es.nodes","192.168.211.106")
46 | .option("es.port","9201")
47 | .option("es.write.operation","upsert")
48 | .option("es.mapping.id","id")
49 | //.option("es.mapping.exclude","id")
50 | //.trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**每60秒执行一次,不指定就是as fast as possible*/
51 | .option("format", "append") /**追加写入*/
52 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-05") /**用来保存offset,用该目录来绑定对应的offset,如果该目录发生改变则程序运行的id会发生变化,类比group.id的变化*/
53 | .start("internet_behavior-flink")
54 |
55 | query.awaitTermination()
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka2Hive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import com.alibaba.fastjson.JSON
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
9 |
10 |
11 | /**
12 | * @DESC: read the internet-access data from Kafka and write it into a dynamically partitioned Hive table
13 | * @Author: Anryg
14 | * @Date: 2020/12/17 09:56
15 | */
16 | object StructuredStreamingFromKafka2Hive {
17 |
18 | def main(args: Array[String]): Unit = {
19 | val conf = new SparkConf()
20 | .setAppName("StructuredStreamingFromKafka2Hive")
21 | .setMaster("local[*]")//本地运行模式,如果提交集群,注释掉这行
22 | val spark = SparkSession.builder().config(conf)
23 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
24 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
25 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
26 | .getOrCreate()
27 |
28 | val rawDF = spark.readStream
29 | .format("kafka") //确定数据源的来源格式
30 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") //指定kafka集群的地址,理论上写一个broker就可以了
31 | .option("subscribe","test") //指定topic
32 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
33 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常
34 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数
35 | .option("maxOffsetsPerTrigger",500)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible*/
36 | .option("startingOffsets","earliest") //第一次消费时,读取kafka数据的位置
37 | .load()
38 |
39 | import spark.implicits._
40 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //cast the kafka value to string; it is binary by default
41 | .map(row => {
42 | val line = row.getAs[String]("value") //fetch the field from the Row; there is only one field anyway
43 | val rawJson = JSON.parseObject(line) //the raw string is JSON, so parse it
44 | val message = rawJson.getString("message") //extract the business payload
45 | val msgArray = message.split(",") //split into fields on the separator
46 | msgArray
47 | }).filter(_.length == 9) //keep only records with exactly 9 fields
48 | .filter(array => array(2).length >= 8)//make sure the date field is well formed
49 | .map(array => (array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),
50 | scala.util.Try(array(4).toInt).getOrElse(99),array(5),array(6),array(7),array(8),array(2).substring(0,4),array(2).substring(4,6),array(2).substring(6,8))) //turn it into a tuple so a schema can be attached next; rcode falls back to 99 when it is not a valid integer
51 | .toDF("id","client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day") //assign column names to the raw data
52 |
53 | ds.printSchema() //print the schema to confirm it looks right
54 | spark.sql("show databases;").show()
55 |
56 | val query = ds.writeStream
57 | .outputMode(OutputMode.Append()) //how the data is written out
58 | .format("orc") //file format used by the sink
59 | .option("format", "append")
60 | .trigger(Trigger.ProcessingTime(10,TimeUnit.SECONDS))/**run every 10 seconds; without a trigger it runs as fast as possible*/
61 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StructuredStreamingFromKafka2Hive01") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id (analogous to changing group.id); never change it casually when writing to Hive*/
62 | .partitionBy("year","month","day")//partition columns
63 | .toTable("test.test")//write into the Hive table
64 | query.awaitTermination()
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingReadHive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo;
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.SparkSession
6 |
7 | /**
8 | * @DESC: read back the data that the streaming job wrote into the dynamically partitioned Hive table
9 | * @Author: Anryg
10 | * @Date: 2022/08/31 09:56
11 | */
12 | object StructuredStreamingReadHive {
13 |
14 | def main(args: Array[String]): Unit = {
15 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
16 | val conf = new SparkConf()
17 | .setAppName("StructuredStreamingReadHive")
18 | .setMaster("local[*]")//本地运行模式,如果提交集群,注释掉这行
19 | val spark = SparkSession.builder().config(conf)
20 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
21 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
22 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
23 | .getOrCreate()
24 |
25 | spark.readStream
26 | .table("ods.ods_kafka_internetlog1")
27 | .select("client_ip")
28 | .writeStream
29 | .format("console")
30 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StructuredStreamingReadHive1")
31 | .start().awaitTermination()
32 |
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/window_watermark/WorldCountWithWatermark.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo.window_watermark
2 |
3 | import java.sql.Timestamp
4 | import java.text.SimpleDateFormat
5 | import org.apache.log4j.{Level, Logger}
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.streaming.OutputMode
9 |
10 | /**
11 | * @DESC: use a time window plus a watermark to do a word-count style tally of client_ip
12 | * @Author: Anryg
13 | * @Date: 2022/11/30 10:04
14 | */
15 | object WorldCountWithWatermark {
16 |
17 | def main(args: Array[String]): Unit = {
18 | val conf = new SparkConf().setAppName("WorldCountWithWatermark").setMaster("local")
19 | val spark = SparkSession.builder()
20 | .config(conf)
21 | .getOrCreate()
22 | Logger.getLogger("org.apache").setLevel(Level.WARN) //减少INFO日志的输出
23 |
24 | val rawDF = spark.readStream
25 | .format("kafka")
26 | .option("kafka.bootstrap.servers", "192.168.211.107:6667")
27 | .option("subscribe", "qianxin")
28 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
29 | .option("failOnDataLoss",false)
30 | .option("fetchOffset.numRetries",3)
31 | //.option("maxOffsetsPerTrigger",Integer.MAX_VALUE)/**用于限流,限定每个批次取的数据条数,确定写入HDFS单个文件的条数*/
32 | .option("startingOffsets","latest")
33 | .load()
34 |
35 | import spark.implicits._
36 | val df1 = rawDF.selectExpr("CAST(value AS string)")
37 | .map(row =>{
38 | val line = row.getAs[String]("value")
39 | val fieldArray:Array[String] = line.split("\\|")
40 | fieldArray
41 | })
42 | .filter(_.length == 9) //the record must have exactly 9 fields
43 | .filter(_(1).endsWith("com")) //a small restriction on the visited domains to keep the volume down
44 | .map(array =>{
45 | val sdf = new SimpleDateFormat("yyyyMMddHHmmss").parse(array(2)) //use HH (24-hour clock) so afternoon timestamps parse correctly
46 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(sdf)
47 | (array(0), Timestamp.valueOf(time)) //time will be the watermark column, so it must be a Timestamp in yyyy-MM-dd HH:mm:ss form
48 | })
49 | .toDF("client_ip", "time") //attach the schema
50 |
51 | import org.apache.spark.sql.functions._ /**bring in Spark's built-in functions*/
52 |
53 | val df2 = df1.withWatermark("time", "10 seconds") //normally used together with a window
54 | .groupBy(window($"time","2 minutes","30 seconds"), $"client_ip") //group by the aggregation window (2-minute length, sliding every 30 seconds) and client_ip
55 | .count()
56 | .orderBy($"count".desc)
57 | .limit(10)
58 |
59 | val query = df2.writeStream
60 | .format("console") //打印到控制台
61 | .option("truncate", false) //将结果的内容完整输出,默认会砍掉内容过长的部分
62 | .option("numRows",30) //一次最多打印多少行,默认20行
63 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/WorldCountWithWatermark") //确定checkpoint目录
64 | //.outputMode(OutputMode.Update())//不支持排序的结果
65 | .outputMode(OutputMode.Complete()) //确定输出模式,默认为Append
66 | .start()
67 |
68 | query.awaitTermination()
69 | }
70 |
71 | }
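
A quick worked example of the sliding window above, assuming a single record with event time 12:00:50: window($"time", "2 minutes", "30 seconds") assigns it to the four overlapping windows [11:59:00, 12:01:00), [11:59:30, 12:01:30), [12:00:00, 12:02:00) and [12:00:30, 12:02:30), so its client_ip is counted once in each of them. Note that in Complete output mode Spark keeps all window state regardless of the watermark (which is what allows the global orderBy/limit here); the 10-second watermark would only start dropping state in Update or Append mode.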
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/dwd/StreamingFromOds2Dwd.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.dwd
2 |
3 | import com.anryg.bigdata.{IpSearch, RedisClientUtils}
4 | import com.anryg.bigdata.streaming.StreamingProcessHelper
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
7 | import org.apache.spark.sql.{DataFrame, SparkSession}
8 |
9 |
10 | /**
11 | * @DESC: read the ODS-layer data, enrich it and write it into the DWD layer
12 | * @Author: Anryg
13 | * @Date: 2022/9/1 09:53
14 | */
15 | object StreamingFromOds2Dwd extends StreamingProcessHelper[Any]{
16 |
17 | def main(args: Array[String]): Unit = {
18 | val conf = new SparkConf().setAppName("StreamingFromOds2Dwd").setMaster("local")
19 | val spark = SparkSession.builder().config(conf)
20 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
21 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
22 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
23 | .getOrCreate()
24 |
25 | clickProcess(spark,"ods.ods_kafka_internetlog","dwd.dwd_internetlog_detail")
26 | }
27 |
28 | /**
29 | *@DESC: read the Hive source table as a stream
30 | * */
31 | def readHive2DF(sparkSession: SparkSession, sourceTable:String): DataFrame ={
32 | sparkSession.readStream.table(sourceTable)
33 | }
34 |
35 | /**
36 | *@DESC: enrich the ODS data by filling in additional fields (country, province, city and operator looked up from the client ip)
37 | * */
38 | def handleData(sparkSession: SparkSession, dataFrame: DataFrame, tableName:String): DataFrame ={
39 | import sparkSession.implicits._
40 | dataFrame.printSchema()
41 | dataFrame.map(row => {
42 | val clientIP = row.getAs[String]("client_ip")
43 | val ipAndAddr = IpSearch.getAddrByIP(RedisClientUtils.getSingleRedisClient,clientIP).split("-")
44 | val country = ipAndAddr(2)
45 | val province = ipAndAddr(3)
46 | val city = ipAndAddr(4)
47 | val operator = ipAndAddr(5)
48 | val domain = row.getAs[String]("domain").toLowerCase//将域名转成小写
49 | val time = row.getAs[String]("time")
50 | val targetIP = row.getAs[String]("target_ip")
51 | val rcode = row.getAs[String]("rcode")
52 | val queryType = row.getAs[String]("query_type")
53 | val authRecord = row.getAs[String]("authority_record").toLowerCase
54 | val addMsg = row.getAs[String]("add_msg")
55 | val dnsIP = row.getAs[String]("dns_ip")
56 | val year = row.getAs[String]("year")
57 | val month = row.getAs[String]("month")
58 | val day = row.getAs[String]("day")
59 | (clientIP,country,province,city,operator,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP,year,month,day)
60 | }).toDF("client_ip","country","province","city","operator","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day")
61 | }
62 |
63 | /**
64 | *@DESC: sink the processed data into the DWD table
65 | * */
66 | def sinkData(targetDS:DataFrame, tableName:String): StreamingQuery ={
67 | val config = Map(("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StreamingFromOds2Dwd"),
68 | ("format","append"))
69 | getStreamingWriter(targetDS,OutputMode.Append(),"orc",config)
70 | .partitionBy("year","month","day")
71 | .toTable(tableName)
72 | }
73 |
74 | /**
75 | * @DESC: chain all the processing steps together
76 | * */
77 | def clickProcess(sparkSession: SparkSession,sourceTable:String, sinkTable:String): Unit ={
78 | val rawDF = readHive2DF(sparkSession, sourceTable)
79 | val targetDS = handleData(sparkSession, rawDF, sourceTable)
80 | sinkData(targetDS, sinkTable).awaitTermination()
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/ods/StreamingSource2HiveOds.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.ods
2 |
3 | import com.alibaba.fastjson.JSON
4 | import com.anryg.bigdata.streaming.StreamingProcessHelper
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
7 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
8 |
9 |
10 | /**
11 | * @DESC: land the Kafka source data into the Hive ODS layer
12 | * @Author: Anryg
13 | * @Date: 2022/8/31 19:03
14 | */
15 | object StreamingSource2HiveOds extends StreamingProcessHelper[Any]{
16 |
17 |
18 | /**
19 | * @DESC: main entry point of the application
20 | * */
21 | def main(args: Array[String]): Unit = {
22 | val conf = new SparkConf().setAppName("StreamingSource2HiveOds").setMaster("local[*]")
23 | val spark = SparkSession.builder().config(conf)
24 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
25 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
26 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
27 | .getOrCreate()
28 |
29 | clickProcess(spark)
30 |
31 | }
32 |
33 | /**
34 | *@DESC: read the Kafka source into a DataFrame
35 | * */
36 | def readKafka2DF(sparkSession: SparkSession): DataFrame ={
37 | val config = Map(("kafka.bootstrap.servers", "192.168.211.107:6667"),("subscribe","test"),
38 | ("failOnDataLoss","false"),("fetchOffset.numRetries","3"),("startingOffsets","earliest"))
39 |
40 | getStreamingReader(sparkSession,"kafka",config).load()
41 | }
42 |
43 | /**
44 | *@DESC: apply the business-logic transformations to the raw DF
45 | * */
46 |
47 | def handleData(sparkSession: SparkSession, rawDF:DataFrame): DataFrame ={
48 | import sparkSession.implicits._
49 | val targetDS = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型
50 | .map(row => {
51 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field
52 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析
53 | val message = rawJson.getString("message") //获取业务数据部分
54 | val msgArray = message.split(",") //指定分隔符进行字段切分
55 | msgArray
56 | }).filter(_.length == 9).filter(array => array(2).length >= 8)//确保日期字段符合规范
57 | .map(array =>(array(0),array(1),array(2),array(3), array(4),array(5),array(6),array(7),array(8),
58 | array(2).substring(0,4),array(2).substring(4,6),array(2).substring(6,8)))
59 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day") //给裸数据添加字段名
60 |
61 | targetDS
62 | }
63 |
64 | /**
65 | *@DESC: write the target dataset into the Hive ODS table
66 | * */
67 | def sinkData(targetDS:DataFrame): StreamingQuery ={
68 | val config = Map(("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StreamingSource2HiveOds"),
69 | ("format","append"))
70 | getStreamingWriter(targetDS, OutputMode.Append(),"orc",config)
71 | .partitionBy("year","month","day")
72 | .toTable("ods.ods_kafka_internetlog")
73 | }
74 |
75 | /**
76 | * @DESC: chain all the processing steps together
77 | * */
78 | def clickProcess(sparkSession: SparkSession): Unit ={
79 | val rawDF = readKafka2DF(sparkSession)
80 | val targetDS = handleData(sparkSession, rawDF)
81 | sinkData(targetDS).awaitTermination()
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/DataSkew01.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.data_skew
2 |
3 | import java.util
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | /**
8 | * @DESC: an example that deliberately produces data skew
9 | * @Author: Anryg
10 | * @Date: 2022/10/10 17:00
11 | */
12 | object DataSkew01 {
13 |
14 | def main(args: Array[String]): Unit = {
15 | val conf = new SparkConf().setAppName("DataSkewTest01")/*.setMaster("local[*]")*/
16 | val spark = new SparkContext(conf)
17 |
18 | val rawRDD = spark.textFile(args(0))//read the data source
19 |
20 | val filteredRDD = rawRDD.filter(line => { /**keep only the records we need, so as to create the data skew*/
21 | val array = line.split(",")
22 | val target_ip = array(3)
23 | target_ip.equals("106.38.176.185") || target_ip.equals("106.38.176.117") || target_ip.equals("106.38.176.118") || target_ip.equals("106.38.176.116")
24 | })
25 |
26 | val reducedRDD = filteredRDD.map(line => {/**aggregate by target ip: collect all client ips that visited the same target ip*/
27 | val array = line.split(",")
28 | val target_ip = array(3)
29 | val client_ip = array(0)
30 | val index = client_ip.lastIndexOf(".")
31 | val subClientIP = client_ip.substring(0, index) //keep only the ip prefix so the aggregated values stay as small as possible
32 | (target_ip,Array(subClientIP))
33 | }).reduceByKey(new MyPartitioner(4), _++_)//merge the Arrays of client ip prefixes
34 |
35 | val targetRDD = reducedRDD.map(kv => {/**for each target ip, count how many times each client ip prefix visited it*/
36 | val map = new util.HashMap[String,Int]()
37 | val target_ip = kv._1
38 | val clientIPArray = kv._2
39 | clientIPArray.foreach(clientIP => {
40 | if (map.containsKey(clientIP)) {
41 | val sum = map.get(clientIP) + 1
42 | map.put(clientIP,sum)
43 | }
44 | else map.put(clientIP,1)
45 | })
46 | (target_ip,map)
47 | })
48 |
49 | targetRDD.saveAsTextFile("/tmp/DataSkew01") //output directory for the results
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/DataSkew02.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.data_skew
2 |
3 | import java.util
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | import scala.util.Random
8 |
9 | /**
10 | * @DESC: an example of fixing data skew with two-stage salting
11 | * @Author: Anryg
12 | * @Date: 2022/10/10 17:00
13 | */
14 | object DataSkew02 {
15 |
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setAppName("DataSkewTest02")/*.setMaster("local[*]")*/
18 | val spark = new SparkContext(conf)
19 |
20 | val rawRDD = spark.textFile(args(0)) //read the raw data source
21 |
22 | val filteredRDD = rawRDD.filter(line => { /**keep only the records we need, so as to create the data skew*/
23 | val array = line.split(",")
24 | val target_ip = array(3)
25 | target_ip.equals("106.38.176.185") || target_ip.equals("106.38.176.117") || target_ip.equals("106.38.176.118") || target_ip.equals("106.38.176.116")
26 | })
27 |
28 | val reducedRDD_01 = filteredRDD.map(line => {/**step 1 of the fix: salting spreads the hot key's data from a single partition across many partitions*/
29 | val array = line.split(",")
30 | val target_ip = array(3)
31 | val client_ip = array(0)
32 | val index = client_ip.lastIndexOf(".")
33 | val subClientIP = client_ip.substring(0, index)//keep only the ip prefix so the aggregated values stay as small as possible
34 | if (target_ip.equals("106.38.176.185")){/**salt only the specific skewed key*/
35 | val saltNum = 99 //spread the single hot key across 99 salted keys
36 | val salt = new Random().nextInt(saltNum)
37 | (target_ip + "-" + salt,Array(subClientIP))
38 | }
39 | else (target_ip,Array(subClientIP))
40 | }).reduceByKey(_++_,103)//merge the Arrays and set the number of partitions
41 |
42 | val targetRDD_01 = reducedRDD_01.map(kv => {/**step 2: pre-aggregate within each salted key to shrink the size of every value*/
43 | val map = new util.HashMap[String,Int]()
44 | val target_ip = kv._1
45 | val clientIPArray = kv._2
46 | clientIPArray.foreach(clientIP => {//count the occurrences of each client ip
47 | if (map.containsKey(clientIP)) {
48 | val sum = map.get(clientIP) + 1
49 | map.put(clientIP,sum)
50 | }
51 | else map.put(clientIP,1)
52 | })
53 | (target_ip,map)
54 | })
55 |
56 | val reducedRDD_02 = targetRDD_01.map(kv => {/**step 3: reduce the salt on the skewed key, shrinking the number of salted keys from 99 to 9*/
57 | val targetIPWithSalt01 = kv._1
58 | val clientIPMap = kv._2
59 | if (targetIPWithSalt01.startsWith("106.38.176.185")){
60 | val targetIP = targetIPWithSalt01.split("-")(0)
61 | val saltNum = 9 //go from 99 salted keys down to 9
62 | val salt = new Random().nextInt(saltNum)
63 | (targetIP + "-" + salt,clientIPMap)
64 | }
65 | else kv
66 | }).reduceByKey((map1,map2) => { /**merge the two maps, adding the values of identical keys*/
67 | val map3 = new util.HashMap[String,Int](map1)
68 | map2.forEach((key,value) => {
69 | map3.merge(key, value, (v1,v2) => v1 + v2) //merge map1 and map2 into map3, summing the values of identical keys
70 | })
71 | map3
72 | },13)//adjust the number of partitions
73 |
74 | val finalRDD = reducedRDD_02.map(kv => {/**step 4: remove the salt entirely, collapsing the salted keys back into the single original key*/
75 | val targetIPWithSalt01 = kv._1
76 | val clientIPMap = kv._2
77 | if (targetIPWithSalt01.startsWith("106.38.176.185")){
78 | val targetIP = targetIPWithSalt01.split("-")(0)
79 | (targetIP,clientIPMap)//strip the salt completely
80 | }
81 | else kv
82 | }).reduceByKey(new MyPartitioner(4), (map1,map2) => { /**merge the two maps, adding the values of identical keys*/
83 | val map3 = new util.HashMap[String,Int](map1)
84 | map2.forEach((key,value) => {
85 | map3.merge(key, value, (v1,v2) => v1 + v2)
86 | })
87 | map3
88 | })//the custom partitioner sets the final number of partitions
89 |
90 | finalRDD.saveAsTextFile(args(1))
91 | }
92 | }
93 |
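
To make the two-stage salting above concrete, a small illustration of what happens to the hot key (the salt values are random; the ones shown are illustrative):

    // stage 1: "106.38.176.185"         -> "106.38.176.185-0" ... "106.38.176.185-98"  (99 salted keys, reduced in parallel)
    // stage 2: "106.38.176.185-{0..98}" -> "106.38.176.185-0" ... "106.38.176.185-8"   (re-salted into 9 keys, partial maps merged)
    // stage 3: "106.38.176.185-{0..8}"  -> "106.38.176.185"                            (salt removed, final merge under one key)
    // each reduce step therefore works on a fraction of the hot key's records instead of one overloaded partition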
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/MyPartitioner.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.data_skew
2 |
3 | import org.apache.spark.Partitioner
4 |
5 | /**
6 | * @DESC: a custom partitioning strategy
7 | * @Author: Anryg
8 | * @Date: 2022/10/13 09:52
9 | */
10 | class MyPartitioner(partitionNum: Int) extends Partitioner{
11 | override def numPartitions: Int = partitionNum //total number of partitions
12 |
13 | override def getPartition(key: Any): Int = {//decides which partition a given key goes to
14 | val keyStr = key.toString
15 | val keyTag = keyStr.substring(keyStr.length - 1, keyStr.length)
16 | keyTag.toInt % partitionNum
17 | }
18 | }
19 |
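
A small worked example of the rule above (key values purely illustrative): getPartition only looks at the last character of the key, so with partitionNum = 4 the key "106.38.176.185" (last character '5') lands in partition 5 % 4 = 1, while a salted key such as "106.38.176.185-3" lands in partition 3 % 4 = 3. This assumes every key ends in a digit; a key ending in anything else would make keyTag.toInt throw a NumberFormatException.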
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/map_pk_mappartition/MapPartitionTest.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.map_pk_mappartition
2 |
3 | import java.util
4 |
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 |
8 | import scala.collection.mutable
9 |
10 | /**
11 | * @DESC: mapPartitions version of the CSV cleaning job, for comparison with MapTest
12 | * @Author: Anryg
13 | * @Date: 2022/9/20 10:10
14 | */
15 | object MapPartitionTest {
16 |
17 | def main(args: Array[String]): Unit = {
18 | val conf = new SparkConf().setAppName("MapPartitionTest")/*.setMaster("local")*/
19 | val spark = SparkSession.builder().config(conf).getOrCreate()
20 | val rawDF = spark.read/*.option("header",true)*/.csv(args(0))
21 |
22 | import spark.implicits._
23 | rawDF.printSchema()
24 | rawDF.show()
25 | val resultDF = rawDF.mapPartitions(iterator => {
26 | //val array = new mutable.ArrayBuffer[(String,String,String,String,String,String,String,String,String)]
27 | //val seq = mutable.Seq[(String,String,String,String,String,String,String,String,String)]
28 | //val list = new util.LinkedList[(String,String,String,String,String,String,String,String,String)]
29 | val set = new mutable.LinkedHashSet[(String,String,String,String,String,String,String,String,String)]
30 | while (iterator.hasNext){
31 | val next = iterator.next()
32 | val clientIP = next.getAs[String]("_c0")
33 | val domain = next.getAs[String]("_c1").toLowerCase//将域名转成小写
34 | val time = next.getAs[String]("_c2")
35 | val targetIP = next.getAs[String]("_c3")
36 | val rcode = next.getAs[String]("_c4")
37 | val queryType = next.getAs[String]("_c5")
38 | val authRecord = if (next.getAs[String]("_c6") == null ) "" else next.getAs[String]("_c6").toLowerCase
39 | val addMsg = if (next.getAs[String]("_c7") == null ) "" else next.getAs[String]("_c7")
40 | val dnsIP = next.getAs[String]("_c8")
41 |
42 | set.+=((clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP))
43 | //array.+=((clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP))
44 | }
45 | //array.toIterator
46 | set.toIterator
47 | }).toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip")
48 |
49 | resultDF.write.csv(args(1))
50 | }
51 |
52 |
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/map_pk_mappartition/MapTest.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.map_pk_mappartition
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.sql.SparkSession
5 |
6 | /**
7 | * @DESC: map version of the CSV cleaning job, for comparison with MapPartitionTest
8 | * @Author: Anryg
9 | * @Date: 2022/9/20 10:10
10 | */
11 | object MapTest {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf().setAppName("MapTest")/*.setMaster("local")*/
15 | val spark = SparkSession.builder().config(conf).getOrCreate()
16 | val rawDF = spark.read/*.option("header",true)*/.csv(args(0))//read the source data from HDFS
17 |
18 | import spark.implicits._
19 | rawDF.printSchema() //spark job 1
20 | rawDF.show() //spark job 2
21 | val resultDF = rawDF.map(row => {
22 | val clientIP = row.getAs[String]("_c0")
23 | val domain = row.getAs[String]("_c1").toLowerCase//将域名转成小写
24 | val time = row.getAs[String]("_c2")
25 | val targetIP = row.getAs[String]("_c3")
26 | val rcode = row.getAs[String]("_c4")
27 | val queryType = row.getAs[String]("_c5")
28 | val authRecord = if (row.getAs[String]("_c6") == null ) "" else row.getAs[String]("_c6").toLowerCase
29 | val addMsg = if (row.getAs[String]("_c7") == null ) "" else row.getAs[String]("_c7")
30 | val dnsIP = row.getAs[String]("_c8")
31 | (clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP)
32 | }).toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip")
33 |
34 |
35 | /**write the transformed data to HDFS*/
36 | resultDF.write.csv(args(1))//spark job 3
37 | }
38 | }
39 |
--------------------------------------------------------------------------------