├── .idea
│   ├── checkstyle-idea.xml
│   ├── compiler.xml
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── uiDesigner.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.MD
├── pom.xml
├── run
│   ├── HdfsFileClean.sh
│   ├── HiveTableClean.sh
│   └── ParseFsimageFile2Hive.sh
└── src
    ├── main
    │   ├── java
    │   │   └── com
    │   │       └── xkj
    │   │           └── mlrc
    │   │               ├── clean
    │   │               │   ├── domain
    │   │               │   │   ├── LogBean.java
    │   │               │   │   └── ParamOption.java
    │   │               │   ├── file
    │   │               │   │   └── HdfsFileClean.java
    │   │               │   ├── table
    │   │               │   │   └── HiveTableClean.java
    │   │               │   └── util
    │   │               │       ├── ArgsUtil.java
    │   │               │       ├── DateUtil.java
    │   │               │       ├── HdfsUtils.java
    │   │               │       ├── JdbcHelper.java
    │   │               │       └── PropsUtil.java
    │   │               ├── common
    │   │               │   └── shell
    │   │               │       ├── MyUserInfo.java
    │   │               │       └── Shell.java
    │   │               └── fsimage
    │   │                   ├── GenerateFsimageTable.java
    │   │                   └── GetFromFsImageInfo.java
    │   └── resources
    │       ├── config.properties
    │       ├── core-site.xml
    │       ├── hdfs-site.xml
    │       ├── hive-site.xml
    │       └── log4j.properties
    └── test
        └── java
            └── com
                └── xkj
                    └── mlrc
                        ├── clean
                        │   └── util
                        │       ├── HdfsUtilsTest.java
                        │       └── JdbcHelperTest.java
                        └── fsimage
                            └── GetFromFsImageTest.java
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------

# data-manager
# The ultimate approach to automated HDFS data cleanup

Source code: https://github.com/lijufeng2016/data-manager

## 1. Background

An HDFS cluster grows up taking abuse from every workload and every user; sooner or later it is exhausted and under enormous pressure. One day you notice HDFS usage has crossed the 80% alert threshold. Your first reaction is to hunt down the directories that take the most space and delete them by hand, or to write a cron script that cleans a fixed set of directories every day. As the business keeps grinding on, are you left with a head full of question marks?

- After the biggest files are cleaned, HDFS usage is still high; you keep hunting for more directories to clean, only to find that small files add up to a huge footprint as well.
- Many people share the same cluster, and after a few rounds of churn HDFS is littered with piles of files left behind by people who have long since departed, and nobody knows what to do with them.
- You have no idea which data can safely be cleaned and which cannot — what if you delete the wrong thing?
- Plenty of junk files sit on HDFS, but you cannot tell which ones are actually junk or whether anyone still uses them.

Given these problems, can we change the way we think about cleaning HDFS — without repeatedly editing scripts to point at specific directories, and without agonizing over which directories need cleaning? Watch the magic trick below.

## 2. How it works

This project describes an approach that can clean both Hive data and non-Hive HDFS data. The basic idea is to parse the Hadoop fsimage file to obtain every file path in HDFS together with each file's last access time; cleaning Hive table data additionally requires Hive metadata.

### fsimage

The fsimage is the heart of HDFS: the full set of HDFS path information lives in the fsimage file. Every operation on HDFS — create, delete, update or read — causes Hadoop to record an edit log entry, i.e. a record of the HDFS operation. Edit logs are periodically merged into a new fsimage file; in HA mode the fsimage is produced by the standby NameNode, and in single-NameNode mode by the secondary NameNode. The fsimage itself is binary and cannot be read as plain text, so it has to be parsed into a readable form such as CSV. Hadoop ships with the `hdfs oiv` command specifically for parsing fsimage files. Running `hdfs getconf -confKey dfs.namenode.name.dir` tells you where the fsimage lives; by default that directory keeps two fsimage files, both named in the form fsimage_xxxxxxxxxxx with a long numeric suffix, and the one with the largest suffix is the one merged from the latest edit log.

Run:

```shell
hdfs oiv -p Delimited -delimiter "," -i fsimage_xxxxxxxx -o fsimage.csv
```

Parsing the fsimage produces a CSV file covering every file and directory in HDFS, with the following columns:

- **Path** — file or directory path
- Replication — replication factor
- ModificationTime — last modification time
- **AccessTime** — last access time
- PreferredBlockSize — preferred block size, in bytes
- BlocksCount — number of blocks
- FileSize — file size, in bytes
- NSQUOTA — namespace quota, limits the number of files and directories allowed under the directory
- DSQUOTA — space quota, limits the number of bytes allowed under the directory
- Permission — permissions
- UserName — owner
- GroupName — group

The two bold columns are the most important. **AccessTime** is the last time an HDFS file was accessed, **so it can be used to decide which files are still in use and which have gone untouched for a long time and can be treated as junk or expired data — which is exactly what the cleanup needs**. AccessTime is only recorded when `dfs.namenode.accesstime.precision` is enabled; it is on by default (with a precision of one hour), but HDP clusters ship with it disabled, so remember to enable it in hdfs-site.xml.

The parsed CSV is then uploaded into a pre-created Hive table whose columns match the CSV, for the cleanup logic to use later.

### Hive metadata

Hive is usually configured with MySQL as its metastore. The metastore has many tables recording table names, partitions, locations, parameters — everything about a table except the data itself. From the metastore we need each table's **HDFS location** and **partitions**; when cleaning Hive data, these are joined against the fsimage Hive table described above to work out which tables or table partitions should be cleaned.

## 3. Usage

Source code: https://github.com/lijufeng2016/data-manager

Main classes:

`com.xkj.mlrc.fsimage.GenerateFsimageTable`: parses the fsimage into CSV and loads it into Hive

`com.xkj.mlrc.clean.table.HiveTableClean`: the Hive table cleanup logic

`com.xkj.mlrc.clean.file.HdfsFileClean`: the HDFS file/directory cleanup logic, independent of (and not conflicting with) the Hive cleanup above

Command-line arguments:

| Parameter       | Description                                                  |
| --------------- | ------------------------------------------------------------ |
| -targetPath     | Target paths to delete, comma-separated                      |
| -avoidPath      | Paths to skip (not scanned), comma-separated                 |
| -avoidSuffix    | File suffixes to skip, comma-separated                       |
| -avoidPrefix    | File prefixes to skip, comma-separated                       |
| -avoidDbs       | Hive databases to protect from deletion, including every table and partition under them, comma-separated |
| -avoidTbls      | Hive tables to protect from deletion, including all of their partitions, comma-separated |
| -avoidTbls-file | Tables to protect from deletion, stored in an HDFS file; entries must be in the form "db.table" and cover all partitions of each table |
| -expire         | Data retention threshold in days, i.e. clean data older than this many days; this parameter matters and must be greater than 0 |
| -hdfsroot       | HDFS root path, e.g. hdfs://bigdatacluster in HA mode, or hdfs://xxxx:50070 for a single NameNode |

Required preparation:

### Running locally in IDEA

#### Step 1: preparation

You must put **hive-site.xml, core-site.xml and hdfs-site.xml** under the project's resources directory, otherwise it will not run! Then adjust every entry in **config.properties** for your own environment.

#### Step 2: parse the fsimage file

Run the main class **com.xkj.mlrc.fsimage.GenerateFsimageTable**. It will ssh to the NameNode, run a series of shell commands, parse the fsimage and upload the result to HDFS.

#### Step 3: clean the data

Run `com.xkj.mlrc.clean.table.HiveTableClean` or `com.xkj.mlrc.clean.file.HdfsFileClean` as needed to clean Hive tables or HDFS data, passing the arguments you need from the table above.

### Running on YARN

#### Step 1: preparation

Adjust every entry in **config.properties** for your environment, build the project with Maven to produce data-manager.jar, and upload it to a cluster node.

#### Step 2: parse the fsimage file

On the NameNode, run the `ParseFsimageFile2Hive.sh` script from the project's run directory. It runs a series of shell commands, parses the fsimage and uploads the result to HDFS.

#### Step 3: clean the data

Run the `HdfsFileClean.sh` or `HiveTableClean.sh` script from the run directory as needed to clean HDFS data or Hive tables, configuring the arguments from the table above.

## 4. Summary

This approach makes full use of the fsimage file and the Hive metastore. Traditional cleanup requires the user to know which directories may or may not be deleted; with this approach you only care about how long data has gone unused — for example, a file that has not been read for more than 7 consecutive days is unlikely to be read again, so the code above can clean it. The code also has a deliberate safety net: with the HDFS Java API a direct delete bypasses the HDFS trash, but this project moves everything into the trash instead, so data is only permanently removed when the trash interval expires, leaving time to recover from an accidental deletion.

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.xkj.mlrc</groupId>
    <artifactId>data-manager</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <mysql.version>5.1.38</mysql.version>
        <hadoop.version>2.7.3</hadoop.version>
        <spark.version>2.3.0</spark.version>
    </properties>

    <build>
        <finalName>${project.artifactId}</finalName>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
                <includes>
                    <include>*.xml</include>
                </includes>
            </resource>
            <resource>
                <directory>src/main/resources/</directory>
                <filtering>false</filtering>
            </resource>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>**/*</include>
                </includes>
                <filtering>false</filtering>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <appendAssemblyId>false</appendAssemblyId>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>1.2.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.eclipse.jetty.aggregate</groupId>
                    <artifactId>jetty-all</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-shims</artifactId>
                </exclusion>
            </exclusions>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.1.10</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.5</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>args4j</groupId>
            <artifactId>args4j</artifactId>
            <version>2.33</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>io.netty</groupId>
            <artifactId>netty-all</artifactId>
            <version>4.1.17.Final</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.2</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.3.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-yarn_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>
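The run scripts below submit `data-manager.jar`, which the YARN usage steps in the README say to build with Maven. The following is only a minimal sketch of that build step, inferred from the POM above (finalName `data-manager`, assembly plugin with `appendAssemblyId` set to false, Hadoop/Spark/Hive dependencies in `provided` scope); the target host and directory in the copy step are placeholders, not part of the project.

```shell
# Build the fat jar used by the scripts under run/ (assumes Maven and JDK 8 on PATH).
# With finalName=data-manager and appendAssemblyId=false, the assembly plugin should
# write target/data-manager.jar; Spark/Hadoop/Hive jars are 'provided' and are
# supplied by the cluster at runtime.
mvn clean package -DskipTests

# Copy the jar and the run scripts to a cluster node (host and path are placeholders).
scp target/data-manager.jar run/*.sh hadoop@edge-node:/opt/data-manager/
```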
--------------------------------------------------------------------------------
/run/HdfsFileClean.sh:
--------------------------------------------------------------------------------

#!/bin/bash
source /etc/profile
echo "submit spark job"

spark-submit --master yarn-cluster \
--conf spark.storage.memoryFraction=0.1 \
--executor-cores 2 \
--num-executors 2 \
--executor-memory 2g \
--driver-memory 2g \
--class com.xkj.mlrc.clean.file.HdfsFileClean \
data-manager.jar \
-avoidSuffix .jar,.xml \
-expire 3 \
-targetPath /user/cxy/userprofile/log,/user//user/xqlm \
-avoidPath /user/bin,/user/spark

rc=$?
if [[ $rc != 0 ]]; then
    echo "`date "+%Y-%m-%d %H:%M:%S"` Spark job run failed......"
    exit 1
else
    echo "`date "+%Y-%m-%d %H:%M:%S"` Spark job run successfully......."
fi

--------------------------------------------------------------------------------
/run/HiveTableClean.sh:
--------------------------------------------------------------------------------

#!/bin/bash
source /etc/profile
echo "submit spark job"
spark-submit --master yarn-cluster \
--conf spark.storage.memoryFraction=0.1 \
--executor-cores 2 \
--num-executors 2 \
--executor-memory 2g \
--driver-memory 2g \
--class com.xkj.mlrc.clean.table.HiveTableClean \
data-manager.jar \
-avoidSuffix .jar,.xml \
-expire 3

rc=$?
if [[ $rc != 0 ]]; then
    echo "`date "+%Y-%m-%d %H:%M:%S"` Spark job run failed......"
    exit 1
else
    echo "`date "+%Y-%m-%d %H:%M:%S"` Spark job run successfully......."
fi

--------------------------------------------------------------------------------
/run/ParseFsimageFile2Hive.sh:
--------------------------------------------------------------------------------

#!/usr/bin/env bash
# Create the Hive table that will hold the parsed fsimage
hive -S -e "CREATE TABLE IF NOT EXISTS fsimage( \
path string, \
replication int, \
modificationtime string, \
accesstime string, \
preferredblocksize bigint, \
blockscount int, \
filesize bigint, \
nsquota int, \
dsquota int, \
permission string, \
username string, \
groupname string) \
ROW FORMAT DELIMITED \
FIELDS TERMINATED BY ',' \
STORED AS INPUTFORMAT \
'org.apache.hadoop.mapred.TextInputFormat' \
OUTPUTFORMAT \
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' \
location '/tmp/fsimage'"

# Parse the fsimage on a NameNode host, preferably the standby NameNode
nn_paths=`hdfs getconf -confKey dfs.namenode.name.dir`
nn_path=${nn_paths##*,}
echo ${nn_path}
# Find the fsimage file with the largest suffix, e.g. fsimage_0000000000157279038
fsimage_file=`find ${nn_path}/current -type f -name 'fsimage_*' | grep -v '.md5' | sort -n | tail -n1`
# Parse the fsimage file into CSV
hdfs oiv -p Delimited -delimiter "," -i ${fsimage_file} -o fsimage.csv
# Upload the CSV into the Hive table for the later analysis and deletion steps
hadoop fs -put -f fsimage.csv /tmp/fsimage/

--------------------------------------------------------------------------------
/src/main/java/com/xkj/mlrc/clean/domain/LogBean.java:
--------------------------------------------------------------------------------

package com.xkj.mlrc.clean.domain;

/**
 * Log record bean: one row of the parsed fsimage CSV.
 * @author lijf@2345.com
 * @date 2020/4/21 14:09
 * @desc
 */

public class LogBean {
    String path;
    Integer replication;
    String modificationtime;
    String accesstime;
    Long preferredblocksize;
    Integer blockscount;
    Long filesize;
    Integer nsquota;
    Integer dsquota;
    String permission;
    String username;
    String groupname;
}
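LogBean above mirrors the columns of the fsimage Hive table created by ParseFsimageFile2Hive.sh. The class that actually selects expired paths, GetFromFsImageInfo, is not included in this listing, so the snippet below is only an illustrative sketch of the AccessTime rule described in the README: read the fsimage table and keep rows whose last access time is older than the -expire threshold. The class name FsImageExpireSketch and its method are hypothetical; only the table name and the path/accesstime/filesize columns come from the DDL above.

```java
package com.xkj.mlrc.fsimage;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
 * Illustrative sketch only: expresses the AccessTime expiry rule from the README
 * as Spark SQL against the fsimage table loaded by ParseFsimageFile2Hive.sh.
 * The project's real logic lives in GetFromFsImageInfo, which is not shown here.
 */
public class FsImageExpireSketch {

    /** Returns fsimage rows whose last access is more than expireDays days ago. */
    public static Dataset<Row> expiredPaths(SparkSession spark, int expireDays) {
        // accesstime is stored as a formatted timestamp string (e.g. "2020-04-20 12:00:00"),
        // so to_date() can compare it against a cutoff derived from the -expire argument.
        String sql = "SELECT path, accesstime, filesize "
                   + "FROM fsimage "
                   + "WHERE to_date(accesstime) < date_sub(current_date(), " + expireDays + ")";
        return spark.sql(sql);
    }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("FsImageExpireSketch")
                .enableHiveSupport()
                .getOrCreate();
        // Preview what a 7-day expiry threshold would select before any deletion.
        expiredPaths(spark, 7).show(20, false);
        spark.stop();
    }
}
```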
-------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/domain/ParamOption.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.domain; 2 | 3 | import org.kohsuke.args4j.Option; 4 | 5 | public class ParamOption { 6 | 7 | @Option(name="-targetPath", usage="指定的要删的目标路径") 8 | public String targetPath; 9 | @Option(name="-avoidPath", usage="要避开的路径,不扫描的路径,逗号隔开") 10 | public String avoidPath; 11 | @Option(name="-avoidSuffix", usage="要避开的包含后缀的文件") 12 | public String avoidSuffix; 13 | @Option(name="-avoidPrefix", usage="要避开的包含前缀的文件") 14 | public String avoidPrefix; 15 | @Option(name="-avoidDbs", usage="要避免删除的数据库,包含库下所有的表分区,逗号隔开") 16 | public String avoidDb; 17 | @Option(name="-avoidTbls", usage="用要避免删除的表,包含表下所有的分区,逗号隔开") 18 | public String avoidTbls; 19 | @Option(name="-avoidTbls-file", usage="用要避免删除的表,用hdfs文件存放,必须是“库.表名”的形式,包含表下所有的分区") 20 | public String avoidTblsFile; 21 | @Option(name="-expire", usage="过期的数据",required = true) 22 | public Integer expire; 23 | @Option(name="-hdfsroot", usage="hdfs根路径,默认hdfs://cluster") 24 | public String hdfsroot = "hdfs://cluster"; 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/file/HdfsFileClean.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.file; 2 | 3 | import com.xkj.mlrc.clean.domain.ParamOption; 4 | import com.xkj.mlrc.clean.util.ArgsUtil; 5 | import com.xkj.mlrc.clean.util.HdfsUtils; 6 | import com.xkj.mlrc.fsimage.GetFromFsImageInfo; 7 | 8 | import lombok.extern.slf4j.Slf4j; 9 | import org.apache.spark.api.java.function.ForeachFunction; 10 | import org.apache.spark.sql.*; 11 | 12 | import java.io.IOException; 13 | 14 | 15 | /** 16 | * hdfs文件清理 17 | * @author lijf@2345.com 18 | * @date 2020/4/21 14:06 19 | * @desc 20 | */ 21 | @Slf4j 22 | public class HdfsFileClean { 23 | public static void main(String[] args) { 24 | SparkSession spark = getSparkSession(); 25 | ParamOption option = ArgsUtil.getOption(args); 26 | 27 | GetFromFsImageInfo fsImageInfo = GetFromFsImageInfo.builder() 28 | .spark(spark) 29 | .avoidPrefix(option.avoidPrefix) 30 | .avoidSuffix(option.avoidSuffix) 31 | .avoidPath(option.avoidPath) 32 | .expire(option.expire) 33 | .targetPath(option.targetPath) 34 | .hdfsroot(option.hdfsroot) 35 | .build(); 36 | 37 | Dataset allFiles = fsImageInfo.getAllFiles(); 38 | allFiles.foreach(new ForeachFunction() { 39 | @Override 40 | public void call(Row row) throws Exception { 41 | String path = row.getAs("path").toString(); 42 | try { 43 | HdfsUtils.trashPath(path); 44 | log.info("删除路径成功:" + path); 45 | } catch (IOException e) { 46 | log.info("删除路径失败:" + path); 47 | e.printStackTrace(); 48 | } 49 | } 50 | }); 51 | 52 | 53 | } 54 | private static SparkSession getSparkSession() { 55 | return SparkSession 56 | .builder() 57 | .master("local[2]") 58 | .appName(HdfsFileClean.class.getSimpleName()) 59 | .enableHiveSupport() 60 | .getOrCreate(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/table/HiveTableClean.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.table; 2 | 3 | import com.xkj.mlrc.clean.domain.ParamOption; 4 | import com.xkj.mlrc.clean.util.ArgsUtil; 5 | import com.xkj.mlrc.clean.util.HdfsUtils; 6 | import 
com.xkj.mlrc.clean.util.JdbcHelper; 7 | import com.xkj.mlrc.clean.util.PropsUtil; 8 | import com.xkj.mlrc.fsimage.GetFromFsImageInfo; 9 | import lombok.extern.slf4j.Slf4j; 10 | import org.apache.commons.io.FileUtils; 11 | import org.apache.commons.lang3.StringUtils; 12 | import org.apache.spark.api.java.function.FilterFunction; 13 | import org.apache.spark.broadcast.Broadcast; 14 | import org.apache.spark.sql.Dataset; 15 | import org.apache.spark.sql.Row; 16 | import org.apache.spark.sql.SaveMode; 17 | import org.apache.spark.sql.SparkSession; 18 | import scala.Tuple3; 19 | import scala.reflect.ClassManifestFactory; 20 | 21 | import java.io.File; 22 | import java.io.FileNotFoundException; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.net.URL; 26 | import java.util.List; 27 | import java.util.Properties; 28 | 29 | /** 30 | * @author lijf@2345.com 31 | * @date 2020/4/21 16:30 32 | * @desc 33 | */ 34 | @Slf4j 35 | public class HiveTableClean { 36 | private static SparkSession spark; 37 | private static Properties properties; 38 | private static ParamOption option; 39 | private static volatile Broadcast> broadcast = null; 40 | 41 | 42 | public static void main(String[] args) throws IOException { 43 | init(args); 44 | loadTableInfoFromMysql(); 45 | loadExpireDataByFsimage(); 46 | cleanUnPartitiondTables(); 47 | cleanPartitiondTables(); 48 | } 49 | 50 | /** 51 | * 获取fsimage记录的所有过期的目录 52 | */ 53 | public static void loadExpireDataByFsimage() { 54 | GetFromFsImageInfo fsImage = GetFromFsImageInfo.builder() 55 | .spark(spark) 56 | .avoidPrefix(option.avoidPrefix) 57 | .avoidSuffix(option.avoidSuffix) 58 | .expire(option.expire) 59 | .targetPath(option.targetPath) 60 | .hdfsroot(option.hdfsroot) 61 | .build(); 62 | fsImage.getAllShouldDelDirsByLastAccesstime().createOrReplaceTempView("all_overdue_dirs"); 63 | } 64 | 65 | /** 66 | * 初始化 67 | * 68 | * @param args 参数 69 | */ 70 | private static void init(String[] args) { 71 | spark = getSparkSession(); 72 | properties = new Properties(); 73 | String url = PropsUtil.getProp("hive.meta.mysql.url"); 74 | properties.put("url", url); 75 | properties.put("driver", PropsUtil.getProp("hive.meta.mysql.driver")); 76 | properties.put("user", PropsUtil.getProp("hive.meta.mysql.username")); 77 | properties.put("password", PropsUtil.getProp("hive.meta.mysql.password")); 78 | option = ArgsUtil.getOption(args); 79 | } 80 | 81 | /** 82 | * 清理非分区表 83 | */ 84 | private static void cleanUnPartitiondTables() throws IOException { 85 | //查询出hive的所有表 86 | String sqlText = "select a.name as dbname," + 87 | " b.tbl_name," + 88 | " case when isnull(c.pkey_name) then 0 else 1 end as ispartition," + 89 | " d.location " + 90 | " from dbs a " + 91 | " join tbls b on(a.db_id=b.db_id)" + 92 | " left join partition_keys c on(b.tbl_id=c.tbl_id)" + 93 | " join sds d on(b.sd_id=d.sd_id)"; 94 | spark.sql(sqlText).createOrReplaceTempView("all_hive_tables"); 95 | String distinctLines = "select dbname," + 96 | " tbl_name," + 97 | " ispartition," + 98 | " location " + 99 | " from all_hive_tables " + 100 | " where ispartition=0" + 101 | " group by dbname,tbl_name,ispartition,location"; 102 | // 去重记录 103 | spark.sql(distinctLines).createOrReplaceTempView("all_unpartitiond_tbs"); 104 | // 获取所有要删的过期表,join fsimage的路径得到 105 | String getExpiredTbs = "select a.* from all_unpartitiond_tbs a join all_overdue_dirs b on(a.location=b.hdfs_abs_path)"; 106 | Dataset allUnpartitiondTables = spark.sql(getExpiredTbs); 107 | // 过滤要排除掉的表 108 | allUnpartitiondTables = 
filterExclusionTables(allUnpartitiondTables); 109 | // 获取非分区表的库名、表名、路径 110 | allUnpartitiondTables 111 | .toJavaRDD() 112 | .map(row -> new Tuple3<>(row.getAs("dbname").toString(), row.getAs("tbl_name").toString(), row.getAs("location").toString())) 113 | .foreachPartition(partition -> { 114 | JdbcHelper jdbcHelper = JdbcHelper.getHiveInstance(); 115 | while (partition.hasNext()){ 116 | Tuple3 tableLine = partition.next(); 117 | String dbname = tableLine._1(); 118 | String tblName = tableLine._2(); 119 | String location = tableLine._3(); 120 | //删除表 121 | String table = dbname + "." + tblName; 122 | String sqlTextDrop = "drop table if exists " + table; 123 | try { 124 | jdbcHelper.execute(sqlTextDrop); 125 | log.info("删除表成功:" + table); 126 | } catch (Exception e) { 127 | log.info("删除表失败:" + table); 128 | e.printStackTrace(); 129 | } 130 | //删除路径 131 | try { 132 | HdfsUtils.trashPath(location); 133 | log.info("删除路径成功:" + location); 134 | } catch (IOException e) { 135 | if (e instanceof FileNotFoundException) { 136 | log.info("删除路径成功:" + location); 137 | } else { 138 | log.error("删除路径失败:" + location); 139 | e.printStackTrace(); 140 | } 141 | } 142 | } 143 | }); 144 | } 145 | 146 | /** 147 | * 过滤掉要排除的库表 148 | * 149 | * @param tablesDataset tablesDataset 150 | * @return Dataset 151 | * @throws IOException IOException 152 | */ 153 | private static Dataset filterExclusionTables(Dataset tablesDataset) throws IOException { 154 | String avoidTbls = option.avoidTbls; 155 | String avoidDb = option.avoidDb; 156 | String avoidTblsFile = option.avoidTblsFile; 157 | 158 | if (null != avoidDb) { 159 | tablesDataset = tablesDataset.filter(new FilterFunction() { 160 | @Override 161 | public boolean call(Row row) throws Exception { 162 | return !avoidDb.equalsIgnoreCase(row.getAs("dbname").toString()); 163 | } 164 | }); 165 | } 166 | 167 | if (null != avoidTbls) { 168 | String[] tables = avoidTbls.split(","); 169 | for (int i = 0; i < tables.length; i++) { 170 | String table = tables[i]; 171 | tablesDataset = tablesDataset.filter(new FilterFunction() { 172 | @Override 173 | public boolean call(Row row) throws Exception { 174 | String dbname = row.getAs("dbname").toString(); 175 | String tblName = row.getAs("tbl_name").toString(); 176 | String tableName = dbname + "." + tblName; 177 | return !table.equalsIgnoreCase(tableName); 178 | } 179 | }); 180 | } 181 | } 182 | if(null != avoidTblsFile){ 183 | List tables = HdfsUtils.readByLine(avoidTblsFile); 184 | broadcast = spark.sparkContext().broadcast(tables, ClassManifestFactory.classType(List.class)); 185 | tablesDataset = tablesDataset.filter(new FilterFunction() { 186 | List exclusionTablesValue = broadcast.value(); 187 | @Override 188 | public boolean call(Row row) throws Exception { 189 | String dbname = row.getAs("dbname").toString(); 190 | String tblName = row.getAs("tbl_name").toString(); 191 | String table = dbname + "." 
+ tblName; 192 | return !exclusionTablesValue.contains(table); 193 | } 194 | }); 195 | } 196 | return tablesDataset; 197 | } 198 | 199 | /** 200 | * 清理分区表 201 | */ 202 | private static void cleanPartitiondTables() throws IOException { 203 | String allPartitionTbs = "SELECT " + 204 | " a.name as dbname, " + 205 | " b.tbl_name, " + 206 | " c.location, " + 207 | " d.part_name," + 208 | " concat(location,'/',part_name) as part_location " + 209 | " FROM " + 210 | " dbs a " + 211 | " JOIN tbls b ON (a.db_id = b.db_id) " + 212 | " JOIN sds c ON (b.sd_id = c.sd_id) " + 213 | " JOIN partitions d ON (b.tbl_id = d.tbl_id) "; 214 | // 获取所有分区表 215 | spark.sql(allPartitionTbs).createOrReplaceTempView("allPartitionTbs"); 216 | String getExpiredParts = "select a.* from allPartitionTbs a join all_overdue_dirs b on(a.part_location=b.hdfs_abs_path)"; 217 | Dataset partitiondTables = spark.sql(getExpiredParts); 218 | partitiondTables = filterExclusionTables(partitiondTables); 219 | partitiondTables.foreachPartition(parttition -> { 220 | JdbcHelper jdbcHelper = JdbcHelper.getHiveInstance(); 221 | while (parttition.hasNext()){ 222 | Row row = parttition.next(); 223 | String dbName = row.getAs("dbname").toString(); 224 | String tblName = row.getAs("tbl_name").toString(); 225 | String partLocation = row.getAs("part_location").toString(); 226 | String partName = row.getAs("part_name").toString(); 227 | 228 | // 解析出分区名 229 | String[] split = partName.split("/"); 230 | for (int j = 0; j < split.length; j++) { 231 | String part = split[j]; 232 | split[j] = part.replace("=", "='") + "'"; 233 | } 234 | String partNameFmt = StringUtils.join(split, ","); 235 | String tableName = dbName + "." + tblName; 236 | String dropPartitionSql = "ALTER TABLE " + tableName + " DROP IF EXISTS PARTITION (" + partNameFmt + ")"; 237 | try { 238 | jdbcHelper.execute(dropPartitionSql); 239 | log.info("删除表分区成功!表名:{},分区:{}", tableName, partNameFmt); 240 | } catch (Exception e) { 241 | log.info("删除表分区失败!表名:{},分区:{}", tableName, partNameFmt); 242 | e.printStackTrace(); 243 | } 244 | //删除路径 245 | try { 246 | HdfsUtils.trashPath(partLocation); 247 | log.info("删除分区路径成功!表名:{},分区:{},路径:{}", tableName, partNameFmt, partLocation); 248 | } catch (Exception e) { 249 | if (e instanceof FileNotFoundException) { 250 | log.info("删除分区路径成功!表名:{},分区:{},路径:{}", tableName, partNameFmt, partLocation); 251 | } else { 252 | log.info("删除分区路径失败!表名:{},分区:{},路径:{}", tableName, partNameFmt, partLocation); 253 | e.printStackTrace(); 254 | } 255 | } 256 | 257 | } 258 | }); 259 | 260 | } 261 | 262 | /** 263 | * 读取mysql中hive的元数据得到所有表信息 264 | */ 265 | private static void loadTableInfoFromMysql() { 266 | spark.read().jdbc(properties.getProperty("url"), "DBS", properties).write().mode(SaveMode.Overwrite).saveAsTable("dbs"); 267 | spark.read().jdbc(properties.getProperty("url"), "PARTITION_KEYS", properties).write().mode(SaveMode.Overwrite).saveAsTable("partition_keys"); 268 | spark.read().jdbc(properties.getProperty("url"), "SDS", properties).write().mode(SaveMode.Overwrite).saveAsTable("sds"); 269 | spark.read().jdbc(properties.getProperty("url"), "PARTITION_KEY_VALS", properties).write().mode(SaveMode.Overwrite).saveAsTable("partition_key_vals"); 270 | spark.read().jdbc(properties.getProperty("url"), "PARTITIONS", properties).write().mode(SaveMode.Overwrite).saveAsTable("partitions"); 271 | spark.read().jdbc(properties.getProperty("url"), "TBLS", properties).write().mode(SaveMode.Overwrite).saveAsTable("tbls"); 272 | 273 | } 274 | 275 | private static SparkSession 
getSparkSession() { 276 | return SparkSession 277 | .builder() 278 | .master("local[2]") 279 | .appName(HiveTableClean.class.getSimpleName()) 280 | .enableHiveSupport() 281 | .getOrCreate(); 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/util/ArgsUtil.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.util; 2 | 3 | import com.xkj.mlrc.clean.domain.ParamOption; 4 | import org.kohsuke.args4j.CmdLineException; 5 | import org.kohsuke.args4j.CmdLineParser; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | /** 10 | * @Author: lijf@2345.com 11 | * @Date: 2018/8/22 18:33 12 | * @Version: 1.0 13 | */ 14 | public class ArgsUtil { 15 | private static Logger logger = LoggerFactory.getLogger(ArgsUtil.class); 16 | static CmdLineParser parser; 17 | private ArgsUtil() {} 18 | 19 | /** 20 | * 解析参数 21 | * @param args 22 | * @return 23 | */ 24 | public static ParamOption getOption(String[] args){ 25 | //开始解析命令参数 26 | ParamOption option = new ParamOption(); 27 | parser = new CmdLineParser(option); 28 | try { 29 | parser.parseArgument(args); 30 | } catch (CmdLineException e) { 31 | logger.error(e.toString(),e); 32 | } 33 | 34 | return option; 35 | } 36 | 37 | /** 38 | * 参数说明 39 | */ 40 | public static void showHelp(){ 41 | parser.printUsage(System.out); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/util/DateUtil.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.util; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.*; 9 | 10 | /** 11 | * @author: lijf@2345.com 12 | * @Date: 2018/7/5 13:47 13 | * @Version: 1.0 14 | */ 15 | public class DateUtil { 16 | 17 | private DateUtil() { } 18 | 19 | public static final String DATE_FORMAT_MEDIUM = "yyyy-MM-dd"; 20 | public static final String DATE_FORMAT_LONG= "yyyy-MM-dd HH:mm:ss"; 21 | private static Logger logger = LoggerFactory.getLogger(DateUtil.class); 22 | 23 | /** 24 | * 判断时间是否在时间段内 25 | * 26 | * @param nowTime 27 | * @param beginTime 28 | * @param endTime 29 | * @return 30 | */ 31 | public static boolean belongCalendar(Date nowTime, Date beginTime, Date endTime) { 32 | Calendar date = Calendar.getInstance(); 33 | date.setTime(nowTime); 34 | 35 | Calendar begin = Calendar.getInstance(); 36 | begin.setTime(beginTime); 37 | 38 | Calendar end = Calendar.getInstance(); 39 | end.setTime(endTime); 40 | 41 | return (date.after(begin) && date.before(end)); 42 | } 43 | /** 44 | * 格式化日期 45 | * @param fmt 格式 46 | * @return 47 | */ 48 | public static String getCurrentFormatDate(String fmt) { 49 | String formatDate = ""; 50 | try { 51 | SimpleDateFormat format = new SimpleDateFormat(fmt); 52 | Date date = new Date(); 53 | formatDate = format.format(date); 54 | }catch (Exception e){ 55 | logger.error(e.toString(),e); 56 | } 57 | return formatDate; 58 | } 59 | 60 | /** 61 | * 获取N天前后的凌晨零点 yyyy-MM-dd HH:mm:ss 62 | * @param n 63 | * @return 64 | */ 65 | public static String getNDayBeforeOrAfterZeroMorning(int n) { 66 | Calendar instance = Calendar.getInstance(); 67 | SimpleDateFormat sdfCn = new SimpleDateFormat(DATE_FORMAT_MEDIUM); 68 | instance.add(Calendar.DAY_OF_MONTH, n); 69 | Date parse = instance.getTime(); 70 | String 
nDayBefore = sdfCn.format(parse); 71 | 72 | return nDayBefore+" 00:00:00"; 73 | } 74 | /** 75 | * 获取前一天时间字符串,返回例子:2018-07-30 76 | */ 77 | public static final String getYesterDay(){ 78 | Date date=new Date(); 79 | Calendar calendar = new GregorianCalendar(); 80 | calendar.setTime(date); 81 | calendar.add(Calendar.DATE,-1); 82 | date=calendar.getTime(); 83 | SimpleDateFormat formatter = new SimpleDateFormat(DATE_FORMAT_MEDIUM); 84 | return formatter.format(date); 85 | } 86 | 87 | /** 88 | * 转换成Timestamp 89 | * @param formatDt 90 | * @return 91 | */ 92 | public static long parseToTimestamp(String format,String formatDt) { 93 | SimpleDateFormat formatDate = new SimpleDateFormat(format); 94 | Date date = null; 95 | try { 96 | date = formatDate.parse(formatDt); 97 | return date.getTime() / 1000; 98 | } catch (ParseException e) { 99 | logger.error(e.toString(),e); 100 | } 101 | return 0; 102 | } 103 | 104 | /** 105 | * 获取n天前的日期 106 | * @param format 格式 107 | * @param n 天数 108 | * @return 109 | * @throws ParseException 110 | */ 111 | public static String getNDayFmtDAte(String format,int n) { 112 | Calendar instance = Calendar.getInstance(); 113 | SimpleDateFormat sdfCn = new SimpleDateFormat(format); 114 | instance.add(Calendar.DAY_OF_MONTH, n); 115 | Date parse = instance.getTime(); 116 | return sdfCn.format(parse); 117 | } 118 | 119 | /** 120 | * 根据day获取n天前的日期 121 | * @param format 格式 122 | * @param n 天数 123 | * @return 124 | * @throws ParseException 125 | */ 126 | public static String getNDayFmtByDay(String format,String day,int n) { 127 | Calendar instance = Calendar.getInstance(); 128 | SimpleDateFormat sdfCn = new SimpleDateFormat(format); 129 | try { 130 | Date date = sdfCn.parse(day); 131 | instance.setTime(date); 132 | } catch (ParseException e) { 133 | e.printStackTrace(); 134 | } 135 | instance.add(Calendar.DAY_OF_MONTH, n); 136 | Date parse = instance.getTime(); 137 | return sdfCn.format(parse); 138 | } 139 | 140 | /** 141 | * 获取前天的日期 142 | */ 143 | public static final String getBeforeYe(){ 144 | Date date=new Date(); 145 | Calendar calendar = new GregorianCalendar(); 146 | calendar.setTime(date); 147 | calendar.add(Calendar.DATE,-2); 148 | date=calendar.getTime(); 149 | SimpleDateFormat formatter = new SimpleDateFormat(DATE_FORMAT_MEDIUM); 150 | return formatter.format(date); 151 | } 152 | 153 | /** 154 | * 获取前N天的日期(不包含今天) 155 | */ 156 | public static List getNday(Integer num){ 157 | if(num==null||num<=0){ 158 | return new ArrayList<>(); 159 | } 160 | SimpleDateFormat formatter = new SimpleDateFormat(DATE_FORMAT_MEDIUM); 161 | List li = new ArrayList<>(); 162 | Date date=new Date(); 163 | Calendar calendar = new GregorianCalendar(); 164 | for(int i=num;i>=1;i--){ 165 | calendar.setTime(date); 166 | calendar.add(Calendar.DATE,-i); 167 | li.add(formatter.format(calendar.getTime())); 168 | } 169 | return li; 170 | } 171 | 172 | /** 173 | * 格式化日期转为Date 174 | * @param fmtDate yyyy-MM-dd HH:mm:ss 175 | * @return date 176 | */ 177 | public static Date parse2Date(String fmtDate){ 178 | Date date; 179 | SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT_LONG); 180 | try { 181 | date = sdf.parse(fmtDate); 182 | } catch (ParseException e) { 183 | date = new Date(); 184 | } 185 | return date; 186 | } 187 | 188 | /** 189 | * 获取当天还剩余的时间,单位:S 190 | * @return ... 
191 | */ 192 | public static int getLeftSecondsToday(){ 193 | SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_FORMAT_LONG); 194 | 195 | String nowStr = dateFormat.format(new Date()); 196 | 197 | String patten = " "; 198 | String endStr = nowStr.substring(0,nowStr.indexOf(patten)) + " 23:59:59"; 199 | int leftSeconds = 0; 200 | try { 201 | leftSeconds = Integer.valueOf((dateFormat.parse(endStr).getTime() - dateFormat.parse(nowStr).getTime()) / 1000+""); 202 | } catch (ParseException e) { 203 | logger.error(e.toString(),e); 204 | } 205 | return leftSeconds; 206 | } 207 | 208 | /** 209 | * 时间戳转换为格式化时间戳 210 | * @param timestamp 211 | * @return 212 | */ 213 | public static String parseToFmtDateStr(Integer timestamp){ 214 | SimpleDateFormat fmt = new SimpleDateFormat(DATE_FORMAT_LONG); 215 | return fmt.format(new Date(timestamp * 1000L)); 216 | } 217 | 218 | /** 219 | * 根据两个日期获取两日期之间的日期 220 | * @param beginDay 格式为2019-05-08 221 | * @param endDay 格式为2019-05-09 222 | * @return 223 | */ 224 | public static List getDays(String beginDay,String endDay){ 225 | // 返回的日期集合 226 | List days = new ArrayList<>(); 227 | SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); 228 | try { 229 | Date start = dateFormat.parse(beginDay); 230 | Date end = dateFormat.parse(endDay); 231 | 232 | Calendar tempStart = Calendar.getInstance(); 233 | tempStart.setTime(start); 234 | 235 | Calendar tempEnd = Calendar.getInstance(); 236 | tempEnd.setTime(end); 237 | tempEnd.add(Calendar.DATE, +1); 238 | while (tempStart.before(tempEnd)) { 239 | days.add(dateFormat.format(tempStart.getTime())); 240 | tempStart.add(Calendar.DAY_OF_YEAR, 1); 241 | } 242 | } catch (ParseException e) { 243 | e.printStackTrace(); 244 | } 245 | return days; 246 | } 247 | 248 | /** 249 | * 时间戳转换为格式化时间戳 250 | * @param timestamp 251 | * @return 252 | */ 253 | public static String parseTimestamp2FmtDateStr(String format,Long timestamp){ 254 | SimpleDateFormat fmt = new SimpleDateFormat(format); 255 | return fmt.format(new Date(timestamp * 1000L)); 256 | } 257 | 258 | /** 259 | * 判断时间是否在时间段内 260 | * 传入24小时制格式,如01 03 表示凌晨1点与3点,15表示下午3点。 261 | * 04 到 19 表示凌晨4点到今天的下午7点 262 | * 19 到 09 表示晚上7点到第二天上午9点 263 | * 264 | * @param now ... 265 | * @param beginHour ... 266 | * @param endHour ... 267 | * @return ... 
268 | */ 269 | public static boolean judgeTimeBetween(Date now, String beginHour, String endHour) { 270 | int iBeginHour = Integer.valueOf(beginHour); 271 | int iEndHour = Integer.valueOf(endHour); 272 | if (iBeginHour == iEndHour) { 273 | return true; 274 | } 275 | Calendar date = Calendar.getInstance(); 276 | date.set(Calendar.HOUR_OF_DAY, iBeginHour); 277 | date.set(Calendar.MINUTE, 0); 278 | date.set(Calendar.SECOND, 0); 279 | Date beginTime = date.getTime(); 280 | if (iEndHour == 0) { 281 | date.set(Calendar.HOUR_OF_DAY, 23); 282 | date.set(Calendar.MINUTE, 59); 283 | date.set(Calendar.SECOND, 59); 284 | Date endTime = date.getTime(); 285 | if (now.after(beginTime) && now.before(endTime)) { 286 | return true; 287 | } 288 | } 289 | if (iBeginHour < iEndHour) { 290 | date.set(Calendar.HOUR_OF_DAY, iEndHour); 291 | date.set(Calendar.MINUTE, 0); 292 | date.set(Calendar.SECOND, 0); 293 | Date endTime = date.getTime(); 294 | if (now.after(beginTime) && now.before(endTime)) { 295 | return true; 296 | } 297 | } else { 298 | date.setTime(now); 299 | int nowHour = date.get(Calendar.HOUR_OF_DAY); 300 | if (nowHour >= iBeginHour) { 301 | return true; 302 | } else { 303 | if (nowHour < iEndHour) { 304 | return true; 305 | } 306 | } 307 | } 308 | return false; 309 | } 310 | 311 | public static void main(String[] args) { 312 | String nDayFmtDAte = DateUtil.getNDayFmtDAte("yyyy-MM-dd HH:mm:ss", 3); 313 | System.out.println(nDayFmtDAte); 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/util/HdfsUtils.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.util; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.*; 6 | import org.apache.hadoop.hdfs.DistributedFileSystem; 7 | import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 8 | import org.apache.log4j.Logger; 9 | 10 | import java.io.*; 11 | import java.net.URI; 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.List; 15 | import java.util.Map; 16 | 17 | /** 18 | * @Author: lijf@2345.com 19 | * @Date: 2018/7/10 17:35 20 | * @Version: 1.0 21 | */ 22 | public class HdfsUtils { 23 | 24 | private HdfsUtils() { 25 | } 26 | 27 | private static FileSystem hdfs; 28 | static Logger logger = Logger.getLogger(HdfsUtils.class); 29 | private static final String PATH_DELIMER = "/"; 30 | 31 | static { 32 | //获取FileSystem类的方法有很多种,这里只写一种 33 | Configuration config = new Configuration(); 34 | try { 35 | String hdfsUri = PropsUtil.getProp("ad.hdfs.root.uri"); 36 | // 第一位为uri,第二位为config,第三位是登录的用户 37 | hdfs = FileSystem.get(new URI(hdfsUri), config); 38 | 39 | } catch (Exception e) { 40 | logger.error(e.toString(), e); 41 | } 42 | } 43 | 44 | /** 45 | * 检查文件或者文件夹是否存在 46 | * 47 | * @param filename 48 | * @return 49 | */ 50 | public static boolean checkFileExist(String filename) { 51 | try { 52 | Path f = new Path(filename); 53 | return hdfs.exists(f); 54 | } catch (Exception e) { 55 | logger.error(e.toString(), e); 56 | } 57 | return false; 58 | } 59 | 60 | /** 61 | * 创建文件夹 62 | * 63 | * @param dirName 64 | * @return 65 | */ 66 | public static boolean mkdir(String dirName) { 67 | if (checkFileExist(dirName)) { 68 | return true; 69 | } 70 | try { 71 | Path f = new Path(dirName); 72 | logger.info("Create and Write :" + f.getName() + " to hdfs"); 73 | return hdfs.mkdirs(f); 74 | } catch (Exception e) { 75 | 
logger.error(e.toString(), e); 76 | } 77 | 78 | return false; 79 | } 80 | 81 | /** 82 | * 创建一个空文件 83 | * 84 | * @param filePath 文件的完整路径名称 85 | * @return 86 | */ 87 | public static boolean mkfile(String filePath) { 88 | try { 89 | Path f = new Path(filePath); 90 | if (hdfs.exists(f)) { 91 | return true; 92 | } 93 | FSDataOutputStream os = hdfs.create(f, false); 94 | os.close(); 95 | return true; 96 | } catch (IllegalArgumentException | IOException e) { 97 | logger.error(e.toString(), e); 98 | } 99 | return false; 100 | } 101 | 102 | /** 103 | * 复制文件到指定目录 104 | * 105 | * @param srcfile srcfile 106 | * @param desfile desfile 107 | * @return boolean 108 | * @throws IOException IOException 109 | */ 110 | public static boolean hdfsCopyUtils(String srcfile, String desfile) throws IOException { 111 | Configuration conf = new Configuration(); 112 | Path src = new Path(srcfile); 113 | Path dst = new Path(desfile); 114 | FileUtil.copy(src.getFileSystem(conf), src, 115 | dst.getFileSystem(conf), dst, false, conf); 116 | 117 | return true; 118 | } 119 | 120 | /** 121 | * 移动文件或者文件夹 122 | * 123 | * @param src 初始路径 124 | * @param dst 移动结束路径 125 | * @throws Exception 126 | */ 127 | public static void movefile(String src, String dst) throws IOException { 128 | Path p1 = new Path(src); 129 | Path p2 = new Path(dst); 130 | hdfs.rename(p1, p2); 131 | } 132 | 133 | /** 134 | * 删除文件或者文件夹 135 | * 136 | * @param src 137 | * @throws Exception 138 | */ 139 | public static void delete(String src) throws IOException { 140 | Path p1 = new Path(src); 141 | if (hdfs.isDirectory(p1)) { 142 | hdfs.delete(p1, true); 143 | logger.info("删除文件夹成功: " + src); 144 | } else if (hdfs.isFile(p1)) { 145 | hdfs.delete(p1, false); 146 | logger.info("删除文件成功: " + src); 147 | } 148 | 149 | } 150 | 151 | /** 152 | * 读取本地文件到HDFS系统, 保证文件格式是utf-8 153 | * 154 | * @param localFilename 155 | * @param hdfsPath 156 | * @return 157 | */ 158 | public static boolean copyLocalFileToHDFS(String localFilename, String hdfsPath) { 159 | // 如果路径不存在就创建文件夹 160 | mkdir(hdfsPath); 161 | 162 | File file = new File(localFilename); 163 | 164 | // 如果hdfs上已经存在文件,那么先删除该文件 165 | if (HdfsUtils.checkFileExist(hdfsPath + PATH_DELIMER + file.getName())) { 166 | try { 167 | delete(hdfsPath + PATH_DELIMER + file.getName()); 168 | } catch (IOException e) { 169 | e.printStackTrace(); 170 | } 171 | } 172 | 173 | Path f = new Path(hdfsPath + PATH_DELIMER + file.getName()); 174 | try ( 175 | FileInputStream is = new FileInputStream(file); 176 | FSDataOutputStream os = hdfs.create(f, true) 177 | ) { 178 | byte[] buffer = new byte[10240000]; 179 | int nCount = 0; 180 | 181 | while (true) { 182 | int bytesRead = is.read(buffer); 183 | if (bytesRead <= 0) { 184 | break; 185 | } 186 | 187 | os.write(buffer, 0, bytesRead); 188 | nCount++; 189 | if (nCount % (100) == 0) { 190 | logger.info(" Have move " + nCount + " blocks"); 191 | } 192 | } 193 | logger.info(" Write content of file " + file.getName() 194 | + " to hdfs file " + f.getName() + " success"); 195 | return true; 196 | } catch (Exception e) { 197 | logger.error(e.toString(), e); 198 | } 199 | return false; 200 | } 201 | 202 | /** 203 | * 复制本地文件夹到hdfs的文件 204 | * 205 | * @param localPath 206 | * @param hdfsPath 207 | * @return 208 | */ 209 | public static boolean copyLocalDirTohdfs(String localPath, String hdfsPath) { 210 | try { 211 | File root = new File(localPath); 212 | File[] files = root.listFiles(); 213 | 214 | for (File file : files) { 215 | if (file.isFile()) { 216 | copyLocalFileToHDFS(file.getPath(), hdfsPath); 217 | 
218 | } else if (file.isDirectory()) { 219 | copyLocalDirTohdfs(localPath + "/" + file.getName(), hdfsPath + "/" + file.getName()); 220 | } 221 | } 222 | return true; 223 | } catch (Exception e) { 224 | logger.error(e.toString(), e); 225 | } 226 | return false; 227 | } 228 | 229 | 230 | /** 231 | * 从hdfs下载 232 | * 233 | * @param hdfsFilename 234 | * @param localPath 235 | * @return 236 | */ 237 | public static boolean downloadFileFromHdfs(String hdfsFilename, String localPath) { 238 | 239 | Path f = new Path(hdfsFilename); 240 | File file = new File(localPath + PATH_DELIMER + f.getName()); 241 | try ( 242 | FSDataInputStream dis = hdfs.open(f); 243 | FileOutputStream os = new FileOutputStream(file); 244 | ) { 245 | byte[] buffer = new byte[1024000]; 246 | int length = 0; 247 | while ((length = dis.read(buffer)) > 0) { 248 | os.write(buffer, 0, length); 249 | } 250 | return true; 251 | } catch (Exception e) { 252 | logger.error(e.toString(), e); 253 | } 254 | return false; 255 | } 256 | 257 | /** 258 | * HDFS 到 HDFS 的合并 259 | * hdfs提供了一种FileUtil.copyMerge()的方法, 注意下面的 false 这个,如果改为true,就会删除这个目录 260 | * 261 | * @param folder 需要合并的目录 262 | * @param file 要合并成的文件,完整路径名称 263 | */ 264 | public static void copyMerge(String folder, String file) { 265 | Configuration conf = new Configuration(); 266 | Path src = new Path(folder); 267 | Path dst = new Path(file); 268 | 269 | try { 270 | FileUtil.copyMerge(src.getFileSystem(conf), src, 271 | dst.getFileSystem(conf), dst, false, conf, null); 272 | } catch (IOException e) { 273 | logger.error(e.toString(), e); 274 | } 275 | } 276 | 277 | 278 | /** 279 | * 列出所有DataNode的名字信息 280 | */ 281 | public static void listDataNodeInfo() { 282 | try { 283 | DistributedFileSystem fs = null; 284 | fs = (DistributedFileSystem) hdfs; 285 | DatanodeInfo[] dataNodeStats = fs.getDataNodeStats(); 286 | String[] names = new String[dataNodeStats.length]; 287 | logger.info("List of all the datanode in the HDFS cluster:"); 288 | for (int i = 0; i < names.length; i++) { 289 | names[i] = dataNodeStats[i].getHostName(); 290 | logger.info(names[i]); 291 | } 292 | logger.info(hdfs.getUri().toString()); 293 | } catch (Exception e) { 294 | logger.error(e.toString(), e); 295 | } 296 | } 297 | 298 | 299 | public static boolean mergeDirFiles(List fileList, String tarPath, String rowTerminateFlag) { 300 | 301 | Path tarFile = new Path(tarPath); 302 | try (FSDataOutputStream tarFileOutputStream = hdfs.create(tarFile, true)) { 303 | byte[] buffer = new byte[1024000]; 304 | int length = 0; 305 | long nTotalLength = 0; 306 | int nCount = 0; 307 | boolean bfirst = true; 308 | for (FileStatus file : fileList) { 309 | if (file.getPath().equals(tarFile)) { 310 | continue; 311 | } 312 | logger.info(" merging file from " + file.getPath() + " to " + tarPath); 313 | 314 | if (!bfirst) { 315 | //添加换行符 316 | tarFileOutputStream.write(rowTerminateFlag.getBytes(), 0, rowTerminateFlag.length()); 317 | } 318 | try ( 319 | FSDataInputStream srcFileInputStream = hdfs.open(file.getPath(), buffer.length); 320 | ) { 321 | while ((length = srcFileInputStream.read(buffer)) > 0) { 322 | nCount++; 323 | tarFileOutputStream.write(buffer, 0, length); 324 | nTotalLength += length; 325 | if (nCount % 1000 == 0) { 326 | tarFileOutputStream.flush(); 327 | logger.info("Have move " + (nTotalLength / 1024000) + " MB"); 328 | } 329 | 330 | } 331 | } 332 | bfirst = false; 333 | } 334 | 335 | } catch (Exception e) { 336 | logger.error(e.toString(), e); 337 | try { 338 | delete(tarPath); 339 | } catch (IOException e1) { 340 | 
e1.printStackTrace(); 341 | } 342 | return false; 343 | } 344 | return true; 345 | } 346 | 347 | 348 | /** 349 | * 将一个字符串写入某个路径 350 | * 351 | * @param text 要保存的字符串 352 | * @param path 要保存的路径 353 | */ 354 | public static void writerString(String text, String path) { 355 | 356 | try { 357 | Path f = new Path(path); 358 | FSDataOutputStream os = hdfs.append(f); 359 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(os, "utf-8")); 360 | writer.write(text); 361 | writer.close(); 362 | os.close(); 363 | 364 | } catch (Exception e) { 365 | logger.error(e.toString(), e); 366 | } 367 | 368 | } 369 | 370 | /** 371 | * 按行读取文件内容,并且防止乱码 372 | * 373 | * @param hdfsFilename 374 | * @return 375 | */ 376 | public static List readByLine(String hdfsFilename) { 377 | List list = new ArrayList<>(); 378 | Path f = new Path(hdfsFilename); 379 | try ( 380 | FSDataInputStream dis = hdfs.open(f); 381 | BufferedReader bf = new BufferedReader(new InputStreamReader(dis));) { 382 | String line = null; 383 | while ((line = bf.readLine()) != null) { 384 | list.add(new String(line.getBytes(), "utf-8")); 385 | } 386 | return list; 387 | } catch (Exception e) { 388 | logger.error(e.toString(), e); 389 | return list; 390 | } 391 | } 392 | 393 | /** 394 | * 按行读取文件内容,并且防止乱码 395 | * 396 | * @param hdfsDir 397 | * @return 398 | */ 399 | public static List listFiles(String hdfsDir) { 400 | List listFiles = new ArrayList<>(); 401 | try { 402 | Path path = new Path(hdfsDir); 403 | if (!hdfs.exists(path)) { 404 | return listFiles; 405 | } 406 | FileStatus[] fileStatuses = hdfs.listStatus(path); 407 | for (int i = 0; i < fileStatuses.length; i++) { 408 | FileStatus fileStatus = fileStatuses[i]; 409 | String fileName = fileStatus.getPath().getName(); 410 | listFiles.add(fileName); 411 | } 412 | } catch (Exception e) { 413 | e.printStackTrace(); 414 | return listFiles; 415 | } 416 | return listFiles; 417 | } 418 | 419 | /** 420 | * 获取子文件或文件的最后更新时间 421 | * 422 | * @param uri 路径地址 423 | * @return 424 | */ 425 | public static Map getFilesModifyTime(String uri) { 426 | Map map = new HashMap<>(); 427 | try { 428 | if (hdfs.isDirectory(new Path(uri))) { 429 | FileStatus[] fileStatuses = hdfs.listStatus(new Path(uri)); 430 | for (int i = 0; i < fileStatuses.length; i++) { 431 | FileStatus fileStatus = fileStatuses[i]; 432 | String name = fileStatus.getPath().toUri().toString(); 433 | long modificationTime = fileStatus.getModificationTime(); 434 | map.put(name, modificationTime); 435 | } 436 | } else { 437 | Path path = new Path(uri); 438 | if (hdfs.exists(path)) { 439 | FileStatus fileStatus = hdfs.getFileStatus(path); 440 | String name = fileStatus.getPath().toUri().toString(); 441 | long modificationTime = fileStatus.getModificationTime(); 442 | map.put(name, modificationTime); 443 | } 444 | } 445 | } catch (IOException e) { 446 | e.printStackTrace(); 447 | return map; 448 | } 449 | return map; 450 | } 451 | 452 | 453 | /** 454 | * 把路径放入回收站 455 | * 456 | * @param src 目标路径 457 | * @return boolean 458 | * @throws IOException IOException 459 | */ 460 | public static boolean trashPath(String src) throws IOException { 461 | Path path = new Path(src); 462 | Trash trashTmp = new Trash(hdfs, hdfs.getConf()); 463 | if (hdfs.exists(path)) { 464 | if (trashTmp.moveToTrash(path)) { 465 | return true; 466 | } 467 | } 468 | return false; 469 | } 470 | 471 | } 472 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/util/JdbcHelper.java: 
-------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.util; 2 | 3 | import com.alibaba.druid.pool.DruidDataSource; 4 | import com.alibaba.druid.pool.DruidDataSourceFactory; 5 | import com.alibaba.druid.pool.DruidPooledConnection; 6 | import org.apache.log4j.Logger; 7 | 8 | import java.io.IOException; 9 | import java.sql.*; 10 | import java.util.List; 11 | import java.util.Properties; 12 | 13 | 14 | /** 15 | * JDBC辅助组件 16 | * 17 | * @author Administrator 18 | */ 19 | public class JdbcHelper { 20 | 21 | static Logger loggrt = Logger.getLogger(JdbcHelper.class); 22 | private static DruidDataSource druidDataSource = null; 23 | private static JdbcHelper instance = null; 24 | 25 | 26 | 27 | /** 28 | * 获取单例 29 | * 30 | * @return 单例 31 | */ 32 | public static JdbcHelper getHiveInstance() throws Exception { 33 | if (instance == null) { 34 | instance = new JdbcHelper(); 35 | Properties properties = new Properties(); 36 | String url = PropsUtil.getProp("hive.jdbc.url"); 37 | String user = PropsUtil.getProp("hive.jdbc.user"); 38 | String password = PropsUtil.getProp("hive.jdbc.password"); 39 | String driver = PropsUtil.getProp("hive.jdbc.driver"); 40 | properties.put("driverClassName",driver); 41 | properties.put("url",url); 42 | properties.put("username",user); 43 | properties.put("password",password); 44 | druidDataSource = (DruidDataSource) DruidDataSourceFactory.createDataSource(properties); 45 | } 46 | return instance; 47 | } 48 | 49 | 50 | /** 51 | * 返回druid数据库连接 52 | * 53 | * @return 54 | * @throws SQLException 55 | */ 56 | public DruidPooledConnection getConnection() throws SQLException { 57 | 58 | return druidDataSource.getConnection(); 59 | } 60 | 61 | /** 62 | * 执行增删改SQL语句 63 | * 64 | * @param sql 65 | * @param params 66 | * @return 影响的行数 67 | */ 68 | public int executeUpdate(String sql, Object[] params) { 69 | int rtn = 0; 70 | Connection conn = null; 71 | PreparedStatement pstmt = null; 72 | 73 | try { 74 | conn = getConnection(); 75 | conn.setAutoCommit(false); 76 | 77 | pstmt = conn.prepareStatement(sql); 78 | 79 | if (params != null && params.length > 0) { 80 | for (int i = 0; i < params.length; i++) { 81 | pstmt.setObject(i + 1, params[i]); 82 | } 83 | } 84 | 85 | rtn = pstmt.executeUpdate(); 86 | 87 | conn.commit(); 88 | } catch (Exception e) { 89 | loggrt.error(e.toString(), e); 90 | } finally { 91 | if (pstmt != null) { 92 | try { 93 | pstmt.close(); 94 | } catch (SQLException e) { 95 | loggrt.error(e.toString(), e); 96 | } 97 | } 98 | if (conn != null) { 99 | try { 100 | conn.close(); 101 | } catch (SQLException e) { 102 | loggrt.error(e.toString(), e); 103 | } 104 | } 105 | } 106 | 107 | return rtn; 108 | } 109 | 110 | /** 111 | * 执行查询SQL语句 112 | * 113 | * @param sql 114 | * @param params 115 | * @param callback 116 | */ 117 | public void executeQuery(String sql, Object[] params, 118 | QueryCallback callback) { 119 | Connection conn = null; 120 | PreparedStatement pstmt = null; 121 | ResultSet rs = null; 122 | 123 | try { 124 | conn = getConnection(); 125 | pstmt = conn.prepareStatement(sql); 126 | 127 | if (params != null && params.length > 0) { 128 | for (int i = 0; i < params.length; i++) { 129 | pstmt.setObject(i + 1, params[i]); 130 | } 131 | } 132 | 133 | rs = pstmt.executeQuery(); 134 | 135 | callback.process(rs); 136 | } catch (Exception e) { 137 | loggrt.error(e.toString(), e); 138 | } finally { 139 | if (pstmt != null) { 140 | try { 141 | pstmt.close(); 142 | } catch (SQLException e) { 143 | 
loggrt.error(e.toString(), e); 144 | } 145 | } 146 | if (conn != null) { 147 | try { 148 | conn.close(); 149 | } catch (SQLException e) { 150 | loggrt.error(e.toString(), e); 151 | } 152 | } 153 | } 154 | } 155 | 156 | /** 157 | * 执行查询SQL语句 158 | * @param sql 159 | 160 | */ 161 | public void execute(String sql) { 162 | Connection conn = null; 163 | Statement pstmt = null; 164 | 165 | try { 166 | conn = getConnection(); 167 | pstmt = conn.createStatement(); 168 | pstmt.execute(sql); 169 | } catch (Exception e) { 170 | loggrt.error(e.toString(), e); 171 | } finally { 172 | if (pstmt != null) { 173 | try { 174 | pstmt.close(); 175 | } catch (SQLException e) { 176 | loggrt.error(e.toString(), e); 177 | } 178 | } 179 | if (conn != null) { 180 | try { 181 | conn.close(); 182 | } catch (SQLException e) { 183 | loggrt.error(e.toString(), e); 184 | } 185 | } 186 | } 187 | } 188 | 189 | /** 190 | * @param sql 191 | * @param paramsList 192 | * @return 每条SQL语句影响的行数 193 | */ 194 | public int[] executeBatch(String sql, List paramsList) { 195 | int[] rtn = null; 196 | Connection conn = null; 197 | PreparedStatement pstmt = null; 198 | 199 | try { 200 | conn = getConnection(); 201 | 202 | // 第一步:使用Connection对象,取消自动提交 203 | conn.setAutoCommit(false); 204 | 205 | pstmt = conn.prepareStatement(sql); 206 | 207 | // 第二步:使用PreparedStatement.addBatch()方法加入批量的SQL参数 208 | if (paramsList != null && !paramsList.isEmpty()) { 209 | for (Object[] params : paramsList) { 210 | for (int i = 0; i < params.length; i++) { 211 | pstmt.setObject(i + 1, params[i]); 212 | } 213 | pstmt.addBatch(); 214 | } 215 | } 216 | 217 | // 第三步:使用PreparedStatement.executeBatch()方法,执行批量的SQL语句 218 | rtn = pstmt.executeBatch(); 219 | 220 | // 最后一步:使用Connection对象,提交批量的SQL语句 221 | conn.commit(); 222 | } catch (Exception e) { 223 | loggrt.error(e.toString(), e); 224 | } finally { 225 | if (pstmt != null) { 226 | try { 227 | pstmt.close(); 228 | } catch (SQLException e) { 229 | loggrt.error(e.toString(), e); 230 | } 231 | } 232 | if (conn != null) { 233 | try { 234 | conn.close(); 235 | } catch (SQLException e) { 236 | loggrt.error(e.toString(), e); 237 | } 238 | } 239 | } 240 | 241 | return rtn; 242 | } 243 | 244 | /** 245 | * 静态内部类:查询回调接口 246 | * 247 | * @author Administrator 248 | */ 249 | public static interface QueryCallback { 250 | 251 | /** 252 | * 处理查询结果 253 | * 254 | * @param rs 255 | * @throws Exception 256 | */ 257 | void process(ResultSet rs) throws Exception; 258 | 259 | } 260 | 261 | } 262 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/clean/util/PropsUtil.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.util; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.io.IOException; 7 | import java.util.Properties; 8 | 9 | /** 10 | * 获取系统配置的工具类 11 | * 12 | * @author lijf@2345.com 13 | * @return 14 | */ 15 | public final class PropsUtil { 16 | 17 | private static Properties props = null; 18 | private static Logger logger = LoggerFactory.getLogger(PropsUtil.class); 19 | 20 | static { 21 | try { 22 | props = new Properties(); 23 | props.load(PropsUtil.class.getClassLoader().getResourceAsStream("config.properties")); 24 | } catch (IOException e) { 25 | logger.error(e.toString()); 26 | } 27 | } 28 | 29 | private PropsUtil() { 30 | } 31 | 32 | /** 33 | * 获取配置 34 | * 35 | * @param key key 36 | * @return value 37 | */ 38 | public static String getProp(String key) { 39 | if (props 
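        /* Defensive check (editor's note): props is created in the static block above;
           if config.properties could not be loaded, the lookup simply yields null. */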
!= null) { 40 | return props.getProperty(key); 41 | } 42 | return null; 43 | } 44 | 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/common/shell/MyUserInfo.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.common.shell; 2 | 3 | /** 4 | * @author lijf@2345.com 5 | * @date 2020/4/24 14:56 6 | * @desc 7 | */ 8 | 9 | import com.jcraft.jsch.UserInfo; 10 | 11 | public class MyUserInfo implements UserInfo { 12 | 13 | @Override 14 | public String getPassphrase() { 15 | // TODO Auto-generated method stub 16 | System.out.println("MyUserInfo.getPassphrase()"); 17 | return null; 18 | } 19 | 20 | @Override 21 | public String getPassword() { 22 | // TODO Auto-generated method stub 23 | System.out.println("MyUserInfo.getPassword()"); 24 | return null; 25 | } 26 | 27 | @Override 28 | public boolean promptPassphrase(String arg0) { 29 | // TODO Auto-generated method stub 30 | System.out.println("MyUserInfo.promptPassphrase()"); 31 | System.out.println(arg0); 32 | return false; 33 | } 34 | 35 | @Override 36 | public boolean promptPassword(String arg0) { 37 | // TODO Auto-generated method stub 38 | System.out.println("MyUserInfo.promptPassword()"); 39 | System.out.println(arg0); 40 | return false; 41 | } 42 | 43 | @Override 44 | public boolean promptYesNo(String arg0) { 45 | // TODO Auto-generated method stub' 46 | System.out.println("MyUserInfo.promptYesNo()"); 47 | System.out.println(arg0); 48 | if (arg0.contains("The authenticity of host")) { 49 | return true; 50 | } 51 | return true; 52 | } 53 | 54 | @Override 55 | public void showMessage(String arg0) { 56 | // TODO Auto-generated method stub 57 | System.out.println("MyUserInfo.showMessage()"); 58 | } 59 | 60 | } -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/common/shell/Shell.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.common.shell; 2 | 3 | /** 4 | * @author lijf@2345.com 5 | * @date 2020/4/24 14:55 6 | * @desc 7 | */ 8 | 9 | import java.io.BufferedReader; 10 | import java.io.IOException; 11 | import java.io.InputStreamReader; 12 | import java.util.ArrayList; 13 | 14 | import com.jcraft.jsch.Channel; 15 | import com.jcraft.jsch.ChannelExec; 16 | import com.jcraft.jsch.JSch; 17 | import com.jcraft.jsch.JSchException; 18 | import com.jcraft.jsch.Session; 19 | import com.xkj.mlrc.clean.util.PropsUtil; 20 | 21 | public class Shell { 22 | //远程主机的ip地址 23 | private String ip; 24 | //远程主机登录用户名 25 | public String username; 26 | //远程主机的登录密码 27 | private String password; 28 | //设置ssh连接的远程端口 29 | public static final int DEFAULT_SSH_PORT = 22; 30 | //保存输出内容的容器 31 | private ArrayList stdout; 32 | 33 | /** 34 | * 初始化登录信息 35 | * 36 | * @param ip 37 | * @param username 38 | * @param password 39 | */ 40 | public Shell(final String ip, final String username, final String password) { 41 | this.ip = ip; 42 | this.username = username; 43 | this.password = password; 44 | stdout = new ArrayList(); 45 | } 46 | 47 | /** 48 | * 执行shell命令 49 | * 50 | * @param command 51 | * @return 52 | */ 53 | public int execute(final String command) throws JSchException, IOException { 54 | if (null != stdout) { 55 | stdout.clear(); 56 | } 57 | int returnCode = 0; 58 | JSch jsch = new JSch(); 59 | MyUserInfo userInfo = new MyUserInfo(); 60 | 61 | 62 | //创建session并且打开连接,因为创建session之后要主动打开连接 63 | Session session = 
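        /* Editor's note: getSession() only builds the session object; the password,
           the MyUserInfo callbacks (which answer host-key prompts with "yes") and the
           connect() call below are what actually open the SSH connection. */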
jsch.getSession(username, ip, DEFAULT_SSH_PORT); 64 | session.setPassword(password); 65 | session.setUserInfo(userInfo); 66 | session.connect(); 67 | 68 | //打开通道,设置通道类型,和执行的命令 69 | Channel channel = session.openChannel("exec"); 70 | ChannelExec channelExec = (ChannelExec) channel; 71 | channelExec.setCommand(command); 72 | 73 | channelExec.setInputStream(null); 74 | BufferedReader input = new BufferedReader(new InputStreamReader 75 | (channelExec.getInputStream())); 76 | 77 | channelExec.connect(); 78 | System.out.println("The remote command is :" + command); 79 | 80 | //接收远程服务器执行命令的结果 81 | String line; 82 | while ((line = input.readLine()) != null) { 83 | stdout.add(line); 84 | } 85 | input.close(); 86 | 87 | // 得到returnCode 88 | if (channelExec.isClosed()) { 89 | returnCode = channelExec.getExitStatus(); 90 | } 91 | 92 | // 关闭通道 93 | channelExec.disconnect(); 94 | //关闭session 95 | session.disconnect(); 96 | 97 | return returnCode; 98 | } 99 | 100 | /** 101 | * get stdout 102 | * 103 | * @return 104 | */ 105 | public ArrayList getStandardOutput() { 106 | return stdout; 107 | } 108 | public static void main(final String [] args) throws IOException, JSchException { 109 | String host = PropsUtil.getProp("ssh.namenode.host"); 110 | String user = PropsUtil.getProp("ssh.namenode.user"); 111 | String password = PropsUtil.getProp("ssh.namenode.password"); 112 | Shell shell = new Shell(host, user, password); 113 | shell.execute("ll"); 114 | 115 | ArrayList stdout = shell.getStandardOutput(); 116 | System.out.println("============="); 117 | for (String str : stdout) { 118 | System.out.println(str); 119 | } 120 | 121 | } 122 | 123 | } -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/fsimage/GenerateFsimageTable.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.fsimage; 2 | 3 | import com.jcraft.jsch.*; 4 | import com.xkj.mlrc.clean.util.PropsUtil; 5 | import com.xkj.mlrc.common.shell.Shell; 6 | import lombok.extern.slf4j.Slf4j; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | 12 | /** 13 | * 远程ssh解析fsimage文件 14 | * 15 | * @author lijf@2345.com 16 | * @date 2020/4/23 12:38 17 | * @desc 18 | */ 19 | @Slf4j 20 | public class GenerateFsimageTable { 21 | public static Shell shell; 22 | 23 | public static void main(String[] args) throws IOException, JSchException { 24 | // ssh远程登录NameNode所在的机器 25 | sshLoginNameNodeHost(); 26 | // 解析fsimage文件并上传到hive表对应的路径 27 | generateFsimageCsv2Hive(); 28 | } 29 | 30 | private static void generateFsimageCsv2Hive() throws IOException, JSchException { 31 | //获取存放fsimage文件的目录 32 | String cmd1 = "hdfs getconf -confKey dfs.namenode.name.dir"; 33 | shell.execute(cmd1); 34 | ArrayList list1 = shell.getStandardOutput(); 35 | String fsimageDir = list1.get(list1.size() - 1).split(",")[0]; 36 | 37 | //获取最新的fsimage文件的路径 38 | String cmd2 = "find ${fsimageDir}/current -type f -name 'fsimage_*' | grep -v '.md5' | sort -n | tail -n1"; 39 | shell.execute(cmd2.replace("${fsimageDir}", fsimageDir)); 40 | ArrayList list2 = shell.getStandardOutput(); 41 | String fsimageFile = list2.get(list2.size() - 1); 42 | //拷贝fsimage文件到ssh登录的HOME目录,并加上时间戳后缀 43 | String userHomeDir = "/home/" + shell.username; 44 | long timestamp = System.currentTimeMillis(); 45 | String fsimageFileName = new File(fsimageFile).getName() + "_" + timestamp; 46 | String cmd3 = "cp ${fsimageFile} ${userhome}/${fsimageFileName}"; 47 | cmd3 = 
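        /* Editor's note: the ${...} tokens in these command templates are not shell
           variables; they are filled in locally with String.replace before the command
           is sent to the NameNode host over SSH. */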
cmd3.replace("${fsimageFile}", fsimageFile).replace("${userhome}", userHomeDir).replace("${fsimageFileName}", fsimageFileName); 48 | shell.execute(cmd3); 49 | 50 | //解析fsimage成csv文件 51 | String cmd4 = "hdfs oiv -p Delimited -delimiter ',' -i ${userhome}/${fsimageFileName} -o ${userhome}/fsimage.csv"; 52 | cmd4 = cmd4.replace("${userhome}", userHomeDir).replace("${fsimageFileName}", fsimageFileName); 53 | shell.execute(cmd4); 54 | 55 | // 创建fsimage表 56 | String cmd5 = "hive -S -e \"CREATE TABLE IF NOT EXISTS fsimage( " + 57 | "path string, " + 58 | "replication int, " + 59 | "modificationtime string, " + 60 | "accesstime string, " + 61 | "preferredblocksize bigint, " + 62 | "blockscount int, " + 63 | "filesize bigint, " + 64 | "nsquota int, " + 65 | "dsquota int, " + 66 | "permission string, " + 67 | "username string, " + 68 | "groupname string) " + 69 | "ROW FORMAT DELIMITED " + 70 | "FIELDS TERMINATED BY ',' " + 71 | "STORED AS INPUTFORMAT " + 72 | "'org.apache.hadoop.mapred.TextInputFormat' " + 73 | "OUTPUTFORMAT " + 74 | "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' " + 75 | "location '/tmp/fsimage'\""; 76 | shell.execute(cmd5); 77 | 78 | // 上传fsimage.csv文件到hive表,为后面统计分析做准备 79 | String cmd6 = "hadoop fs -put -f ${userhome}/fsimage.csv /tmp/fsimage/"; 80 | cmd6 = cmd6.replace("${userhome}", userHomeDir); 81 | shell.execute(cmd6); 82 | 83 | // 删除无用的数据 84 | String cmd7 = "rm -rf ${userhome}/fsimage*"; 85 | cmd7 = cmd7.replace("${userhome}", userHomeDir); 86 | shell.execute(cmd7); 87 | 88 | } 89 | 90 | /** 91 | * ssh登录namenode host 92 | */ 93 | private static void sshLoginNameNodeHost() throws IOException, JSchException { 94 | String host = PropsUtil.getProp("ssh.namenode.host"); 95 | String user = PropsUtil.getProp("ssh.namenode.user"); 96 | String password = PropsUtil.getProp("ssh.namenode.password"); 97 | shell = new Shell(host, user, password); 98 | int code = shell.execute("ls"); 99 | System.out.println(code); 100 | if (code == 0) { 101 | log.info("用户:{} 登录host:{}成功!!", user, host); 102 | } else { 103 | log.error("用户:{} 登录host:{}失败!!", user, host); 104 | System.exit(-1); 105 | } 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/com/xkj/mlrc/fsimage/GetFromFsImageInfo.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.fsimage; 2 | 3 | import com.xkj.mlrc.clean.util.DateUtil; 4 | import lombok.AllArgsConstructor; 5 | import lombok.Builder; 6 | import lombok.Data; 7 | import lombok.NoArgsConstructor; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.spark.api.java.function.FilterFunction; 10 | import org.apache.spark.sql.Dataset; 11 | import org.apache.spark.sql.Row; 12 | import org.apache.spark.sql.SparkSession; 13 | 14 | import java.io.Serializable; 15 | 16 | /** 17 | * @author lijf@2345.com 18 | * @date 2020/4/22 14:18 19 | * @desc 获取fsimage信息表 20 | */ 21 | @Data 22 | @AllArgsConstructor 23 | @NoArgsConstructor 24 | @Builder() 25 | public class GetFromFsImageInfo implements Serializable { 26 | private SparkSession spark; 27 | // 目标路径,要删除的路径,包含所有子路径 28 | private String targetPath; 29 | // 要避开扫描的路径,包含所有子路径 30 | private String avoidPath; 31 | // 要过滤的带有前缀的文件 32 | private String avoidSuffix; 33 | // 要过滤的带有后缀的文件 34 | private String avoidPrefix; 35 | // 过滤多少天之前未读的数据 36 | private Integer expire; 37 | // hdfs根路径 38 | private String hdfsroot; 39 | 40 | /** 41 | * 获取要删除的所有文件 42 | * 43 | * @return Dataset 44 | * @throws 
NullPointerException 空指针异常 45 | */ 46 | public Dataset getAllFiles() throws NullPointerException { 47 | if (expire == null || expire == 0) { 48 | throw new NullPointerException("expire必须大于0!!"); 49 | } 50 | // 获取过期数据的日期 51 | String nDayFmtDAte = DateUtil.getNDayFmtDAte("yyyy-MM-dd HH:mm", 0 - expire); 52 | // 获取要删的文件 53 | String sqlText = "select * from fsimage where replication>0 and accesstime<'" + nDayFmtDAte + "'"; 54 | Dataset fsImage = spark.sql(sqlText); 55 | 56 | // 以下根据传入的各个参数过滤数据 57 | if (null != targetPath) { 58 | fsImage = fsImage.filter(new FilterFunction() { 59 | @Override 60 | public boolean call(Row row) throws Exception { 61 | String[] targetPaths = targetPath.split(","); 62 | boolean contains = false; 63 | for (int i = 0; i < targetPaths.length; i++) { 64 | String path = targetPaths[i]; 65 | String fileAbsPath = row.getAs("path").toString(); 66 | if (fileAbsPath.startsWith(path)) { 67 | contains = true; 68 | } 69 | } 70 | return contains; 71 | } 72 | 73 | }); 74 | } 75 | if (null != avoidPath) { 76 | String[] avoidPaths = avoidPath.split(","); 77 | for (int i = 0; i < avoidPaths.length; i++) { 78 | String path = avoidPaths[i]; 79 | fsImage = fsImage.filter(new FilterFunction() { 80 | @Override 81 | public boolean call(Row row) throws Exception { 82 | String fileAbsPath = row.getAs("path").toString(); 83 | return !fileAbsPath.startsWith(path); 84 | } 85 | }); 86 | } 87 | } 88 | 89 | if (null != avoidSuffix) { 90 | String[] avoidSuffixs = avoidSuffix.split(","); 91 | for (int i = 0; i < avoidSuffixs.length; i++) { 92 | String suffix = avoidSuffixs[i]; 93 | fsImage = fsImage.filter(new FilterFunction() { 94 | @Override 95 | public boolean call(Row row) throws Exception { 96 | String path = row.getAs("path").toString(); 97 | return !path.endsWith(suffix); 98 | } 99 | }); 100 | } 101 | } 102 | if (null != avoidPrefix) { 103 | String[] avoidPrefixs = avoidPrefix.split(","); 104 | for (int i = 0; i < avoidPrefixs.length; i++) { 105 | String prefix = avoidPrefixs[i]; 106 | fsImage = fsImage.filter(new FilterFunction() { 107 | @Override 108 | public boolean call(Row row) throws Exception { 109 | String pathName = row.getAs("path").toString(); 110 | String fileName = new Path(pathName).getName(); 111 | return !fileName.startsWith(prefix); 112 | } 113 | }); 114 | } 115 | } 116 | return fsImage; 117 | } 118 | 119 | 120 | /** 121 | * 获取所有文件的目录 122 | * 123 | * @return Dataset 124 | */ 125 | public Dataset getAllFilesDir() { 126 | Dataset allFiles = getAllFiles(); 127 | Dataset fsimage = allFiles.selectExpr("*", "SUBSTRING_INDEX(path,'/',size(split(path,'/'))-1) as dir"); 128 | return fsimage; 129 | } 130 | 131 | /** 132 | * 根据目录下的访问时间最大的文件决定目录是否删除 133 | * 134 | * @return Dataset 135 | */ 136 | public Dataset getAllShouldDelDirsByLastAccesstime() { 137 | getAllFilesDir().createOrReplaceTempView("fsimage_dirs"); 138 | String sqlText = "select * from " + 139 | " (" + 140 | " select *," + 141 | " row_number() over (partition by dir order by accesstime desc) rank " + 142 | " from fsimage_dirs" + 143 | " ) tmp where rank=1"; 144 | Dataset fsimage = spark.sql(sqlText).selectExpr("*", "concat('" + hdfsroot + "',dir) as hdfs_abs_path"); 145 | return fsimage; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/resources/config.properties: -------------------------------------------------------------------------------- 1 | #hdfs相关 2 | ad.hdfs.root.uri=hdfs://cluster 3 | 4 | # hive的mysql元数据库 5 | 
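# (Editor's note) All settings in this file are read at runtime via PropsUtil.getProp(key);
# the hosts, users and passwords below are environment-specific samples and should be
# replaced with values for your own cluster before running the jobs.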
hive.meta.mysql.url=jdbc:mysql://ha05:3306/hive?&characterEncoding=utf-8&useSSL=false 6 | hive.meta.mysql.username=test_hive 7 | hive.meta.mysql.password=mYsQl2345@y@eLi 8 | hive.meta.mysql.driver=com.mysql.jdbc.Driver 9 | 10 | #hive jdbc相关,没有密码的可以不写 11 | hive.jdbc.url=jdbc:hive2://ha03:10000/default 12 | hive.jdbc.driver=org.apache.hive.jdbc.HiveDriver 13 | hive.jdbc.user=lijf 14 | hive.jdbc.password= 15 | 16 | #NameNode host相关, 代码中需要ssh远程登录到NameNode,如果是HA模式,最好是standby NameNode的host 17 | ssh.namenode.host=ha05 18 | ssh.namenode.user=lijf 19 | ssh.namenode.password=Z2$EgK%k#xssFLu! 20 | -------------------------------------------------------------------------------- /src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.AbstractFileSystem.alluxio.impl 4 | alluxio.hadoop.AlluxioFileSystem 5 | 6 | 7 | fs.alluxio.impl 8 | alluxio.hadoop.FileSystem 9 | 10 | 11 | fs.AbstractFileSystem.alluxio.impl 12 | alluxio.hadoop.AlluxioFileSystem 13 | 14 | 15 | 16 | fs.alluxio.impl 17 | alluxio.hadoop.FileSystem 18 | 19 | 20 | 21 | fs.azure.user.agent.prefix 22 | User-Agent: APN/1.0 Hortonworks/1.0 HDP/2.6.5.0-292 23 | 24 | 25 | 26 | fs.defaultFS 27 | hdfs://cluster 28 | true 29 | 30 | 31 | 32 | fs.s3a.user.agent.prefix 33 | User-Agent: APN/1.0 Hortonworks/1.0 HDP/2.6.5.0-292 34 | 35 | 36 | 37 | fs.trash.interval 38 | 360 39 | 40 | 41 | 42 | ha.failover-controller.active-standby-elector.zk.op.retries 43 | 120 44 | 45 | 46 | 47 | ha.zookeeper.quorum 48 | ha05:2181,ha04:2181,ha02:2181,ha01:2181,ha03:2181 49 | 50 | 51 | 52 | hadoop.custom-extensions.root 53 | /hdp/ext/2.6/hadoop 54 | 55 | 56 | 57 | hadoop.http.authentication.simple.anonymous.allowed 58 | true 59 | 60 | 61 | 62 | hadoop.http.staticuser.user 63 | yarn 64 | 65 | 66 | 67 | hadoop.proxyuser.falcon.groups 68 | * 69 | 70 | 71 | 72 | hadoop.proxyuser.falcon.hosts 73 | * 74 | 75 | 76 | 77 | hadoop.proxyuser.hbase.groups 78 | * 79 | 80 | 81 | 82 | hadoop.proxyuser.hbase.hosts 83 | * 84 | 85 | 86 | 87 | hadoop.proxyuser.hcat.groups 88 | * 89 | 90 | 91 | 92 | hadoop.proxyuser.hcat.hosts 93 | ha03,ha04 94 | 95 | 96 | 97 | hadoop.proxyuser.hdfs.groups 98 | * 99 | 100 | 101 | 102 | hadoop.proxyuser.hdfs.hosts 103 | * 104 | 105 | 106 | 107 | hadoop.proxyuser.hive.groups 108 | * 109 | 110 | 111 | 112 | hadoop.proxyuser.hive.hosts 113 | ha03,ha04 114 | 115 | 116 | 117 | hadoop.proxyuser.httpfs.groups 118 | * 119 | 120 | 121 | 122 | hadoop.proxyuser.httpfs.hosts 123 | * 124 | 125 | 126 | 127 | hadoop.proxyuser.hue.groups 128 | * 129 | 130 | 131 | 132 | hadoop.proxyuser.hue.hosts 133 | * 134 | 135 | 136 | 137 | hadoop.proxyuser.oozie.groups 138 | * 139 | 140 | 141 | 142 | hadoop.proxyuser.oozie.hosts 143 | ha03,ha05 144 | 145 | 146 | 147 | hadoop.proxyuser.root.groups 148 | * 149 | 150 | 151 | 152 | hadoop.proxyuser.root.hosts 153 | ha05 154 | 155 | 156 | 157 | hadoop.proxyuser.yarn.hosts 158 | ha02,ha05 159 | 160 | 161 | 162 | hadoop.security.auth_to_local 163 | DEFAULT 164 | 165 | 166 | 167 | hadoop.security.authentication 168 | simple 169 | 170 | 171 | 172 | hadoop.security.authorization 173 | false 174 | 175 | 176 | 177 | io.compression.codecs 178 | 179 | org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.SnappyCodec 180 | 181 | 182 | 183 | 184 | io.file.buffer.size 185 | 131072 186 | 187 | 188 | 189 | io.serializations 190 | org.apache.hadoop.io.serializer.WritableSerialization 191 | 192 | 193 | 194 | 
ipc.client.connect.max.retries 195 | 50 196 | 197 | 198 | 199 | ipc.client.connection.maxidletime 200 | 30000 201 | 202 | 203 | 204 | ipc.client.idlethreshold 205 | 8000 206 | 207 | 208 | 209 | ipc.server.tcpnodelay 210 | true 211 | 212 | 213 | 214 | mapreduce.jobtracker.webinterface.trusted 215 | false 216 | 217 | 218 | 219 | net.topology.script.file.name 220 | /etc/hadoop/conf/topology_script.py 221 | 222 | 223 | -------------------------------------------------------------------------------- /src/main/resources/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.block.access.token.enable 5 | true 6 | 7 | 8 | 9 | dfs.blockreport.initialDelay 10 | 120 11 | 12 | 13 | 14 | dfs.blocksize 15 | 134217728 16 | 17 | 18 | 19 | dfs.client.failover.proxy.provider.cluster 20 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 21 | 22 | 23 | 24 | dfs.client.read.shortcircuit 25 | true 26 | 27 | 28 | 29 | dfs.client.read.shortcircuit.streams.cache.size 30 | 4096 31 | 32 | 33 | 34 | dfs.client.retry.policy.enabled 35 | false 36 | 37 | 38 | 39 | dfs.cluster.administrators 40 | hdfs 41 | 42 | 43 | 44 | dfs.content-summary.limit 45 | 5000 46 | 47 | 48 | 49 | dfs.datanode.address 50 | 0.0.0.0:50010 51 | 52 | 53 | 54 | dfs.datanode.balance.bandwidthPerSec 55 | 6250000 56 | 57 | 58 | 59 | dfs.datanode.data.dir 60 | /disk1/hadoop/hdfs/data,/disk2/hadoop/hdfs/data,/disk3/hadoop/hdfs/data,/disk4/hadoop/hdfs/data 61 | true 62 | 63 | 64 | 65 | dfs.datanode.data.dir.perm 66 | 750 67 | 68 | 69 | 70 | dfs.datanode.du.reserved 71 | 246107496448 72 | 73 | 74 | 75 | dfs.datanode.failed.volumes.tolerated 76 | 1 77 | true 78 | 79 | 80 | 81 | dfs.datanode.http.address 82 | 0.0.0.0:50075 83 | 84 | 85 | 86 | dfs.datanode.https.address 87 | 0.0.0.0:50475 88 | 89 | 90 | 91 | dfs.datanode.ipc.address 92 | 0.0.0.0:8010 93 | 94 | 95 | 96 | dfs.datanode.max.transfer.threads 97 | 16384 98 | 99 | 100 | 101 | dfs.domain.socket.path 102 | /var/lib/hadoop-hdfs/dn_socket 103 | 104 | 105 | 106 | dfs.encrypt.data.transfer.cipher.suites 107 | AES/CTR/NoPadding 108 | 109 | 110 | 111 | dfs.ha.automatic-failover.enabled 112 | true 113 | 114 | 115 | 116 | dfs.ha.fencing.methods 117 | shell(/bin/true) 118 | 119 | 120 | 121 | dfs.ha.namenodes.cluster 122 | nn1,nn2 123 | 124 | 125 | 126 | dfs.heartbeat.interval 127 | 3 128 | 129 | 130 | 131 | dfs.hosts.exclude 132 | /etc/hadoop/conf/dfs.exclude 133 | 134 | 135 | 136 | dfs.http.policy 137 | HTTP_ONLY 138 | 139 | 140 | 141 | dfs.https.port 142 | 50470 143 | 144 | 145 | 146 | dfs.internal.nameservices 147 | cluster 148 | 149 | 150 | 151 | dfs.journalnode.edits.dir 152 | /hadoop/hdfs/journal 153 | 154 | 155 | 156 | dfs.journalnode.http-address 157 | 0.0.0.0:8480 158 | 159 | 160 | 161 | dfs.journalnode.https-address 162 | 0.0.0.0:8481 163 | 164 | 165 | 166 | dfs.namenode.accesstime.precision 167 | 0 168 | 169 | 170 | 171 | dfs.namenode.acls.enabled 172 | true 173 | 174 | 175 | 176 | dfs.namenode.audit.log.async 177 | true 178 | 179 | 180 | 181 | dfs.namenode.avoid.read.stale.datanode 182 | true 183 | 184 | 185 | 186 | dfs.namenode.avoid.write.stale.datanode 187 | true 188 | 189 | 190 | 191 | dfs.namenode.checkpoint.dir 192 | /hadoop/hdfs/namesecondary 193 | 194 | 195 | 196 | dfs.namenode.checkpoint.edits.dir 197 | ${dfs.namenode.checkpoint.dir} 198 | 199 | 200 | 201 | dfs.namenode.checkpoint.period 202 | 21600 203 | 204 | 205 | 206 | dfs.namenode.checkpoint.txns 207 | 1000000 208 | 209 | 210 | 211 | 
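<!-- Editor's note: dfs.namenode.accesstime.precision is set to 0 above, which disables
     access-time updates on the NameNode; the accesstime column in the parsed fsimage can
     therefore be stale, and the expire filter in GetFromFsImageInfo should be read with
     that in mind. -->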
dfs.namenode.fslock.fair 212 | false 213 | 214 | 215 | 216 | dfs.namenode.handler.count 217 | 200 218 | 219 | 220 | 221 | dfs.namenode.http-address.cluster.nn1 222 | ha05:50070 223 | 224 | 225 | 226 | dfs.namenode.http-address.cluster.nn2 227 | ha04:50070 228 | 229 | 230 | 231 | dfs.namenode.https-address.cluster.nn1 232 | ha05:50470 233 | 234 | 235 | 236 | dfs.namenode.https-address.cluster.nn2 237 | ha04:50470 238 | 239 | 240 | 241 | dfs.namenode.name.dir 242 | /disk1/hadoop/hdfs/namenode,/disk2/hadoop/hdfs/namenode,/disk3/hadoop/hdfs/namenode,/disk4/hadoop/hdfs/namenode 243 | true 244 | 245 | 246 | 247 | dfs.namenode.name.dir.restore 248 | true 249 | 250 | 251 | 252 | dfs.namenode.rpc-address.cluster.nn1 253 | ha05:8020 254 | 255 | 256 | 257 | dfs.namenode.rpc-address.cluster.nn2 258 | ha04:8020 259 | 260 | 261 | 262 | dfs.namenode.safemode.threshold-pct 263 | 0.99 264 | 265 | 266 | 267 | dfs.namenode.shared.edits.dir 268 | qjournal://ha04:8485;ha05:8485;ha02:8485/cluster 269 | 270 | 271 | 272 | dfs.namenode.stale.datanode.interval 273 | 30000 274 | 275 | 276 | 277 | dfs.namenode.startup.delay.block.deletion.sec 278 | 3600 279 | 280 | 281 | 282 | dfs.namenode.write.stale.datanode.ratio 283 | 1.0f 284 | 285 | 286 | 287 | dfs.nameservices 288 | cluster 289 | 290 | 291 | 292 | dfs.permissions.enabled 293 | true 294 | 295 | 296 | 297 | dfs.permissions.superusergroup 298 | hdfs 299 | 300 | 301 | 302 | dfs.replication 303 | 2 304 | 305 | 306 | 307 | dfs.replication.max 308 | 50 309 | 310 | 311 | 312 | dfs.support.append 313 | true 314 | true 315 | 316 | 317 | 318 | dfs.webhdfs.enabled 319 | true 320 | true 321 | 322 | 323 | 324 | fs.permissions.umask-mode 325 | 022 326 | 327 | 328 | 329 | hadoop.caller.context.enabled 330 | true 331 | 332 | 333 | 334 | hadoop.proxyuser.hue.groups 335 | * 336 | 337 | 338 | 339 | hadoop.proxyuser.hue.hosts 340 | * 341 | 342 | 343 | 344 | manage.include.files 345 | false 346 | 347 | 348 | 349 | nfs.exports.allowed.hosts 350 | * rw 351 | 352 | 353 | 354 | nfs.file.dump.dir 355 | /tmp/.hdfs-nfs 356 | 357 | 358 | -------------------------------------------------------------------------------- /src/main/resources/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | hive.metastore.uris 4 | thrift://ha03:9083,thrift://ha04:9083 5 | 6 | 7 | hive.metastore.warehouse.dir 8 | /apps/hive/warehouse 9 | 10 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | ### set log levels ### 2 | log4j.rootLogger = INFO , console , debug , error 3 | 4 | ### console ### 5 | log4j.appender.console = org.apache.log4j.ConsoleAppender 6 | log4j.appender.console.Target = System.out 7 | log4j.appender.console.layout = org.apache.log4j.PatternLayout 8 | log4j.appender.console.layout.ConversionPattern = %-d{yyyy-MM-dd HH\:mm\:ss} [%p]-[%c] %m%n 9 | 10 | ### log file ### 11 | log4j.appender.debug = org.apache.log4j.DailyRollingFileAppender 12 | log4j.appender.debug.File = ../log/debug.log 13 | log4j.appender.debug.Append = true 14 | log4j.appender.debug.Threshold = INFO 15 | log4j.appender.debug.layout = org.apache.log4j.PatternLayout 16 | log4j.appender.debug.layout.ConversionPattern = %-d{yyyy-MM-dd HH\:mm\:ss} [%p]-[%c] %m%n 17 | log4j.appender.debug.DatePattern='_' yyyy-MM-dd 18 | 19 | ### exception ### 20 | log4j.appender.error = org.apache.log4j.DailyRollingFileAppender 21 | 
log4j.appender.error.File = ../log/error.log 22 | log4j.appender.error.Append = true 23 | log4j.appender.error.Threshold = ERROR 24 | log4j.appender.error.layout = org.apache.log4j.PatternLayout 25 | log4j.appender.error.layout.ConversionPattern = %-d{yyyy-MM-dd HH\:mm\:ss} [%p]-[%c] %m%n 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/test/java/com/xkj/mlrc/clean/util/HdfsUtilsTest.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.util; 2 | 3 | 4 | import org.junit.Test; 5 | import org.junit.runner.RunWith; 6 | 7 | import java.io.IOException; 8 | 9 | import static org.junit.Assert.*; 10 | 11 | /** 12 | * @author lijf@2345.com 13 | * @date 2020/4/23 13:30 14 | * @desc 15 | */ 16 | 17 | public class HdfsUtilsTest { 18 | 19 | 20 | @Test 21 | public void movefile() throws IOException { 22 | HdfsUtils.movefile("/user/lijf/2019-01-03/zhangyong.log",".Trash/user/lijf/2019-01-03/zhangyong.log"); 23 | 24 | } 25 | 26 | @Test 27 | public void delete() throws IOException { 28 | System.setProperty("HADOOP_USER_NAME", "lijf"); 29 | 30 | HdfsUtils.delete("/user/lijf/2019-01-03"); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/java/com/xkj/mlrc/clean/util/JdbcHelperTest.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.clean.util; 2 | 3 | 4 | import org.junit.Test; 5 | 6 | import java.sql.*; 7 | 8 | 9 | /** 10 | * @author lijf@2345.com 11 | * @date 2020/4/23 15:55 12 | * @desc 13 | */ 14 | 15 | public class JdbcHelperTest { 16 | 17 | @Test 18 | public void getConnection() throws Exception { 19 | JdbcHelper jdbcHelper = JdbcHelper.getHiveInstance(); 20 | Connection connection = jdbcHelper.getConnection(); 21 | Statement statement = connection.createStatement(); 22 | ResultSet resultSet = statement.executeQuery("select * from all_hdfs_path"); 23 | while (resultSet.next()){ 24 | System.out.println(resultSet.getString(1)); 25 | 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/test/java/com/xkj/mlrc/fsimage/GetFromFsImageTest.java: -------------------------------------------------------------------------------- 1 | package com.xkj.mlrc.fsimage; 2 | 3 | 4 | import org.apache.commons.io.FileUtils; 5 | import org.apache.hadoop.fs.Path; 6 | import org.junit.Test; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.net.URL; 11 | import java.util.List; 12 | 13 | /** 14 | * @author lijf@2345.com 15 | * @date 2020/4/23 11:56 16 | * @desc 17 | */ 18 | 19 | public class GetFromFsImageTest { 20 | 21 | 22 | @Test 23 | public void getAllFiles() throws IOException { 24 | ClassLoader classLoader = GetFromFsImageTest.class.getClassLoader(); 25 | /** 26 | getResource()方法会去classpath下找这个文件,获取到url resource, 得到这个资源后,调用url.getFile获取到 文件 的绝对路径 27 | */ 28 | URL url = classLoader.getResource("exclusion_tables.txt"); 29 | /** 30 | * url.getFile() 得到这个文件的绝对路径 31 | */ 32 | System.out.println(url.getFile()); 33 | File file = new File(url.getFile()); 34 | System.out.println(file.exists()); 35 | List exclusionTables = FileUtils.readLines(file); 36 | 37 | String s = "fds/fdf/hfhgf/hjgf"; 38 | Path path = new Path(s); 39 | System.out.println(path.getName()); 40 | } 41 | 42 | 43 | } 44 | --------------------------------------------------------------------------------
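To close, a hedged end-to-end sketch of how the pieces above fit together: GenerateFsimageTable loads the parsed fsimage into the Hive table fsimage, and GetFromFsImageInfo then filters it with Spark SQL. The driver below is an illustration under stated assumptions, not code from the repository: the SparkSession settings, paths, suffixes and the 30-day expiry are invented, and the raw Dataset types from the source are treated as Dataset<Row>.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.xkj.mlrc.fsimage.GetFromFsImageInfo;

/** Hypothetical driver: query the fsimage Hive table and list expired files and dirs. */
public class FsImageCleanSketch {
    public static void main(String[] args) {
        // Hive support is needed because GetFromFsImageInfo runs spark.sql over the fsimage table.
        SparkSession spark = SparkSession.builder()
                .appName("fsimage-clean-sketch")
                .enableHiveSupport()
                .getOrCreate();

        // Builder method names mirror the Lombok @Builder fields shown above.
        GetFromFsImageInfo info = GetFromFsImageInfo.builder()
                .spark(spark)
                .targetPath("/user/lijf,/tmp/logs")   // comma-separated roots to scan (example)
                .avoidPath("/apps/hive/warehouse")    // never touch the Hive warehouse (example)
                .avoidSuffix(".jar,_SUCCESS")         // skip files with these suffixes (example)
                .expire(30)                           // only files not accessed for 30+ days (example)
                .hdfsroot("hdfs://cluster")           // example; ad.hdfs.root.uri in config.properties uses hdfs://cluster
                .build();

        Dataset<Row> expiredFiles = info.getAllFiles();
        expiredFiles.select("path", "accesstime", "filesize").show(20, false);

        Dataset<Row> expiredDirs = info.getAllShouldDelDirsByLastAccesstime();
        expiredDirs.select("hdfs_abs_path", "accesstime").show(20, false);
    }
}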