├── .gitignore ├── build.bat ├── build.sh ├── libs ├── gbase-connector-java-8.3.81.53-build52.8-bin.jar ├── greenplum-jdbc-5.1.4.jar ├── greenplum-spark_2.11-1.6.0.jar ├── kingbasejdbc3.jar ├── mysql-connector-java-8.0.11.jar ├── ojdbc6.jar ├── redshift-jdbc42-1.2.43.1067.jar ├── redshift-jdbc42-no-awssdk-1.2.43.1067.jar ├── tdgssconfig.jar ├── terajdbc4.jar ├── tispark-assembly-2.3.11.jar ├── zdh_log4j_jdbc.jar └── zdh_rqueue.jar ├── pom.xml ├── readme.md ├── release ├── bin │ ├── ljars.sh │ ├── start_server.sh │ └── stop_server.sh ├── conf │ ├── application.conf │ ├── datasources.properties │ └── log4j.properties └── copy_spark_jars │ ├── gbase-connector-java-8.3.81.53-build52.8-bin.jar │ ├── greenplum-jdbc-5.1.4.jar │ ├── greenplum-spark_2.11-1.6.0.jar │ ├── hutool-all-4.4.5.jar │ ├── kingbasejdbc3.jar │ ├── mysql-connector-java-8.0.11.jar │ ├── ojdbc6.jar │ ├── redisson-3.22.0.jar │ ├── redshift-jdbc42-1.2.43.1067.jar │ ├── tdgssconfig.jar │ ├── terajdbc4.jar │ ├── tispark-assembly-2.3.11.jar │ ├── zdh_log4j_jdbc.jar │ └── zdh_rqueue.jar └── src ├── main ├── resources │ ├── application.conf │ ├── datasources.properties │ └── log4j.properties └── scala │ ├── com │ └── zyc │ │ ├── SystemInit.scala │ │ ├── common │ │ ├── HACommon.scala │ │ ├── Log4jJDBCAppender.scala │ │ ├── LogCommon.scala │ │ ├── MariadbCommon.scala │ │ ├── MongoDbLoggingEventBsonifier.java │ │ ├── RedisCommon.scala │ │ ├── ServerSparkListener.scala │ │ ├── SparkBuilder.scala │ │ └── ZdhMongoDbAppender.java │ │ ├── netty │ │ ├── HttpBaseHandler.scala │ │ ├── HttpServerHandler.scala │ │ └── NettyServer.scala │ │ ├── util │ │ ├── DateUtil.scala │ │ ├── HttpUtil.scala │ │ ├── JsonSchemaBuilder.scala │ │ ├── JsonUtil.scala │ │ └── StringDefault.scala │ │ └── zdh │ │ ├── DataSources.scala │ │ ├── ZdhDataSources.scala │ │ ├── ZdhHandler.scala │ │ └── datasources │ │ ├── CassandraDataSources.scala │ │ ├── DataFactorySources.scala │ │ ├── DataWareHouseSources.scala │ │ ├── DownDataSources.scala │ │ ├── ESDataSources.scala │ │ ├── FlumeDataSources.scala │ │ ├── FtpDataSources.scala │ │ ├── GreenplumDataSources.scala │ │ ├── HbaseDataSources.scala │ │ ├── HdfsDataSources.scala │ │ ├── HiveDataSources.scala │ │ ├── HttpDataSources.scala │ │ ├── IcebergDataSources.scala │ │ ├── ImageDataSources.scala │ │ ├── JdbcDataSources.scala │ │ ├── KafKaDataSources.scala │ │ ├── KuduDataSources.scala │ │ ├── LocalDataSources.scala │ │ ├── MemSqlDataSources.scala │ │ ├── MongoDBDataSources.scala │ │ ├── QualityDataSources.scala │ │ ├── RedisDataSources.scala │ │ ├── SFtpDataSources.scala │ │ ├── SolrDataSources.scala │ │ ├── TidbDataSources.scala │ │ ├── http │ │ ├── HttpOptions.scala │ │ ├── HttpRelation.scala │ │ └── HttpRelationProvider.scala │ │ └── sftp │ │ ├── DeleteTempFileShutdownHook.scala │ │ ├── SftpRelation.scala │ │ ├── SftpSource.scala │ │ ├── constants.scala │ │ └── util │ │ └── Utils.scala │ └── org │ └── apache │ └── spark │ └── sql │ └── execution │ └── datasources │ ├── clickhouse │ ├── ClickHouseDialect.scala │ ├── ClickHouseOptions.scala │ ├── ClickHouseRDD.scala │ ├── ClickHouseRelation.scala │ ├── ClickHouseRelationProvider.scala │ ├── ClickHouseUtils.scala │ ├── DriverRegistry.scala │ └── DriverWrapper.scala │ └── hive │ ├── DriverRegistry.scala │ ├── DriverWrapper.scala │ ├── HiveDialect.scala │ ├── HiveOptions.scala │ ├── HiveRDD.scala │ ├── HiveRelation.scala │ ├── HiveRelationProvider.scala │ └── HiveUtils.scala └── test ├── resources ├── datasources.propertites └── rules │ └── rules.drl └── scala └── com └── zyc 
├── AppTest.scala ├── TEST_TRAIT2.scala └── zdh ├── CassandraDataSourcesTest.scala ├── DataSourcesTest.scala ├── HbaseDataSourcesTest.scala ├── KafKaDataSourcesTest.scala ├── MinioDataSourcesTest.scala ├── MongoDBDataSourcesTest.scala ├── RedisDataSourcesTest.scala ├── T1.scala ├── datasources ├── ESDataSourcesTest.scala ├── FlumeDataSourcesTest.scala ├── FtpDataSourcesTest.scala ├── GreenplumDataSourcesTest.scala ├── HdfsDataSourcesTest.scala ├── IcebergDataSourcesTest.scala ├── JdbcDataSourcesTest.scala ├── KuduDataSourcesTest.scala └── TidbDataSourcesTest.scala └── sparkTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | *.iml 16 | *.ipr 17 | 18 | ### NetBeans ### 19 | nbproject/private/ 20 | build/ 21 | nbbuild/ 22 | dist/ 23 | nbdist/ 24 | .nb-gradle/ 25 | 26 | logs/ 27 | 28 | shell_script/ 29 | 30 | release/bin/logs/ 31 | release/libs 32 | 33 | spark_submit/ 34 | 35 | src/main/resources/application-tmp.properties 36 | 37 | ssh.sh 38 | 39 | quick.bat 40 | 41 | *-RELEASE/ 42 | *-RELEASE.tar 43 | 44 | derby.log 45 | metastore_db -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | mvn package -Dmaven.test.skip=true -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | mvn package -Dmaven.test.skip=true -------------------------------------------------------------------------------- /libs/gbase-connector-java-8.3.81.53-build52.8-bin.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/gbase-connector-java-8.3.81.53-build52.8-bin.jar -------------------------------------------------------------------------------- /libs/greenplum-jdbc-5.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/greenplum-jdbc-5.1.4.jar -------------------------------------------------------------------------------- /libs/greenplum-spark_2.11-1.6.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/greenplum-spark_2.11-1.6.0.jar -------------------------------------------------------------------------------- /libs/kingbasejdbc3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/kingbasejdbc3.jar -------------------------------------------------------------------------------- /libs/mysql-connector-java-8.0.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/mysql-connector-java-8.0.11.jar -------------------------------------------------------------------------------- /libs/ojdbc6.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/ojdbc6.jar -------------------------------------------------------------------------------- /libs/redshift-jdbc42-1.2.43.1067.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/redshift-jdbc42-1.2.43.1067.jar -------------------------------------------------------------------------------- /libs/redshift-jdbc42-no-awssdk-1.2.43.1067.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/redshift-jdbc42-no-awssdk-1.2.43.1067.jar -------------------------------------------------------------------------------- /libs/tdgssconfig.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/tdgssconfig.jar -------------------------------------------------------------------------------- /libs/terajdbc4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/terajdbc4.jar -------------------------------------------------------------------------------- /libs/tispark-assembly-2.3.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/tispark-assembly-2.3.11.jar -------------------------------------------------------------------------------- /libs/zdh_log4j_jdbc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/zdh_log4j_jdbc.jar -------------------------------------------------------------------------------- /libs/zdh_rqueue.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/libs/zdh_rqueue.jar -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # 技术栈 2 | 3 | + spark 2.4.4 4 | + hadoop 3.1.x 5 | + hive > 2.3.3 6 | + kafka 1.x,2.x 7 | + scala 2.11.12 8 | + java 1.8 9 | + hbase < 1.3.6 (可选) 10 | 11 | # 提示 12 | zdh_server改名为zdh_spark 13 | zdh 分2部分,前端配置+后端数据ETL处理,此部分只包含ETL处理 14 | 前端配置项目 请参见项目 https://github.com/zhaoyachao/zdh_web 15 | zdh_web 和zdh_server 保持同步 大版本会同步兼容 如果zdh_web 选择版本1.0 ,zdh_server 使用1.x 都可兼容 16 | 二次开发同学 请选择dev 分支,dev 分支只有测试通过才会合并master,所以master 可能不是最新的,但是可保证可用性 17 | 18 | # 在线预览 19 | http://zycblog.cn:8081/login 20 | 用户名:zyc 21 | 密码:123456 22 | 23 | 服务器资源有限,界面只供预览,不包含数据处理部分,谢码友们手下留情 24 | 25 | # 项目介绍 26 | 27 | 数据采集ETL处理,通过spark 平台抽取数据,并根据etl 相关函数,做数据处理 28 | 新增数据源需要继承ZdhDataSources 公共接口,重载部分函数即可 29 | 30 | # 项目编译打包 31 | 项目采用maven 管理 32 | 打包命令,在当前项目目录下执行 33 | window: mvn package -Dmaven.test.skip=true 34 | 35 | 项目需要的jar 会自动生成到zdh_spark-xxxx-RELEASE 目录下 36 | 37 | 38 | # 部署 39 | 1 拷贝zdh_spark-xxxx-RELEASE到服务器(linux) 40 | 2 拷贝zdh_spark-xxxx-RELEASE/copy_spark_jars 目录下的jar 
拷贝到spark home 目录下的jars 目录 41 | 3 修改zdh_spark-xxxx-RELEASE/conf/datasources.propertites 42 | 4 修改zdh_spark-xxxx-RELEASE/conf/log4j.propertites 43 | 5 配置系统spark环境变变量SPARK_HOME 44 | 6 启动脚本 start_server.sh 45 | 46 | # 启动脚本 47 | 注意项目需要用到log4j.properties 需要单独放到driver 机器上,启动采用client 模式 48 | 49 | 50 | # 停止脚本 51 | kill `ps -ef |grep SparkSubmit |grep zdh_server |awk -F ' ' '{print $2}'` 52 | 53 | # 个人联系方式 54 | 邮件:1209687056@qq.com 55 | 56 | # FAQ 57 | 使用tidb 连接时,需要在zdh_server 启动配置文件中添加如下配置 58 | spark.tispark.pd.addresses 192.168.1.100:2379 59 | spark.sql.extensions org.apache.spark.sql.TiExtensions 60 | 61 | # 版本更新说明 62 | + v5.1.1 修复http数据源 63 | + v5.3.0 优化pom文件 64 | 65 | + v5.3.4 支持消息队列获取任务 66 | + v5.3.5 升级优先级队列版本 67 | + v5.3.6 无改动仅配合版本变更 68 | 69 | + v5.4.0 修复启动jar缺失redisson,hutool 70 | 71 | + v5.6.3 hdfs支持kerberos认证 -------------------------------------------------------------------------------- /release/bin/ljars.sh: -------------------------------------------------------------------------------- 1 | classp= 2 | for jar in `ls $1/libs`; 3 | do 4 | if [ -z "$classp" ]; then 5 | classp="$1/libs/$jar" 6 | else 7 | classp="$classp,$1/libs/$jar" 8 | fi 9 | done 10 | echo $classp 11 | -------------------------------------------------------------------------------- /release/bin/start_server.sh: -------------------------------------------------------------------------------- 1 | set ff=unix 2 | BIN_PATH=$(cd `dirname $0`; pwd) 3 | BASE_RUN_PATH=$(cd "$BIN_PATH/../"; pwd) 4 | files=`sh $BASE_RUN_PATH/bin/ljars.sh $BASE_RUN_PATH` 5 | echo $files 6 | nohup ${SPARK_HOME}/bin/spark-submit \ 7 | --class com.zyc.SystemInit \ 8 | --driver-memory 800M \ 9 | --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:$BASE_RUN_PATH/conf/log4j.properties" \ 10 | --driver-class-path $BASE_RUN_PATH/conf:$BASE_RUN_PATH/libs \ 11 | --files $BASE_RUN_PATH/conf/application.conf,$BASE_RUN_PATH/conf/datasources.properties \ 12 | --jars $files \ 13 | $BASE_RUN_PATH/zdh_spark.jar \ 14 | >zdh_spark.log & 15 | -------------------------------------------------------------------------------- /release/bin/stop_server.sh: -------------------------------------------------------------------------------- 1 | kill `ps -ef |grep SparkSubmit |grep zdh_server |awk -F ' ' '{print $2}'` 2 | -------------------------------------------------------------------------------- /release/conf/application.conf: -------------------------------------------------------------------------------- 1 | server{ 2 | host="" 3 | port=60001 4 | } 5 | #必须写 6 | instance=zdh_server 7 | 8 | #启动之后是否被web端可发现,0:不可用,1:可用 9 | online=1 10 | 11 | #单位秒 12 | time_interval=10 13 | 14 | #spark 历史服务器,可为空 15 | spark_history_server="http://127.0.0.1:18080/api/v1" 16 | 17 | 18 | redis{ 19 | #signle,cluster,如果不使用redis 存储model 为空 20 | model="" 21 | url="127.0.0.1:6379" 22 | password="zyczzu" 23 | } 24 | 25 | queue{ 26 | pre_key="zdh_spark_etl_queue" 27 | } -------------------------------------------------------------------------------- /release/conf/datasources.properties: -------------------------------------------------------------------------------- 1 | enable=true 2 | url=jdbc:mysql://127.0.0.1:3306/zdh?serverTimezone=GMT%2B8&useSSL=false 3 | driver=com.mysql.cj.jdbc.Driver 4 | username=zyc 5 | password=123456 -------------------------------------------------------------------------------- /release/conf/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO,appender1,file 2 | 
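# (note) appender2 ships com.zyc log events to MySQL via com.zyc.common.Log4jJDBCAppender, configured below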
log4j.logger.com.zyc=DEBUG,appender2 3 | 4 | log4j.appender.appender1=org.apache.log4j.ConsoleAppender 5 | log4j.appender.appender1.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.appender1.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss:SSS}[%p]: %m%n 7 | 8 | log4j.appender.file = org.apache.log4j.RollingFileAppender 9 | log4j.appender.file.File=logs/server.log 10 | log4j.appender.file.MaxFileSize=10mb 11 | log4j.appender.file.Threshold=DEBUG 12 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 13 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss:SSS}[%p]: %m%n 14 | 15 | log4j.appender.appender2=com.zyc.common.Log4jJDBCAppender 16 | log4j.appender.appender2.driver=com.mysql.cj.jdbc.Driver 17 | log4j.appender.appender2.URL=jdbc:mysql://127.0.0.1:3306/zdh?serverTimezone=GMT%2B8&useSSL=false&allowPublicKeyRetrieval=true 18 | log4j.appender.appender2.user=zyc 19 | log4j.appender.appender2.password=123456 20 | log4j.appender.appender2.sql=insert into zdh_logs (task_logs_id,job_id,log_time,msg,level) VALUES ('%X{task_logs_id}','%X{job_id}','%d{yyyy-MM-dd HH:mm:ss}', "%20c %m",'%p') 21 | log4j.appender.appender2.layout=org.apache.log4j.PatternLayout -------------------------------------------------------------------------------- /release/copy_spark_jars/gbase-connector-java-8.3.81.53-build52.8-bin.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/gbase-connector-java-8.3.81.53-build52.8-bin.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/greenplum-jdbc-5.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/greenplum-jdbc-5.1.4.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/greenplum-spark_2.11-1.6.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/greenplum-spark_2.11-1.6.0.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/hutool-all-4.4.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/hutool-all-4.4.5.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/kingbasejdbc3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/kingbasejdbc3.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/mysql-connector-java-8.0.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/mysql-connector-java-8.0.11.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/ojdbc6.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/ojdbc6.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/redisson-3.22.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/redisson-3.22.0.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/redshift-jdbc42-1.2.43.1067.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/redshift-jdbc42-1.2.43.1067.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/tdgssconfig.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/tdgssconfig.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/terajdbc4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/terajdbc4.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/tispark-assembly-2.3.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/tispark-assembly-2.3.11.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/zdh_log4j_jdbc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/zdh_log4j_jdbc.jar -------------------------------------------------------------------------------- /release/copy_spark_jars/zdh_rqueue.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/release/copy_spark_jars/zdh_rqueue.jar -------------------------------------------------------------------------------- /src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | server{ 2 | host="" 3 | port=60001 4 | } 5 | #必须写 6 | instance=zdh_server 7 | 8 | #启动之后是否被web端可发现,0:不可用,1:可用 9 | online=1 10 | 11 | #单位秒 12 | time_interval=10 13 | 14 | #spark 历史服务器,可为空 15 | spark_history_server="http://127.0.0.1:18080/api/v1" 16 | 17 | 18 | redis{ 19 | #signle,cluster,如果不使用redis 存储model 为空 20 | model="" 21 | url="127.0.0.1:6379" 22 | password="zyczzu" 23 | } 24 | 25 | queue{ 26 | pre_key="zdh_spark_etl_queue" 27 | } -------------------------------------------------------------------------------- /src/main/resources/datasources.properties: -------------------------------------------------------------------------------- 1 | enable=true 2 | 
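# JDBC connection for the zdh metadata database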
url=jdbc:mysql://127.0.0.1:3306/zdh?serverTimezone=GMT%2B8&useSSL=false&allowPublicKeyRetrieval=true 3 | driver=com.mysql.cj.jdbc.Driver 4 | username=zyc 5 | password=123456 -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO,appender1,file 2 | log4j.logger.com.zyc=INFO,mysql 3 | 4 | log4j.appender.appender1=org.apache.log4j.ConsoleAppender 5 | log4j.appender.appender1.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.appender1.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss:SSS}[%p]: %m%n 7 | 8 | log4j.appender.file = org.apache.log4j.RollingFileAppender 9 | log4j.appender.file.File=logs/server.log 10 | log4j.appender.file.MaxFileSize=10mb 11 | log4j.appender.file.Threshold=DEBUG 12 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 13 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss:SSS}[%p]: %m%n 14 | 15 | log4j.appender.mysql=com.zyc.common.Log4jJDBCAppender 16 | log4j.appender.mysql.driver=com.mysql.cj.jdbc.Driver 17 | log4j.appender.mysql.URL=jdbc:mysql://127.0.0.1:3306/zdh?serverTimezone=GMT%2B8&useSSL=false&allowPublicKeyRetrieval=true 18 | log4j.appender.mysql.user=zyc 19 | log4j.appender.mysql.password=123456 20 | log4j.appender.mysql.sql=insert into zdh_logs (task_logs_id,job_id,log_time,msg,level) VALUES ('%X{task_logs_id}','%X{job_id}','%d{yyyy-MM-dd HH:mm:ss}', "%20c %m",'%p') 21 | log4j.appender.mysql.layout=org.apache.log4j.PatternLayout 22 | 23 | # mongoDB和 24 | #log4j.appender.MongoDB=com.zyc.common.ZdhMongoDbAppender 25 | #log4j.appender.MongoDB.databaseName=zdh 26 | #log4j.appender.MongoDB.collectionName=zdhLogs 27 | #log4j.appender.MongoDB.hostname=192.168.110.10 28 | #log4j.appender.MongoDB.port=27017 29 | #log4j.appender.MongoDB.layout=org.apache.log4j.PatternLayout -------------------------------------------------------------------------------- /src/main/scala/com/zyc/SystemInit.scala: -------------------------------------------------------------------------------- 1 | package com.zyc 2 | 3 | import java.util.concurrent.{LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit} 4 | 5 | import com.typesafe.config.{Config, ConfigFactory} 6 | import com.zyc.base.util.{HttpUtil, JsonUtil} 7 | import com.zyc.common.{MariadbCommon, ServerSparkListener, SparkBuilder} 8 | import com.zyc.netty.NettyServer 9 | import com.zyc.rqueue.{RQueueManager, RQueueMode} 10 | import com.zyc.zdh.ZdhHandler 11 | import org.apache.log4j.MDC 12 | import org.slf4j.LoggerFactory 13 | 14 | object SystemInit { 15 | 16 | val logger = LoggerFactory.getLogger(this.getClass) 17 | //心跳检测路径 18 | val keeplive_url="/api/v1/zdh/keeplive" 19 | 20 | private val threadpool = new ThreadPoolExecutor( 21 | 1, // core pool size 22 | 1, // max pool size 23 | 500, // keep alive time 24 | TimeUnit.MILLISECONDS, 25 | new LinkedBlockingQueue[Runnable]() 26 | ) 27 | 28 | def main(args: Array[String]): Unit = { 29 | MDC.put("job_id", "001") 30 | val configLoader=ConfigFactory.load("application.conf") 31 | var host = configLoader.getConfig("server").getString("host") 32 | val port = configLoader.getConfig("server").getString("port") 33 | val zdh_instance = configLoader.getString("instance") 34 | val time_interval = configLoader.getString("time_interval").toLong 35 | val spark_history_server = configLoader.getString("spark_history_server") 36 | val online = configLoader.getString("online") 37 | 38 | 
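// initialize the Redis-backed task queue (RQueueManager) using redis.url / redis.password from application.conf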
initRQueue(configLoader) 39 | 40 | logger.info("开始初始化SparkSession") 41 | val spark = SparkBuilder.getSparkSession() 42 | val uiWebUrl = spark.sparkContext.uiWebUrl.get 43 | if(host.trim.equals("")){ 44 | host=uiWebUrl.split(":")(1).substring(2) 45 | } 46 | val applicationId=spark.sparkContext.applicationId 47 | val master=spark.sparkContext.master 48 | spark.sparkContext.master 49 | try{ 50 | //org.apache.hadoop.conf.Configuration 51 | MariadbCommon.insertZdhHaInfo(zdh_instance,host , port, uiWebUrl.split(":")(2),applicationId,spark_history_server,master,online) 52 | logger.info("开始初始化netty server") 53 | new Thread(new Runnable { 54 | override def run(): Unit = new NettyServer().start() 55 | }).start() 56 | 57 | consumer(configLoader) 58 | 59 | while (true){ 60 | val list=MariadbCommon.getZdhHaInfo() 61 | 62 | list.filter(map=> !map.getOrElse("zdh_host","").equals(host) || !map.getOrElse("zdh_port","").equals(port)) 63 | .foreach(map=>{ 64 | val remote_host=map.getOrElse("zdh_host","") 65 | val remote_port=map.getOrElse("zdh_port","") 66 | val remote_url="http://"+remote_host+":"+remote_port+keeplive_url 67 | try{ 68 | val rs=HttpUtil.get(remote_url,Seq.empty[(String,String)]) 69 | }catch { 70 | case ex:Exception=>MariadbCommon.delZdhHaInfo(map.getOrElse("id","-1")) 71 | } 72 | }) 73 | 74 | if(list.filter(map=> map.getOrElse("zdh_host","").equals(host) && map.getOrElse("zdh_port","").equals(port)).size<1){ 75 | logger.debug("当前节点丢失,重新注册当前节点") 76 | MariadbCommon.insertZdhHaInfo(zdh_instance,host , port, uiWebUrl.split(":")(2),applicationId,spark_history_server,master,online) 77 | }else{ 78 | logger.debug("当前节点存在,更新当前节点") 79 | val instance=list.filter(map=> map.getOrElse("zdh_host","").equals(host) && map.getOrElse("zdh_port","").equals(port))(0) 80 | val id=instance.getOrElse("id","-1") 81 | MariadbCommon.updateZdhHaInfoUpdateTime(id) 82 | if(instance.getOrElse("online","0").equalsIgnoreCase("2")){ 83 | if(ServerSparkListener.jobs.size()<=0){ 84 | logger.info("当前节点物理下线成功") 85 | MariadbCommon.delZdhHaInfo("enabled",host,port) 86 | System.exit(0) 87 | }else{ 88 | logger.info("当前节点存在正在执行的任务,任务执行完成,自动物理下线") 89 | } 90 | }else if(instance.getOrElse("online","0").equalsIgnoreCase("0")){ 91 | if(ServerSparkListener.jobs.size()<=0){ 92 | logger.info("当前节点逻辑下线成功") 93 | }else{ 94 | logger.info("当前节点存在正在执行的任务,任务执行完成,自动逻辑下线") 95 | } 96 | } 97 | 98 | } 99 | 100 | Thread.sleep(time_interval*1000) 101 | } 102 | 103 | 104 | }catch { 105 | case ex:Exception=>{ 106 | logger.error(ex.getMessage) 107 | } 108 | }finally { 109 | MariadbCommon.delZdhHaInfo("enabled",host, port) 110 | } 111 | 112 | } 113 | 114 | def initRQueue(config: Config): Unit = { 115 | val url = config.getString("redis.url") 116 | val auth = config.getString("redis.password") 117 | RQueueManager.buildDefault(url, auth) 118 | } 119 | 120 | def consumer(config: Config): Unit ={ 121 | 122 | val queue_pre = config.getString("queue.pre_key") 123 | val instance = config.getString("instance") 124 | val queue = queue_pre + "_" + instance 125 | logger.info("加载当前queue: "+queue) 126 | threadpool.execute(new Runnable { 127 | override def run(): Unit = { 128 | 129 | //延迟启动30s 130 | Thread.sleep(1000*30) 131 | while(true){ 132 | import util.control.Breaks._ 133 | breakable { 134 | var rqueueClient = RQueueManager.getRQueueClient(queue, RQueueMode.BLOCKQUEUE) 135 | var o = rqueueClient.poll() 136 | if(o == null){ 137 | break() 138 | } 139 | 140 | val param:Map[String, Any] = JsonUtil.jsonToMap(o.toString) 141 | val dispatchOptions = param.getOrElse("tli", 
Map.empty[String, Any]).asInstanceOf[Map[String, Any]] 142 | val dispatch_task_id = dispatchOptions.getOrElse("job_id", "001").toString 143 | val task_logs_id=param.getOrElse("task_logs_id", "001").toString 144 | val etl_date = JsonUtil.jsonToMap(dispatchOptions.getOrElse("params", "").toString).getOrElse("ETL_DATE", "").toString 145 | MariadbCommon.updateTaskStatus(task_logs_id, dispatch_task_id, "etl", etl_date, "22") 146 | 147 | try{ 148 | //消费队列,调用 149 | val more_task = dispatchOptions.getOrElse("more_task", "") 150 | if(more_task.equals("ETL")){ 151 | ZdhHandler.etl(param) 152 | }else if(more_task.equals("MORE_ETL")){ 153 | ZdhHandler.moreEtl(param) 154 | }else if(more_task.toString.toUpperCase.equals("QUALITY")){ 155 | ZdhHandler.quality(param) 156 | }else if(more_task.toString.toUpperCase.equals("APPLY")){ 157 | ZdhHandler.apply(param) 158 | }else if(more_task.toString.toUpperCase.equals("DROOLS")){ 159 | ZdhHandler.droolsEtl(param) 160 | }else if(more_task.toString.toUpperCase.equals("SQL")){ 161 | ZdhHandler.sqlEtl(param) 162 | }else{ 163 | throw new Exception("未知的more_task: "+more_task) 164 | } 165 | 166 | }catch { 167 | case ex:Exception=>{ 168 | logger.error("[数据采集]:[consumer]:[ERROR]:" + ex.getMessage, ex.getCause) 169 | MariadbCommon.updateTaskStatus2(task_logs_id,dispatch_task_id,dispatchOptions,etl_date) 170 | } 171 | } 172 | 173 | } 174 | 175 | 176 | 177 | } 178 | 179 | } 180 | }) 181 | } 182 | 183 | 184 | } 185 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/HACommon.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.common 2 | 3 | import java.net.{InetSocketAddress, Socket} 4 | 5 | import com.typesafe.config.ConfigFactory 6 | import com.zyc.SystemInit.logger 7 | import com.zyc.base.util.{HttpUtil, JsonUtil} 8 | 9 | object HACommon { 10 | 11 | var enable=false 12 | var port="" 13 | var zdh_instance="" 14 | var current_host = "" 15 | var etcd_cluster ="" 16 | var timeout = 1000*5 17 | val checkServiceTime=1000*5 18 | var etcd_host=Array.empty[String] 19 | 20 | val ha_key="/v2/keys/zdh_ha" 21 | val ha_lock_key="/v2/keys/zdh_ha_lock" 22 | val ha_preExist="?prevExist=false&value=" 23 | val ha_update="?value=" 24 | 25 | /** 26 | * 返回 true 时 外部可以启动任务 27 | * @param configFile 28 | */ 29 | def etcdHA(configFile:String): Boolean ={ 30 | 31 | val config = ConfigFactory.load(configFile) 32 | zdh_instance = config.getString("instance") 33 | if (config.getConfig("zdh_ha").getBoolean("enable")) { 34 | enable=true 35 | port = config.getConfig("server").getString("port") 36 | current_host = config.getConfig("zdh_ha").getString("current_host") 37 | 38 | etcd_cluster = config.getConfig("zdh_ha").getString("etcd_cluster") 39 | timeout = config.getConfig("zdh_ha").getInt("timeout") 40 | etcd_host = etcd_cluster.split(",") 41 | 42 | return HAProcess() 43 | } 44 | enable=false 45 | true 46 | } 47 | 48 | 49 | def HAProcess(): Boolean ={ 50 | //检测到服务死亡 51 | checkService(true,etcd_host) 52 | 53 | //获取锁 54 | val lock=getEtcdLock(true) 55 | 56 | while (!lock){ 57 | //否则开始监听锁,返回true 表示监听到了事件 58 | waitLock(true) 59 | HAProcess() 60 | } 61 | 62 | lock 63 | } 64 | 65 | /** 66 | * 检测服务 67 | * @param enable 68 | * @param etcd_host 69 | */ 70 | def checkService(enable:Boolean,etcd_host:Array[String]): Unit = { 71 | println("检测HA") 72 | if (enable) { 73 | 74 | //默认false 表示不起新服务 75 | var retry = false 76 | var cluster_index = 0 77 | 78 | //etcd 请求创建Ha 标识,返回true 标识创建成功, 79 | val result = 
putEtcd(etcd_host, ha_key+ha_preExist+current_host) 80 | 81 | if (result == false) { 82 | println("项目已经..开始检查远程服务器是否存活") 83 | retry = isRetry(etcd_host, ha_key, port, timeout) 84 | } 85 | 86 | 87 | while (retry == false) { 88 | println("检查远程服务器存活") 89 | Thread.sleep(checkServiceTime) 90 | retry = isRetry(etcd_host, ha_key, port, timeout) 91 | } 92 | println("检查远程服务器不可达...准备启动新服务") 93 | 94 | } 95 | } 96 | 97 | /** 98 | * 等待锁 99 | */ 100 | def waitLock(enable:Boolean): Unit ={ 101 | 102 | println("等待锁") 103 | var change=false 104 | val config = ConfigFactory.load("application.conf") 105 | if (enable) { 106 | 107 | var cluster_index = 0 108 | //url="/v2/keys/zdh_ha?value="+current_host 109 | var retry = true 110 | var result = "" 111 | while (retry) { 112 | try { 113 | result = HttpUtil.get("http://" + etcd_host(cluster_index) + ha_lock_key+"?wait=true", Seq.empty) 114 | retry = false 115 | change=true 116 | } catch { 117 | case ex: Exception => { 118 | ex.printStackTrace() 119 | logger.info("请求etcd 出错,重新尝试") 120 | cluster_index = cluster_index + 1 121 | if (cluster_index > etcd_host.size) { 122 | retry = false 123 | } 124 | 125 | } 126 | } 127 | 128 | } 129 | 130 | } 131 | 132 | change 133 | 134 | } 135 | 136 | /** 137 | * 获取锁 138 | * @param enable 139 | * @return 140 | */ 141 | def getEtcdLock(enable:Boolean): Boolean = { 142 | println("尝试获取锁") 143 | var lock=false 144 | if (enable) { 145 | lock = putEtcd(etcd_host, ha_lock_key+"?prevExist=false&value=" + current_host) 146 | } 147 | 148 | lock 149 | 150 | } 151 | 152 | 153 | def deleteEtcdLock(): Unit ={ 154 | 155 | if(enable){ 156 | deleteEtcd(etcd_host,ha_lock_key) 157 | } 158 | } 159 | 160 | def updateEtcd(): Unit ={ 161 | if(enable){ 162 | putEtcd(etcd_host,ha_key+ha_update+current_host) 163 | } 164 | } 165 | 166 | /** 167 | * 168 | * @param etcd_host 169 | * @param url 170 | * @return 返回值表示 是否插入成功 171 | */ 172 | def putEtcd(etcd_host: Array[String], url: String): Boolean = { 173 | var cluster_index = 0 174 | //url="/v2/keys/zdh_ha?value="+current_host 175 | var retry = true 176 | var result = "" 177 | while (retry) { 178 | try { 179 | result = HttpUtil.put("http://" + etcd_host(cluster_index) + url, Seq.empty) 180 | retry = false 181 | } catch { 182 | case ex: Exception => { 183 | ex.printStackTrace() 184 | logger.info("请求etcd 出错,重新尝试") 185 | cluster_index = cluster_index + 1 186 | if (cluster_index > etcd_host.size) { 187 | retry = false 188 | } 189 | 190 | } 191 | } 192 | 193 | } 194 | 195 | 196 | //判断返回结果 197 | if (result.equals("") || result.contains("errorCode")) 198 | return false 199 | else 200 | true 201 | 202 | } 203 | 204 | /** 205 | * 获取kv, 206 | * @param etcd_host 207 | * @param url 208 | * @return 209 | */ 210 | def getEtcd(etcd_host: Array[String], url: String): String = { 211 | var cluster_index = 0 212 | //url="/v2/keys/zdh_ha?value="+current_host 213 | var retry = true 214 | var result = "{}" 215 | while (retry) { 216 | try { 217 | result = HttpUtil.get("http://" + etcd_host(cluster_index) + url, Seq.empty) 218 | retry = false 219 | } catch { 220 | case ex: Exception => { 221 | ex.printStackTrace() 222 | logger.info("请求etcd 出错,重新尝试") 223 | cluster_index = cluster_index + 1 224 | if (cluster_index > etcd_host.size) { 225 | retry = false 226 | } 227 | 228 | } 229 | } 230 | 231 | } 232 | 233 | result 234 | 235 | } 236 | 237 | def deleteEtcd(etcd_host: Array[String], url: String): String = { 238 | println("删除ha 临时锁") 239 | var cluster_index = 0 240 | //url="/v2/keys/zdh_ha?value="+current_host 241 | var retry = true 242 | 
var result = "{}" 243 | while (retry) { 244 | try { 245 | result = HttpUtil.delete("http://" + etcd_host(cluster_index) + url, Seq.empty) 246 | retry = false 247 | } catch { 248 | case ex: Exception => { 249 | ex.printStackTrace() 250 | logger.info("请求etcd 出错,重新尝试") 251 | cluster_index = cluster_index + 1 252 | if (cluster_index > etcd_host.size) { 253 | retry = false 254 | } 255 | 256 | } 257 | } 258 | 259 | } 260 | 261 | result 262 | 263 | } 264 | 265 | /** 266 | * 判断主程序是否重启 267 | * 268 | * @param etcd_host 269 | * @param url 270 | * @param port 271 | * @param timeout 272 | * @return false/true 273 | */ 274 | def isRetry(etcd_host: Array[String], url: String, port: String, timeout: Int): Boolean = { 275 | 276 | val socket = new Socket(); 277 | val value = getEtcd(etcd_host, url) 278 | 279 | var retry = false 280 | //获取远程ip 281 | val remote_host = JsonUtil.jsonToMap(value).getOrElse("node", Map.empty[String, String]).asInstanceOf[Map[String, String]].getOrElse("value", "") 282 | 283 | try { 284 | //测试远程服务器存活 285 | println("连接远程服务器ip:"+remote_host+",port:"+port) 286 | socket.connect(new InetSocketAddress(remote_host, Integer.parseInt(port)), timeout); 287 | } catch { 288 | case ex: Exception => { 289 | println("远程服务器不可达") 290 | retry = true 291 | } 292 | } 293 | retry 294 | 295 | 296 | } 297 | } 298 | 299 | 300 | 301 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/Log4jJDBCAppender.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.common 2 | 3 | import java.sql.{Connection, DriverManager} 4 | 5 | import org.apache.log4j.jdbc.JDBCAppender 6 | 7 | class Log4jJDBCAppender extends JDBCAppender{ 8 | 9 | override def getConnection: Connection = { 10 | 11 | if (!DriverManager.getDrivers().hasMoreElements()) 12 | setDriver("com.mysql.cj.jdbc.Driver") 13 | 14 | if ((null != connection) && !connection.isValid(5000)) { 15 | println("日志服务器重连......") 16 | connection = null; 17 | } 18 | 19 | if (connection == null) { 20 | connection = DriverManager.getConnection(databaseURL, databaseUser, databasePassword); 21 | } 22 | 23 | return connection; 24 | 25 | 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/LogCommon.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.common 2 | 3 | import java.sql.Timestamp 4 | import java.util.Date 5 | import java.util.concurrent.LinkedBlockingDeque 6 | 7 | case class zhd_logs(id:String,log_time:Timestamp,msg:String,level:String) 8 | 9 | object LogCommon { 10 | 11 | val linkedBlockingDeque=new LinkedBlockingDeque[zhd_logs]() 12 | 13 | 14 | 15 | def info(msg:String,level:String="info")(implicit id:String): Unit ={ 16 | 17 | val lon_time=new Timestamp(new Date().getTime) 18 | linkedBlockingDeque.add(zhd_logs(id,lon_time,msg,level)) 19 | //System.out.println("id:"+id+",log_time:"+lon_time+",msg:"+msg) 20 | 21 | } 22 | 23 | new Thread(new Runnable { 24 | override def run(): Unit = { 25 | 26 | while (true){ 27 | val log=linkedBlockingDeque.take() 28 | // MariadbCommon.insertJob(log.id,log.log_time,log.msg,log.level); 29 | } 30 | } 31 | }).start() 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/MongoDbLoggingEventBsonifier.java: -------------------------------------------------------------------------------- 1 | package com.zyc.common; 2 | 
3 | import com.mongodb.BasicDBObject; 4 | import com.mongodb.DBObject; 5 | import org.apache.log4j.spi.LoggingEvent; 6 | import org.log4mongo.LoggingEventBsonifierImpl; 7 | 8 | import java.util.Date; 9 | 10 | public class MongoDbLoggingEventBsonifier extends LoggingEventBsonifierImpl { 11 | 12 | @Override 13 | public DBObject bsonify(LoggingEvent loggingEvent) { 14 | BasicDBObject result = null; 15 | if (loggingEvent != null) { 16 | result = new BasicDBObject(); 17 | String task_logs_id=String.valueOf(loggingEvent.getMDC("task_logs_id")); 18 | String job_id=String.valueOf(loggingEvent.getMDC("job_id")); 19 | 20 | result.put("task_logs_id", task_logs_id); 21 | result.put("job_id", job_id); 22 | result.put("log_time", new Date(loggingEvent.getTimeStamp())); 23 | this.nullSafePut(result, "level", loggingEvent.getLevel().toString()); 24 | this.nullSafePut(result, "msg", loggingEvent.getMessage()); 25 | this.addHostnameInformation(result); 26 | } 27 | return result; 28 | } 29 | 30 | } -------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/RedisCommon.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.common 2 | 3 | import com.typesafe.config.ConfigFactory 4 | import org.slf4j.LoggerFactory 5 | import redis.clients.jedis.{HostAndPort, Jedis, JedisCluster, JedisPool, JedisPoolConfig, Pipeline} 6 | 7 | object RedisCommon { 8 | 9 | val logger = LoggerFactory.getLogger(this.getClass) 10 | 11 | private var model=""; 12 | var jedis:Jedis=null; 13 | var jedisCluster:JedisCluster=null; 14 | 15 | def isRedis(): String ={ 16 | model 17 | } 18 | def connect(configFile:String): Unit ={ 19 | val config = ConfigFactory.load(configFile).getConfig("redis") 20 | if(config.getString("model").equalsIgnoreCase("signle")){ 21 | logger.info("初始化单机版本的redis") 22 | model="signle" 23 | val Array(host,port)=config.getString("url").split(":",2) 24 | val passwd=config.getString("password") 25 | val jedisConf: JedisPoolConfig = new JedisPoolConfig() 26 | val jedisPool = new JedisPool(jedisConf, host.toString, port.toInt, 10000, passwd) 27 | jedis=jedisPool.getResource 28 | 29 | }else if(config.getString("model").equalsIgnoreCase("cluster")){ 30 | logger.info("初始化集群版本的is") 31 | model="cluster" 32 | val hosts=config.getString("url").split(",") 33 | val hostAndPortsSet = new java.util.HashSet[HostAndPort]() 34 | hosts.foreach(host=>{ 35 | val Array(hs,port)=host.split(":",2) 36 | hostAndPortsSet.add(new HostAndPort(hs, port.toInt)) 37 | }) 38 | jedisCluster = new JedisCluster(hostAndPortsSet) 39 | } 40 | } 41 | 42 | 43 | def set(key:String,value:String): Unit ={ 44 | 45 | if(model.equals("signle")){ 46 | jedis.set(key,value) 47 | }else if(model.equalsIgnoreCase("cluster")){ 48 | jedisCluster.set(key,value) 49 | } 50 | 51 | } 52 | 53 | def get(key:String): String ={ 54 | if(model.equals("signle")){ 55 | jedis.get(key) 56 | }else if(model.equalsIgnoreCase("cluster")){ 57 | jedisCluster.get(key) 58 | }else{ 59 | "" 60 | } 61 | } 62 | 63 | def keys(par:String): java.util.Set[String] ={ 64 | if(model.equals("signle")){ 65 | jedis.keys(par) 66 | }else if(model.equalsIgnoreCase("cluster")){ 67 | jedisCluster.keys(par) 68 | }else{ 69 | null 70 | } 71 | } 72 | 73 | def pipeline(): Pipeline ={ 74 | if(model.equals("signle")){ 75 | jedis.pipelined() 76 | }else if(model.equalsIgnoreCase("cluster")){ 77 | throw new Exception("暂时不支持集群事务") 78 | }else{ 79 | null 80 | } 81 | } 82 | 83 | 84 | } 85 | 
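// Usage sketch, assuming application.conf defines redis.model/url/password,
// with model set to "signle" or "cluster" as spelled in the code above:
//   RedisCommon.connect("application.conf")
//   RedisCommon.set("zdh_key", "v1")
//   val cached = RedisCommon.get("zdh_key")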
-------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/ServerSparkListener.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.common 2 | 3 | import com.zyc.zdh.DataSources 4 | import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart, SparkListenerStageCompleted, SparkListenerTaskEnd, StageInfo} 5 | 6 | object ServerSparkListener{ 7 | 8 | //存储jobid 和进度 9 | val jobs=new java.util.concurrent.ConcurrentHashMap[Int,Int] 10 | //job->stage 11 | val stages=new java.util.concurrent.ConcurrentHashMap[Int,Seq[StageInfo]] 12 | //job->tasknum 13 | val tasks=new java.util.concurrent.ConcurrentHashMap[Int,Int] 14 | // stage->job 15 | val stage_job=new java.util.concurrent.ConcurrentHashMap[Int,Int] 16 | // job -> task_log_instance 17 | val job_tli=new java.util.concurrent.ConcurrentHashMap[Int,String] 18 | 19 | 20 | } 21 | 22 | class ServerSparkListener extends SparkListener { 23 | 24 | //INPUT 25,50 25 | //OUTPUT 50,100 26 | 27 | override def onJobStart(jobStart: SparkListenerJobStart): Unit = { 28 | jobStart.properties.keySet().toArray.foreach(key=> println(key+"==="+jobStart.properties.getProperty(key.toString))) 29 | var PROCESS=jobStart.properties.getProperty(DataSources.SPARK_ZDH_PROCESS) 30 | println("Process:"+PROCESS) 31 | if(PROCESS == null){ 32 | PROCESS=jobStart.properties.getProperty(DataSources.SPARK_ZDH_LOCAL_PROCESS, "OUTPUT") 33 | } 34 | val pro_num:Int = PROCESS match { 35 | case "INPUT" => 25 36 | case "OUTPUT" => 61 37 | } 38 | //获取对应的task_log_instance id 39 | val tli_id=jobStart.properties.getProperty("spark.jobGroup.id").split("_")(0) 40 | if(tli_id == null || tli_id.length() != 18){ 41 | return ; 42 | } 43 | ServerSparkListener.job_tli.put(jobStart.jobId,tli_id) 44 | MariadbCommon.updateTaskStatus3(tli_id,pro_num) 45 | ServerSparkListener.jobs.put(jobStart.jobId,pro_num) 46 | val total_tasks=jobStart.stageInfos.map(stage => stage.numTasks).sum 47 | ServerSparkListener.stages.put(jobStart.jobId,jobStart.stageInfos) 48 | ServerSparkListener.tasks.put(jobStart.jobId,total_tasks) 49 | jobStart.stageIds.map(sid=> ServerSparkListener.stage_job.put(sid,jobStart.jobId)) 50 | } 51 | 52 | override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { 53 | if(!ServerSparkListener.jobs.containsKey(jobEnd.jobId)){ 54 | return ; 55 | } 56 | ServerSparkListener.jobs.remove(jobEnd.jobId) 57 | ServerSparkListener.stages.get(jobEnd.jobId).foreach(stage=>{ 58 | ServerSparkListener.stage_job.remove(stage.stageId) 59 | }) 60 | ServerSparkListener.tasks.remove(jobEnd.jobId) 61 | ServerSparkListener.job_tli.remove(jobEnd.jobId) 62 | ServerSparkListener.stages.remove(jobEnd.jobId) 63 | } 64 | 65 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { 66 | println("stategId:"+taskEnd.stageId) 67 | } 68 | 69 | 70 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { 71 | val stageId=stageCompleted.stageInfo.stageId 72 | if(!ServerSparkListener.stage_job.containsKey(stageId)){ 73 | return ; 74 | } 75 | //stage 获取job 76 | val job_id=ServerSparkListener.stage_job.get(stageId) 77 | //获取所有的任务数 78 | val total_tasks=ServerSparkListener.tasks.get(job_id) 79 | //获取job_id对应的task_log_instance id 80 | val tli_id=ServerSparkListener.job_tli.get(job_id) 81 | val new_pro_num=ServerSparkListener.jobs.get(job_id)+35*(stageCompleted.stageInfo.numTasks/total_tasks) 82 | ServerSparkListener.jobs.put(job_id,new_pro_num) 83 | 
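// persist the recalculated progress for this task_log_instance record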
MariadbCommon.updateTaskStatus3(tli_id,new_pro_num) 84 | } 85 | 86 | 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/SparkBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.common 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object SparkBuilder{ 7 | private var sparkSession:SparkSession=null; 8 | def initSparkSession(): Unit ={ 9 | val sparkConf = new SparkConf() 10 | val system = System.getProperty("os.name"); 11 | if(system.toLowerCase().startsWith("win")){ 12 | sparkConf.setMaster("local[*]") 13 | } 14 | //sparkConf.setAppName("Spark Shenzhen SERVER") 15 | sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 16 | // sparkConf.set("hive.orc.splits.include.file.footer","true") 17 | // sparkConf.set("hive.exec.orc.default.stripe.size","268435456") 18 | // sparkConf.set("hive.exec.orc.split.strategy", "BI") 19 | // sparkConf.set("spark.sql.orc.impl","hive") 20 | // sparkConf.set("spark.sql.hive.convertMetastoreOrc","false") 21 | // sparkConf.set("spark.sql.orc.enableVectorizedReader","false") 22 | sparkConf.set("spark.sql.crossJoin.enabled","true") 23 | sparkConf.set("spark.extraListeners", classOf[ServerSparkListener].getName) 24 | // sparkConf.set("spark.sql.shuffle.partitions","2000") 25 | // sparkConf.set("spark.sql.extensions","org.apache.spark.sql.TiExtensions") 26 | // sparkConf.set("spark.tispark.pd.addresses","192.168.110.10:2379") 27 | // sparkConf.set("spark.sql.catalog.hadoop_prod.type", "hadoop") // 设置数据源类别为hadoop 28 | // sparkConf.set("spark.sql.catalog.hadoop_prod", classOf[].getName) 29 | // 指定Hadoop数据源的根目录 30 | // sparkConf.set("spark.sql.catalog.hadoop_prod.warehouse", "/data/iceberg") 31 | val sparkSession = SparkSession 32 | .builder() 33 | .config(sparkConf) 34 | .enableHiveSupport() 35 | .getOrCreate() 36 | // sparkSession.sql("set spark.sql.orc.filterPushdown=true") 37 | this.sparkSession=sparkSession 38 | } 39 | 40 | 41 | def getSparkSession(): SparkSession ={ 42 | synchronized{ 43 | if(sparkSession==null){ 44 | initSparkSession() 45 | } 46 | } 47 | sparkSession 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/common/ZdhMongoDbAppender.java: -------------------------------------------------------------------------------- 1 | package com.zyc.common; 2 | 3 | 4 | import com.mongodb.DBObject; 5 | import org.apache.commons.lang3.StringUtils; 6 | import org.apache.log4j.spi.LoggingEvent; 7 | import org.log4mongo.LoggingEventBsonifier; 8 | 9 | public class ZdhMongoDbAppender extends org.log4mongo.MongoDbAppender { 10 | 11 | private LoggingEventBsonifier bsonifier = new MongoDbLoggingEventBsonifier(); 12 | 13 | @Override 14 | protected void append(LoggingEvent loggingEvent) { 15 | if (StringUtils.isNotBlank(loggingEvent.getLoggerName()) 16 | && loggingEvent.getLoggerName().contains("com.zyc")) { 17 | DBObject bson = (DBObject)this.bsonifier.bsonify(loggingEvent); 18 | this.append(bson); 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /src/main/scala/com/zyc/netty/HttpBaseHandler.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.netty 2 | 3 | import java.net.URLDecoder 4 | 5 | import io.netty.buffer.Unpooled 6 | import 
io.netty.handler.codec.http.{DefaultFullHttpResponse, HttpResponseStatus, HttpVersion} 7 | import io.netty.util.AsciiString 8 | 9 | trait HttpBaseHandler { 10 | val ContentType = AsciiString.cached("Content-Type") 11 | val ContentLength = AsciiString.cached("Content-Length") 12 | val Connection = AsciiString.cached("Connection") 13 | val KeepAlive = AsciiString.cached("keep-alive") 14 | val noParam="{\"code\":500,\"msg\":\"no params\"}" 15 | val noService="{\"code\":500,\"msg\":\"no match reportService\"}" 16 | val noUri="{\"code\":500,\"msg\":\"request uri is wrong\"}" 17 | val unknownParam="{\"code\":500,\"msg\":\"unknown cmd\"}" 18 | val cmdOk="{\"code\":200,\"msg\":\"command executed\"}" 19 | val execErr="{\"code\":500,\"msg\":\"command execute error\"}" 20 | val serverErr="{\"code\":500,\"msg\":\"server error\"}" 21 | val cacheIsNull="{\"code\":501,\"msg\":\"model cache is null\"}" 22 | val chartSet:String="utf-8" 23 | 24 | 25 | def defaultResponse(respContent: String):DefaultFullHttpResponse={ 26 | val response = new DefaultFullHttpResponse( 27 | HttpVersion.HTTP_1_1, 28 | HttpResponseStatus.OK, 29 | Unpooled.wrappedBuffer(respContent.getBytes()) 30 | ) 31 | response.headers().set(ContentType, "application/json") 32 | response.headers().setInt(ContentLength, response.content().readableBytes()) 33 | response 34 | } 35 | def parseGetParam(uri: String):Map[String,String]={ 36 | var map=Map.empty[String,String] 37 | val array=URLDecoder.decode(uri,chartSet).split("\\?") 38 | if(array.length>1){ 39 | val params=array.apply(1).split("&").map(_.trim) 40 | params.foreach(str=>{ 41 | val strArr=str.split("=") 42 | if(strArr.length>1){ 43 | map+=(strArr.apply(0)->strArr.apply(1)) 44 | }else if(strArr.length>0){ 45 | //有可能出现等号后面什么也没有 46 | map+=(strArr.apply(0)->"") 47 | } 48 | }) 49 | } 50 | map 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/netty/NettyServer.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.netty 2 | 3 | import com.typesafe.config.ConfigFactory 4 | import io.netty.bootstrap.ServerBootstrap 5 | import io.netty.channel.nio.NioEventLoopGroup 6 | import io.netty.channel.socket.SocketChannel 7 | import io.netty.channel.socket.nio.NioServerSocketChannel 8 | import io.netty.channel.{ChannelInitializer, ChannelOption} 9 | import io.netty.handler.codec.http._ 10 | import org.slf4j.LoggerFactory 11 | 12 | class NettyServer{ 13 | val logger=LoggerFactory.getLogger(this.getClass) 14 | val configLoader=ConfigFactory.load("application.conf") 15 | def start() = { 16 | val serverConf=configLoader.getConfig("server") 17 | val host=serverConf.getString("host") 18 | val port=serverConf.getInt("port") 19 | 20 | logger.info("netty server启动") 21 | this.bind(host,port) 22 | } 23 | 24 | 25 | def bind(host: String, port: Int): Unit = { 26 | //配置服务端线程池组 27 | //用于服务器接收客户端连接 28 | val bossGroup = new NioEventLoopGroup(1) 29 | //用户进行SocketChannel的网络读写 30 | val workerGroup = new NioEventLoopGroup(10) 31 | 32 | try { 33 | //是Netty用户启动NIO服务端的辅助启动类,降低服务端的开发复杂度 34 | val bootstrap = new ServerBootstrap() 35 | //将两个NIO线程组作为参数传入到ServerBootstrap 36 | bootstrap.group(bossGroup, workerGroup) 37 | //创建NioServerSocketChannel 38 | .channel(classOf[NioServerSocketChannel]) 39 | //绑定I/O事件处理类 40 | .childHandler(new ChannelInitializer[SocketChannel] { 41 | override def initChannel(ch: SocketChannel): Unit = { 42 | /*ch.pipeline().addLast(new HttpResponseEncoder()); 43 | 
ch.pipeline().addLast(new HttpRequestDecoder());*/ 44 | ch.pipeline().addLast(new HttpServerCodec()); 45 | //HttpObjectAggregator解码器 将多个消息对象转换为full 46 | ch.pipeline().addLast("aggregator", new HttpObjectAggregator(512*1024)) 47 | //压缩 48 | ch.pipeline().addLast("deflater", new HttpContentCompressor()); 49 | ch.pipeline().addLast(new HttpServerHandler()); 50 | } 51 | }).option[Integer](ChannelOption.SO_BACKLOG, 128) 52 | .childOption[java.lang.Boolean](ChannelOption.SO_KEEPALIVE, true) 53 | //绑定端口,调用sync方法等待绑定操作完成 54 | val channelFuture = bootstrap.bind(port).sync() 55 | //等待服务关闭 56 | channelFuture.channel().closeFuture().sync() 57 | } finally { 58 | //优雅的退出,释放线程池资源 59 | bossGroup.shutdownGracefully() 60 | workerGroup.shutdownGracefully() 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/util/HttpUtil.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.base.util 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.apache.http.NameValuePair 6 | import org.apache.http.client.entity.UrlEncodedFormEntity 7 | import org.apache.http.client.methods.{HttpDelete, HttpGet, HttpPost, HttpPut} 8 | import org.apache.http.client.utils.URIBuilder 9 | import org.apache.http.entity.StringEntity 10 | import org.apache.http.impl.client.{CloseableHttpClient, HttpClientBuilder} 11 | import org.apache.http.message.BasicNameValuePair 12 | 13 | import scala.collection.mutable.ArrayBuffer 14 | import scala.io.Source 15 | 16 | object HttpUtil { 17 | /** 18 | * 超时时间 单位:毫秒 19 | */ 20 | val connTimout:Long=5000 21 | def HttpClient(): CloseableHttpClient ={ 22 | val httpClient:CloseableHttpClient = HttpClientBuilder.create() 23 | .setConnectionTimeToLive(connTimout, TimeUnit.MILLISECONDS) 24 | .build() 25 | httpClient 26 | } 27 | 28 | /** 29 | * 30 | * @param addr 接口地址 31 | * @param param 请求参数 32 | * @return 33 | */ 34 | def get(addr:String,param: Seq[(String,String)]):String={ 35 | val builder=new URIBuilder(addr) 36 | if(param.nonEmpty){ 37 | param.foreach(r=>{ 38 | builder.addParameter(r._1,r._2) 39 | }) 40 | } 41 | val client=HttpClient() 42 | val httpResponse = client.execute(new HttpGet(builder.build())) 43 | val entity = httpResponse.getEntity() 44 | var content = "" 45 | if (entity != null) { 46 | val inputStream = entity.getContent() 47 | content = Source.fromInputStream(inputStream).getLines.mkString 48 | inputStream.close 49 | } 50 | client.close() 51 | content 52 | } 53 | 54 | def put(addr:String,param: Seq[(String,String)]):String={ 55 | val builder=new URIBuilder(addr) 56 | if(param.nonEmpty){ 57 | param.foreach(r=>{ 58 | builder.addParameter(r._1,r._2) 59 | }) 60 | } 61 | val client=HttpClient() 62 | println(builder.build().toString) 63 | val httpResponse = client.execute(new HttpPut(builder.build())) 64 | val entity = httpResponse.getEntity() 65 | var content = "" 66 | if (entity != null) { 67 | val inputStream = entity.getContent() 68 | content = Source.fromInputStream(inputStream).getLines.mkString 69 | inputStream.close 70 | } 71 | client.close() 72 | content 73 | } 74 | 75 | def delete(addr:String,param: Seq[(String,String)]):String={ 76 | val builder=new URIBuilder(addr) 77 | if(param.nonEmpty){ 78 | param.foreach(r=>{ 79 | builder.addParameter(r._1,r._2) 80 | }) 81 | } 82 | val client=HttpClient() 83 | println(builder.build().toString) 84 | val httpResponse = client.execute(new HttpDelete(builder.build())) 85 | val entity = httpResponse.getEntity() 86 | 
var content = "" 87 | if (entity != null) { 88 | val inputStream = entity.getContent() 89 | content = Source.fromInputStream(inputStream).getLines.mkString 90 | inputStream.close 91 | } 92 | client.close() 93 | content 94 | } 95 | 96 | def post(addr:String,param: Seq[(String,String)]):String={ 97 | val req=new HttpPost(addr) 98 | val listParms=new ArrayBuffer[NameValuePair]() 99 | param.foreach(r=>{ 100 | listParms+=new BasicNameValuePair(r._1,r._2) 101 | }) 102 | import scala.collection.JavaConverters._ 103 | val entity=new UrlEncodedFormEntity(listParms.toList.asJava,"utf-8") 104 | req.setEntity(entity) 105 | val client=HttpClient() 106 | val httpResponse = client.execute(req) 107 | val resEntity = httpResponse.getEntity() 108 | var content = "" 109 | if (resEntity != null) { 110 | val inputStream = resEntity.getContent() 111 | content = Source.fromInputStream(inputStream).getLines.mkString 112 | inputStream.close 113 | } 114 | client.close() 115 | content 116 | } 117 | 118 | def postJson(addr:String,json:String):String={ 119 | val req=new HttpPost(addr) 120 | val entity=new StringEntity(json,"utf-8") 121 | req.setEntity(entity) 122 | val client=HttpClient() 123 | val httpResponse = client.execute(req) 124 | val resEntity = httpResponse.getEntity() 125 | var content = "" 126 | if (resEntity != null) { 127 | val inputStream = resEntity.getContent() 128 | content = Source.fromInputStream(inputStream).getLines.mkString 129 | inputStream.close 130 | } 131 | client.close() 132 | content 133 | } 134 | 135 | def main(args: Array[String]): Unit = { 136 | println(get("http://127.0.0.1:60001/test",Seq("asdddddd"->"d你是谁啊"))) 137 | } 138 | 139 | 140 | } 141 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/util/JsonSchemaBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.base.util 2 | 3 | import java.util.regex.Pattern 4 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | object JsonSchemaBuilder { 8 | 9 | 10 | final val columnSplitPattern = Pattern.compile("\\s*,\\s*") 11 | private final val fieldSplitPattern = Pattern.compile("\\.") 12 | private final val fieldPattern = Pattern.compile("([\\w\\.]+)(?:\\s+as\\s+\\w+)?") 13 | private final val DEFAULT_NULLABLE=true 14 | 15 | 16 | def getJsonSchema(schema: String): StructType = { 17 | getSchemaByFieldsList(columnSplitPattern.split(schema).map(getFieldList).toList) 18 | } 19 | 20 | private def getFieldList(singleField: String): List[String] = { 21 | val fieldMatch = fieldPattern.matcher(singleField) 22 | if (fieldMatch.matches()) { 23 | val fieldSource = fieldMatch.group(1) 24 | val fieldArray = fieldSplitPattern.split(fieldSource) 25 | fieldArray.toList 26 | } else { 27 | throw new IllegalArgumentException(s"field format error:$singleField ,we need parent.children(as aliasName)") 28 | } 29 | } 30 | 31 | private def getSchemaByFieldsList(fieldsList: List[List[String]]): StructType = { 32 | fieldsList.map(getStrcutType).reduce(mergeStructType) 33 | } 34 | 35 | private def getStrcutType(fields: List[String]): StructType = { 36 | fields match { 37 | case head :: Nil ⇒ StructType(StructField(head, StringType, DEFAULT_NULLABLE) :: Nil) 38 | case head :: tail ⇒ StructType(StructField(head, getStrcutType(tail), DEFAULT_NULLABLE) :: Nil) 39 | } 40 | } 41 | 42 | private def mergeStructType(left: StructType, right: StructType): StructType = { 43 | val newFields = 
ArrayBuffer.empty[StructField] 44 | val leftFields = left.fields 45 | val rightFields = right.fields 46 | val rightMapped = fieldsMap(rightFields) 47 | leftFields.foreach { 48 | case leftField@StructField(leftName, leftType, leftNullable, _) => 49 | rightMapped.get(leftName) 50 | .map { 51 | case rightField@StructField(_, rightType, rightNullable, _) => 52 | leftField.copy( 53 | dataType = mergeStructType(leftType.asInstanceOf[StructType], rightType.asInstanceOf[StructType]), 54 | nullable = leftNullable || rightNullable) 55 | } 56 | .orElse(Some(leftField)) 57 | .foreach(newFields += _) 58 | } 59 | 60 | val leftMapped = fieldsMap(leftFields) 61 | rightFields 62 | .filterNot(f => leftMapped.get(f.name).nonEmpty) 63 | .foreach(newFields += _) 64 | StructType(newFields) 65 | } 66 | 67 | private def fieldsMap(fields: Array[StructField]): Map[String, StructField] = { 68 | import scala.collection.breakOut 69 | fields.map(s ⇒ (s.name, s))(breakOut) 70 | } 71 | 72 | } -------------------------------------------------------------------------------- /src/main/scala/com/zyc/util/JsonUtil.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.base.util 2 | 3 | import java.sql.Timestamp 4 | 5 | 6 | 7 | import org.json4s._ 8 | import org.json4s.native.JsonMethods._ 9 | import org.json4s.native.Serialization 10 | import org.json4s.{JValue, _} 11 | 12 | /** 13 | * json解析 工具类 14 | * 基于scala json4s 15 | */ 16 | object JsonUtil { 17 | case object TimestampSerializer extends CustomSerializer[java.sql.Timestamp](format => ( { 18 | case _ => null 19 | }, { 20 | case ts: Timestamp =>JLong(ts.getTime) 21 | }) 22 | ) 23 | implicit val formats = DefaultFormats 24 | 25 | /** 26 | * json 转 map 27 | * @param content 28 | * @return 29 | */ 30 | def jsonToMap(content: String):Map[String,Any]={ 31 | parse(content).extract[Map[String,Any]] 32 | } 33 | 34 | /** 35 | * json 转对象 36 | * @param content 37 | * @tparam T 38 | * @return 39 | */ 40 | def jsonToObj[T: Manifest](content:String):T={ 41 | org.json4s.jackson.JsonMethods.parse(content).extract[T] 42 | } 43 | 44 | def jsonToObj[T: Manifest](content: AnyRef): T = { 45 | org.json4s.jackson.JsonMethods.parse(toJson(content)).extract[T] 46 | } 47 | 48 | /** 49 | * json 转 list 50 | * @param content 51 | * @tparam T 52 | * @return 53 | */ 54 | def jsonToList[T:Manifest](content:String):List[T]={ 55 | org.json4s.jackson.JsonMethods.parse(content).extract[List[T]] 56 | } 57 | 58 | /** 59 | * 对象转json 60 | * @param obj 61 | * @return 62 | */ 63 | def toJson(obj:AnyRef):String={ 64 | try{ 65 | Serialization.write(obj) 66 | }catch { 67 | case ex:Exception=>throw new Exception("转json异常",ex.getCause) 68 | } 69 | } 70 | 71 | def toJson2(obj: List[Map[String,Any]]): String = { 72 | Serialization.write(obj) 73 | } 74 | sealed trait QueryCacheClusterData 75 | case class QueryCacheClusterSuccess(total: Int, totalPage: Int, rows: List[Map[String, Any]], columns: List[Map[String, Any]]) extends QueryCacheClusterData 76 | def main(args: Array[String]): Unit = { 77 | // val list=Seq(Map("A"->"B"),Map("A"->Map("b"->"asd")),Map("A"->"B")) 78 | // val str=toJson(list) 79 | val 
str2="{\"cityCode\":\"43019884\",\"cityName\":\"凤凰县支行\",\"branchList\":[{\"branchCode\":\"43020126\",\"branchName\":\"凤凰县吉信镇营业所\",\"level\":\"4\"},{\"branchCode\":\"43020138\",\"branchName\":\"凤凰县禾库镇营业所\",\"level\":\"4\"},{\"branchCode\":\"43020140\",\"branchName\":\"凤凰县腊尔山镇营业所\",\"level\":\"4\"},{\"branchCode\":\"43020153\",\"branchName\":\"凤凰县凤凰路营业所\",\"level\":\"4\"},{\"branchCode\":\"43020165\",\"branchName\":\"凤凰县虹桥路支行\",\"level\":\"4\"},{\"branchCode\":\"43020177\",\"branchName\":\"凤凰县木江坪镇营业所\",\"level\":\"4\"},{\"branchCode\":\"4399935Q\",\"branchName\":\"凤凰县支行\",\"level\":\"4\"},{\"branchCode\":\"43020189\",\"branchName\":\"凤凰县山江镇营业所\",\"level\":\"4\"},{\"branchCode\":\"43020191\",\"branchName\":\"凤凰县茶田镇营业所\",\"level\":\"4\"},{\"branchCode\":\"43020203\",\"branchName\":\"凤凰县新场乡营业所\",\"level\":\"4\"},{\"branchCode\":\"43020215\",\"branchName\":\"凤凰县廖家桥镇营业所\",\"level\":\"4\"},{\"branchCode\":\"43020227\",\"branchName\":\"凤凰县阿拉镇营业所\",\"level\":\"4\"},{\"branchCode\":\"43022707\",\"branchName\":\"凤凰县柳薄乡营业所\",\"level\":\"4\"},{\"branchCode\":\"43022729\",\"branchName\":\"凤凰县米良乡营业所\",\"level\":\"4\"}]}" 80 | // println(str) 81 | // println(jsonToList[Map[String,Any]](str)) 82 | val a="[{\"ORGAN_ID\":\"35000011\",\"ORGAN_NAME\":\"aaaa\",\"QUOTA\":77178.16113114754}]" 83 | 84 | println(jsonToMap(str2)) 85 | 86 | val total=100 87 | val totalPage=10 88 | val rows=List(Map("ORGAN_ID"->"35000011","ORGAN_NAME"->"aaaa","QUOTA"->77178.16113114754)) 89 | QueryCacheClusterSuccess(total,totalPage,rows,rows) 90 | println(QueryCacheClusterSuccess) 91 | println(Serialization.write(QueryCacheClusterSuccess(total,totalPage,rows,rows))) 92 | } 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/util/StringDefault.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.base.util 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | 5 | object StringDefault { 6 | 7 | /** 8 | * 判断value 是否为空,如果为空返回默认值空串 9 | * @param value 10 | * @param default 11 | */ 12 | def getDefault(value:String,default:String=""): String ={ 13 | if(StringUtils.isEmpty(value)) default else value 14 | } 15 | 16 | def isEmpty(value:String): Boolean ={ 17 | StringUtils.isEmpty(value) 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/ZdhDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh 2 | 3 | import java.util 4 | import java.util.Properties 5 | 6 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 7 | import org.apache.spark.sql.types.StructField 8 | import org.apache.spark.sql.functions._ 9 | import org.slf4j.LoggerFactory 10 | 11 | trait ZdhDataSources { 12 | 13 | val loggerz=LoggerFactory.getLogger(this.getClass) 14 | /** 15 | * 获取schema 16 | * 17 | * @param spark 18 | * @return 19 | */ 20 | def getSchema(spark: SparkSession, options: Map[String, String])(implicit dispatch_task_id: String): Array[StructField] = ??? 
21 | 22 | /** 23 | * 24 | * @param spark 25 | * @param dispatchOption 26 | * @param inPut 27 | * @param inputOptions 28 | * @param inputCondition 29 | * @param inputCols 30 | * @param duplicateCols 去重字段 31 | * @param outPut 32 | * @param outputOptionions 33 | * @param outputCols 34 | * @param sql 35 | * @param dispatch_task_id 36 | * @return 37 | */ 38 | def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], inputCondition: String, 39 | inputCols: Array[String],duplicateCols:Array[String], 40 | outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = ??? 41 | 42 | 43 | def filter(spark:SparkSession,df:DataFrame,inputCondition: String,duplicateCols:Array[String]): DataFrame ={ 44 | var df_tmp=df 45 | if(inputCondition!=null && !inputCondition.trim.equals("")){ 46 | df_tmp=df.filter(inputCondition) 47 | } 48 | if(duplicateCols!=null && duplicateCols.size>0){ 49 | return df_tmp.dropDuplicates(duplicateCols) 50 | } 51 | return df_tmp 52 | 53 | } 54 | /** 55 | * 处理逻辑 56 | * 57 | * @param spark 58 | * @param df 59 | * @param select 60 | * @param dispatch_task_id 61 | * @return 62 | */ 63 | def process(spark: SparkSession, df: DataFrame, select: Array[Column], zdh_etl_date: String)(implicit dispatch_task_id: String): DataFrame = ??? 64 | 65 | 66 | def merge(spark: SparkSession, df: DataFrame, options:Map[String,String])(implicit dispatch_task_id: String): DataFrame ={ 67 | 68 | loggerz.info("[数据采集]:[MERGE]:合并分区,可配置merge 参数自定义分区个数,默认不合并") 69 | var df_tmp = df 70 | //合并小文件操作 71 | if (!options.getOrElse("merge", "-1").equals("-1")) { 72 | df_tmp = df.repartition(options.getOrElse("merge", "200").toInt) 73 | } 74 | return df_tmp 75 | } 76 | 77 | 78 | /** 79 | * 写入数据总入口 80 | * 81 | * @param spark 82 | * @param df 83 | * @param options 84 | * @param sql 85 | * @param dispatch_task_id 86 | */ 87 | def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = ??? 
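  // Rough usage sketch (assumption; object and variable names below are illustrative, the
  // actual driver is the DataSources handler, not shown in this file): an implementation is
  // driven as read -> column mapping -> write, roughly:
  //   implicit val dispatch_task_id: String = "task-001"          // hypothetical task id
  //   val src    = JdbcDataSources                                 // any object extending ZdhDataSources
  //   val df     = src.getDS(spark, dispatchOpt, "jdbc", inOpt, cond, cols, dupCols,
  //                          "hive", outOpt, outCols, sql)
  //   val mapped = src.process(spark, df, selectCols, zdh_etl_date)
  //   src.writeDS(spark, mapped, outOpt, sql)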
88 | 89 | 90 | def dataQuality(spark: SparkSession, df: DataFrame, error_rate: String, primary_columns: String, column_size: Int, rows_range: String, column_is_null: Seq[Column], column_length: Seq[Column],column_regex: Seq[Column]): Map[String, String] = { 91 | 92 | val report = new util.HashMap[String, String]() 93 | report.put("result", "通过") 94 | import scala.collection.JavaConverters._ 95 | try { 96 | import spark.implicits._ 97 | val total_count = df.count() 98 | report.put("总行数", total_count.toString) 99 | var error_count = 0L; 100 | //表级质量检测 101 | //判断主键是否重复 102 | if (!primary_columns.trim.equals("")) { 103 | val cols = primary_columns.split(",").map(col(_)) 104 | val new_count = df.select(concat_ws("_", cols: _*) as "primary_key") 105 | .map(f => (f.getAs[String]("primary_key"), 1)).rdd.reduceByKey(_ + _) 106 | val primary_keys = new_count.filter(f => f._2 > 1) 107 | val primary_count = primary_keys.count() 108 | if (primary_count > 0) { 109 | error_count = error_count + primary_count 110 | loggerz.info("存在主键重复") 111 | report.put("primary_columns", "存在主键重复") 112 | report.put("result", "不通过") 113 | } else { 114 | loggerz.info("主键检测通过") 115 | report.put("primary_columns", "主键检测通过") 116 | } 117 | } 118 | 119 | //判断 120 | if (column_size > 0) { 121 | if (df.columns.size != column_size) { 122 | report.put("column_size", "解析字段个数不对") 123 | loggerz.info("解析字段个数不对") 124 | report.put("result", "不通过") 125 | } else { 126 | report.put("column_size", "字段个数检测通过") 127 | loggerz.info("字段个数检测通过") 128 | } 129 | } 130 | 131 | //判断行数 132 | if (!rows_range.equals("")) { 133 | var start_count = 0 134 | var end_count = 0 135 | if(rows_range.contains("-")){ 136 | start_count = rows_range.split("-")(0).toInt 137 | end_count = rows_range.split("-")(1).toInt 138 | }else{ 139 | start_count=rows_range.toInt 140 | end_count=start_count 141 | } 142 | 143 | if (total_count < start_count || total_count > end_count) { 144 | loggerz.info("数据行数异常") 145 | report.put("rows_range", "数据行数异常") 146 | report.put("result", "不通过") 147 | } else { 148 | report.put("rows_range", "数据行数检测通过") 149 | loggerz.info("数据行数检测通过") 150 | } 151 | } 152 | 153 | //字段级检测 154 | //是否为空检测, 155 | //col("").isNull 156 | if (column_is_null.size > 0) { 157 | val filter_columns = column_is_null.tail.foldLeft(column_is_null.head)((x, y) => { 158 | x or y 159 | }) 160 | 161 | val null_count = df.filter(filter_columns).count() 162 | if (null_count > 0) { 163 | error_count = error_count + null_count 164 | report.put("column_is_null", "存在字段为空,但是此字段不允许为空") 165 | report.put("result", "不通过") 166 | loggerz.info("存在字段为空,但是此字段不允许为空") 167 | } else { 168 | report.put("column_is_null", "字段是否为空,检测通过") 169 | loggerz.info("字段是否为空,检测通过") 170 | } 171 | } 172 | 173 | 174 | // 175 | //length(col(""))==长度 176 | if (column_length.size > 0) { 177 | val filter_column_length = column_length.tail.foldLeft(column_length.head)((x, y) => { 178 | x or y 179 | }) 180 | 181 | val length_count = df.filter(filter_column_length).count() 182 | if (length_count > 0) { 183 | error_count = error_count + length_count 184 | report.put("column_length", "存在字段长度不满足") 185 | report.put("result", "不通过") 186 | loggerz.info("存在字段长度不满足") 187 | } else { 188 | loggerz.info("字段长度检测通过") 189 | report.put("column_length", "字段长度检测通过") 190 | } 191 | } 192 | 193 | if (column_regex.size > 0) { 194 | 195 | val c_r=column_regex.map(f=>f==="false") 196 | val filter_column_regex = c_r.tail.foldLeft(c_r.head)((x, y) => { 197 | x or y 198 | }) 199 | val regex_count = df.filter(filter_column_regex).count() 200 | if 
(regex_count > 0) { 201 | error_count = error_count + regex_count 202 | report.put("column_regex", "正则判断不通过") 203 | report.put("result", "不通过") 204 | loggerz.info("存在正则不满足") 205 | } else { 206 | loggerz.info("正则判断检测通过") 207 | report.put("column_regex", "正则判断检测通过") 208 | } 209 | } 210 | 211 | loggerz.info("error_count:" + error_count) 212 | 213 | 214 | 215 | 216 | var error_num = total_count * error_rate.toDouble 217 | if (error_num < 1) { 218 | loggerz.info("容错的条数至少是1条") 219 | error_num = 1 220 | } 221 | if (error_count <= error_num && error_count!=0) { 222 | loggerz.info("在指定容错率范围内") 223 | report.put("result", "容错率内") 224 | } 225 | report.asScala.toMap[String, String] 226 | } catch { 227 | case e: Exception => { 228 | e.printStackTrace() 229 | throw e 230 | } 231 | } 232 | 233 | 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/CassandraDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.zdh.ZdhDataSources 4 | import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession} 5 | import org.slf4j.LoggerFactory 6 | 7 | object CassandraDataSources extends ZdhDataSources { 8 | 9 | val logger = LoggerFactory.getLogger(this.getClass) 10 | 11 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 12 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 13 | 14 | try { 15 | logger.info("[数据采集]:输入源为[CASSANDRA],开始匹配对应参数") 16 | val url: String = inputOptions.getOrElse("url", "").toString 17 | if (url.trim.equals("")) { 18 | throw new Exception("[zdh],cassandra数据源读取:url为空") 19 | } 20 | 21 | //keyspace.table 22 | val table: String = inputOptions.getOrElse("paths", "").toString 23 | if (table.trim.equals("") || !table.contains(".")) { 24 | throw new Exception("[zdh],cassandra数据源读取:paths为空,或者没有指定keyspace") 25 | } 26 | 27 | spark.conf.set("spark.cassandra.connection.host", url) 28 | if (url.contains(":")) { 29 | spark.conf.set("spark.cassandra.connection.host", url.split(":")(0)) 30 | spark.conf.set("spark.cassandra.connection.port", url.split(":")(1)) 31 | } 32 | 33 | var df = spark 34 | .read 35 | .format("org.apache.spark.sql.cassandra") 36 | .options(inputOptions) 37 | .option("table", table.split("\\.")(1)) 38 | .option("keyspace", table.split("\\.")(0)) 39 | .load() 40 | 41 | filter(spark,df,inputCondition,duplicateCols) 42 | 43 | } catch { 44 | case ex: Exception => { 45 | ex.printStackTrace() 46 | logger.error("[数据采集]:[CASSANDRA]:[WRITE]:[ERROR]:" + ex.getMessage.replace("\"","'")) 47 | throw ex 48 | } 49 | } 50 | } 51 | 52 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 53 | try { 54 | logger.info("[数据采集]:[CASSANDRA]:[SELECT]") 55 | logger.debug("[数据采集]:[CASSANDRA]:[SELECT]:" + select.mkString(",")) 56 | if(select==null || select.isEmpty){ 57 | logger.debug("[数据采集]:[CASSANDRA]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 58 | return df 59 | } 60 | df.select(select: _*) 61 | } catch { 62 | case ex: Exception => { 63 | logger.error("[数据采集]:[CASSANDRA]:[SELECT]:[ERROR]" + ex.getMessage.replace("\"","'")) 64 | throw ex 65 | } 66 | } 67 | } 68 | 
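  // Hedged example of the write options this source reads below (values are illustrative only):
  //   val opts = Map(
  //     "url"   -> "127.0.0.1:9042",        // host or host:port, mapped to spark.cassandra.connection.*
  //     "paths" -> "my_keyspace.my_table",  // keyspace.table
  //     "model" -> "append"                 // overwrite | append | errorifexists | ignore
  //   )
  //   CassandraDataSources.writeDS(spark, df, opts, "")("task-001")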
69 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 70 | try { 71 | logger.info("[数据采集]:[CASSANDRA]:[WRITE]:[options]:" + options.mkString(",")) 72 | 73 | //默认是append 74 | val model = options.getOrElse("model", "").toString.toLowerCase match { 75 | case "overwrite" => SaveMode.Overwrite 76 | case "append" => SaveMode.Append 77 | case "errorifexists" => SaveMode.ErrorIfExists 78 | case "ignore" => SaveMode.Ignore 79 | case _ => SaveMode.Append 80 | } 81 | 82 | val url: String = options.getOrElse("url", "").toString 83 | if (url.trim.equals("")) { 84 | throw new Exception("[zdh],cassandra数据源读取:url为空") 85 | } 86 | 87 | spark.conf.set("spark.cassandra.connection.host", url) 88 | if (url.contains(":")) { 89 | spark.conf.set("spark.cassandra.connection.host", url.split(":")(0)) 90 | spark.conf.set("spark.cassandra.connection.port", url.split(":")(1)) 91 | } 92 | 93 | val table: String = options.getOrElse("paths", "").toString 94 | if (table.trim.equals("") || !table.contains(".")) { 95 | throw new Exception("[zdh],cassandra数据源读取:paths为空,或者没有指定keyspace") 96 | } 97 | 98 | //合并小文件操作 99 | var df_tmp = merge(spark,df,options) 100 | 101 | try { 102 | df_tmp.write 103 | .format("org.apache.spark.sql.cassandra") 104 | .mode(model) 105 | .options(options) 106 | .option("table", table.split("\\.")(1)) 107 | .option("keyspace", table.split("\\.")(0)) 108 | .save() 109 | } catch { 110 | case ex: Exception => { 111 | import com.datastax.spark.connector._ 112 | if (ex.getMessage.replace("\"","'").contains("any similarly named keyspace and table pairs")) { 113 | logger.info("[数据采集]:[CASSANDRA]:[WRITE]:[WARN]:表或者键空间不存在,将进行自动创建") 114 | df_tmp.createCassandraTable(table.split("\\.")(0),table.split("\\.")(1)) 115 | df_tmp.write 116 | .format("org.apache.spark.sql.cassandra") 117 | .mode(model) 118 | .options(options) 119 | .option("table", table.split("\\.")(1)) 120 | .option("keyspace", table.split("\\.")(0)) 121 | .save() 122 | } 123 | } 124 | } 125 | 126 | } catch { 127 | case ex: Exception => { 128 | 129 | ex.printStackTrace() 130 | logger.error("[数据采集]:[CASSANDRA]:[WRITE]:[ERROR]:" + ex.getMessage.replace("\"","'")) 131 | throw ex 132 | } 133 | } 134 | } 135 | 136 | } 137 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/DataFactorySources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | object DataFactorySources { 4 | 5 | def main(args: Array[String]): Unit = { 6 | 7 | //解析字段 8 | 9 | //地址类型 10 | if(""==""){ 11 | 12 | //角色 13 | if("role"==""){ 14 | 15 | if("生成方式"==""){ 16 | 17 | } 18 | 19 | } 20 | } 21 | 22 | 23 | 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/DataWareHouseSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.util.Properties 4 | 5 | import com.zyc.base.util.JsonUtil 6 | import com.zyc.zdh.ZdhDataSources 7 | import org.apache.spark.sql.types.StructField 8 | import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession} 9 | import org.slf4j.LoggerFactory 10 | case class desc_table_class(col_name:String,data_type:String,comment:String) 11 | 12 | /** 13 | * 本地数据仓库分析 14 | */ 15 | object DataWareHouseSources extends ZdhDataSources { 16 | 17 | val 
logger = LoggerFactory.getLogger(this.getClass) 18 | 19 | /** 20 | * 获取数据源schema 21 | * 22 | * @param spark 23 | * @param options 24 | * @return 25 | */ 26 | override def getSchema(spark: SparkSession, options: Map[String, String])(implicit dispatch_task_id: String): Array[StructField] = { 27 | logger.info("[数据采集]:[DATAWAREHOUSE]:[SCHEMA]:" + options.mkString(",")) 28 | null 29 | } 30 | 31 | 32 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 33 | inputCondition: String, inputCols: Array[String], duplicateCols: Array[String], outPut: String, outputOptionions: Map[String, String], 34 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 35 | try { 36 | logger.info("[数据采集]:输入源为[DATAWAREHOUSE],开始匹配对应参数") 37 | 38 | logger.info("[数据采集]:[DATAWAREHOUSE]:[READ]:" + inputOptions.mkString(",")) 39 | logger.info("[数据采集]:[DATAWAREHOUSE]:执行语句:"+sql) 40 | if (!sql.trim.equals("")) { 41 | val exe_sql_ary = sql.split(";\r\n|;\n") 42 | var result: DataFrame = null 43 | exe_sql_ary.foreach(sql_t => { 44 | if (!sql_t.trim.equals("")) 45 | result = spark.sql(sql_t) 46 | }) 47 | 48 | result 49 | } else { 50 | logger.info("[数据采集]:[DATAWAREHOUSE]:执行语句:为空") 51 | null 52 | } 53 | } catch { 54 | case ex: Exception => { 55 | logger.error("[数据采集]:[DATAWAREHOUSE]:[READ]:" + "[ERROR]:" + ex.getMessage.replace("\"","'"), "error") 56 | throw ex 57 | } 58 | } 59 | } 60 | 61 | /** 62 | * 读取数据源之后的字段映射 63 | * 64 | * @param spark 65 | * @param df 66 | * @param select 67 | * @return 68 | */ 69 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column], zdh_etl_date: String)(implicit dispatch_task_id: String): DataFrame = { 70 | try { 71 | logger.info("[数据采集]:[DATAWAREHOUSE]:[SELECT]") 72 | logger.debug("[数据采集]:[DATAWAREHOUSE]:[SELECT]:" + select.mkString(",")) 73 | if(select==null || select.isEmpty){ 74 | logger.debug("[数据采集]:[DATAWAREHOUSE]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 75 | return df 76 | } 77 | df.select(select:_*) 78 | } catch { 79 | case ex: Exception => { 80 | logger.error("[数据采集]:[DATAWAREHOUSE]:[SELECT]:[ERROR]:" + ex.getMessage.replace("\"","'"), "error") 81 | throw ex 82 | } 83 | } 84 | 85 | } 86 | 87 | def show_databases(spark: SparkSession): String = { 88 | import spark.implicits._ 89 | val databaseNames = spark.sql("show databases").select("databaseName").as[String].collect() 90 | 91 | JsonUtil.toJson(databaseNames) 92 | } 93 | 94 | def show_tables(spark: SparkSession, databaseName: String): String = { 95 | import spark.implicits._ 96 | spark.sql("use " + databaseName) 97 | val tableNames = spark.sql("show tables").select("tableName").as[String].collect() 98 | 99 | JsonUtil.toJson(tableNames) 100 | } 101 | 102 | def desc_table(spark: SparkSession, table: String): String = { 103 | import spark.implicits._ 104 | 105 | import org.apache.spark.sql.functions._ 106 | val tableCols = spark.sql("desc "+table).as[desc_table_class].collect() 107 | 108 | JsonUtil.toJson(tableCols) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/DownDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.io._ 4 | 5 | import com.zyc.base.util.JsonSchemaBuilder 6 | import com.zyc.zdh.ZdhDataSources 7 | import org.apache.poi.xssf.streaming.SXSSFWorkbook 8 | import org.apache.spark.sql._ 9 | 
import org.apache.spark.sql.functions._ 10 | import org.slf4j.LoggerFactory 11 | 12 | object DownDataSources extends ZdhDataSources { 13 | 14 | val logger = LoggerFactory.getLogger(this.getClass) 15 | 16 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 17 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 18 | 19 | throw new Exception("[数据采集]:[外部下载]:[READ]:不支持外部下载读取数据源") 20 | 21 | } 22 | 23 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 24 | throw new Exception("[数据采集]:[外部下载]:[SELECT]:不支持外部下载读取数据源") 25 | } 26 | 27 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 28 | try { 29 | logger.info("[数据采集]:[外部下载]:[WRITE]:[options]:" + options.mkString(",")) 30 | var url = options.getOrElse("url", "") 31 | var port = "22" 32 | if (url.contains(":")) { 33 | port = url.split(":")(1) 34 | url = url.split(":")(0) 35 | } 36 | 37 | val paths = options.getOrElse("paths", "") 38 | val sep = options.getOrElse("sep", ",") 39 | 40 | val username = options.getOrElse("user", "") 41 | val password = options.getOrElse("password", "") 42 | 43 | val filtType = options.getOrElse("fileType", "csv").toString.toLowerCase 44 | 45 | val root_path = options.getOrElse("root_path", "") 46 | //合并小文件操作 47 | var df_tmp = merge(spark,df,options) 48 | 49 | if (!url.equals("")) { 50 | logger.info("[数据采集]:[外部下载]:[WRITE]:输出到sftp数据源,文件路径:" + root_path + "/" + paths+".csv") 51 | df_tmp.write. 52 | format("com.springml.spark.sftp"). 53 | options(options). 54 | option("host", url). 55 | option("port", port). 56 | option("username", username). 57 | option("password", password). 58 | option("fileType", filtType). 59 | option("delimiter", sep). 
60 | save(root_path + "/" + paths+".csv") 61 | } else { 62 | logger.info("[数据采集]:[外部下载]:[WRITE]:未配置sftp 服务器,输出到本地数据源") 63 | val file=new File(root_path) 64 | if(!file.exists()){ 65 | file.mkdirs() 66 | logger.info("[数据采集]:[外部下载]:[WRITE]:输出到本地数据源,自动创建目录"+root_path) 67 | } 68 | writeFile3(spark, df_tmp, root_path + "/" + paths, null); 69 | } 70 | 71 | 72 | df_tmp 73 | 74 | } catch { 75 | case ex: Exception => { 76 | ex.printStackTrace() 77 | logger.error("[数据采集]:[外部下载]:[WRITE]:[ERROR]:" + ex.getMessage.replace("\"","'")) 78 | throw ex 79 | } 80 | } 81 | } 82 | 83 | def writeFile2(spark: SparkSession, dataset: Dataset[_], path: String, style: Map[String, String], sheetName: String = "sheet1"): Unit = { 84 | import spark.implicits._ 85 | val columns = dataset.columns 86 | val rs = dataset.na.fill("").select(array(col("*"))).as[Array[String]].collect() 87 | 88 | println("生成文件数据量:" + rs.length + ",文件路径:" + path) 89 | val sxssf = new SXSSFWorkbook() 90 | // val cellStyle = getHeaderStyle(sxssf,style.getOrElse("headerColor","52").toShort) 91 | 92 | val sheet = sxssf.createSheet(sheetName) 93 | 94 | val header = sheet.createRow(0) 95 | for (j <- 0 until columns.length) { 96 | sheet.setColumnWidth(j, 5000) 97 | val cell = header.createCell(j) 98 | cell.setCellValue(columns(j)) 99 | } 100 | 101 | for (i <- 0 until rs.length) { 102 | val row = sheet.createRow(i + 1) 103 | for (j <- 0 until columns.length) { 104 | var cell1 = row.createCell(j) 105 | cell1.setCellValue(rs(i)(j).toString) 106 | } 107 | } 108 | var out = new FileOutputStream(path + ".csv"); 109 | var a = new OutputStreamWriter(out, "utf-8") 110 | sxssf.write(out); 111 | out.close(); 112 | sxssf.close() 113 | 114 | } 115 | 116 | def writeFile3(spark: SparkSession, dataset: Dataset[_], path: String, style: Map[String, String], sheetName: String = "sheet1"): Unit = { 117 | 118 | val writeFile = new File(path + ".csv"); 119 | import spark.implicits._ 120 | val columns = dataset.columns 121 | val rs = dataset.na.fill("").select(array(col("*"))).as[Array[String]].collect() 122 | println("生成文件数据量:" + rs.length + ",文件路径:" + path + ".csv") 123 | try { 124 | //第二步:通过BufferedReader类创建一个使用默认大小输出缓冲区的缓冲字符输出流 125 | val writeText = new BufferedWriter(new FileWriter(writeFile)); 126 | 127 | //第三步:将文档的下一行数据赋值给lineData,并判断是否为空,若不为空则输出 128 | 129 | writeText.write(columns.mkString(",")); 130 | for (i <- 0 until rs.length) { 131 | writeText.newLine(); //换行 132 | writeText.write(rs(i).mkString(",")); 133 | } 134 | //使用缓冲区的刷新方法将数据刷到目的地中 135 | writeText.flush(); 136 | //关闭缓冲区,缓冲区没有调用系统底层资源,真正调用底层资源的是FileWriter对象,缓冲区仅仅是一个提高效率的作用 137 | //因此,此处的close()方法关闭的是被缓存的流对象 138 | writeText.close(); 139 | } catch { 140 | case e: Exception => { 141 | e.printStackTrace() 142 | throw e 143 | } 144 | } 145 | } 146 | 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/ESDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.zdh.ZdhDataSources 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 6 | import org.slf4j.LoggerFactory 7 | 8 | object ESDataSources extends ZdhDataSources { 9 | 10 | val logger = LoggerFactory.getLogger(this.getClass) 11 | 12 | 13 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 14 | inputCondition: String, inputCols: Array[String], 
duplicateCols:Array[String],outPut: String, outputOptionions: Map[String, String], 15 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame ={ 16 | try { 17 | logger.info("[数据采集]:输入源为[ES],开始匹配对应参数") 18 | 19 | 20 | val path=inputOptions.getOrElse("paths","").toString 21 | if(path.trim.equals("")){ 22 | throw new Exception("[zdh],es数据源读取:paths为空") 23 | } 24 | 25 | val url: String = inputOptions.getOrElse("url", "").toString 26 | 27 | if(url.trim.equals("") || !url.trim.contains(":")){ 28 | throw new Exception("[zdh],es数据源读取:url为空或者不是ip:port 格式") 29 | } 30 | 31 | val nodes: String = inputOptions.getOrElse("es.nodes", url.split(":")(0)).toString 32 | if(nodes.trim.equals("")){ 33 | throw new Exception("[zdh],es数据源读取:nodes为空") 34 | } 35 | 36 | val port: String = inputOptions.getOrElse("es.port", url.split(":")(1)).toString 37 | if(port.trim.equals("")){ 38 | throw new Exception("[zdh],es数据源读取:port为空") 39 | } 40 | 41 | 42 | logger.info("[数据采集]:[ES]:[READ]:路径:" + path + "," + inputOptions.mkString(",") + " [FILTER]:" + inputCondition) 43 | 44 | // cfg.put("es.nodes", "192.168.56.12"); 45 | // cfg.put("es.port", "9200"); 46 | 47 | //获取jdbc 配置 48 | val df=spark.read.format("org.elasticsearch.spark.sql").options(inputOptions.+("es.nodes"->nodes).+("es.port"->port)).load(path) 49 | 50 | filter(spark,df,inputCondition,duplicateCols) 51 | 52 | } catch { 53 | case ex: Exception => { 54 | logger.error("[数据采集]:[ES]:[READ]:路径:" + inputOptions.getOrElse("paths","").toString+ "[ERROR]:" + ex.getMessage) 55 | throw ex 56 | } 57 | } 58 | } 59 | 60 | /** 61 | * 62 | * @param spark 63 | * @param df 64 | * @param select 65 | * @param dispatch_task_id 66 | * @return 67 | */ 68 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id:String): DataFrame = { 69 | try{ 70 | logger.info("[数据采集]:[ES]:[SELECT]:"+select.mkString(",")) 71 | if(select==null || select.isEmpty){ 72 | logger.debug("[数据采集]:[ES]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 73 | return df 74 | } 75 | df.select(select: _*) 76 | }catch { 77 | case ex:Exception=>{ 78 | logger.error("[数据采集]:[ES]:[SELECT]:[ERROR]:"+ex.getMessage) 79 | throw ex 80 | } 81 | } 82 | 83 | } 84 | 85 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 86 | 87 | writeDS(spark,df,options.getOrElse("paths",""),options) 88 | } 89 | 90 | /** 91 | * 写入ES 92 | * @param spark 93 | * @param df 94 | * @param path 95 | * @param options 96 | * @param dispatch_task_id 97 | */ 98 | def writeDS(spark: SparkSession, df: DataFrame, path: String, options: Map[String, String])(implicit dispatch_task_id: String): Unit = { 99 | try { 100 | logger.info("[数据采集]:[ES]:[WRITE]:路径:" + path + "," + options.mkString(",")) 101 | val url: String = options.getOrElse("url", "").toString 102 | 103 | if(url.trim.equals("") || !url.trim.contains(":")){ 104 | throw new Exception("[zdh],es数据源写入:url为空或者不是ip:port 格式") 105 | } 106 | 107 | val nodes: String = options.getOrElse("es.nodes", url.split(":")(0)).toString 108 | if(nodes.trim.equals("")){ 109 | throw new Exception("[zdh],es数据源写入:nodes为空") 110 | } 111 | 112 | val port: String = options.getOrElse("es.port", url.split(":")(1)).toString 113 | if(port.trim.equals("")){ 114 | throw new Exception("[zdh],es数据源写入:port为空") 115 | } 116 | 117 | val opt=options.+(("es.nodes"->nodes),("es.port"->port)) 118 | 119 | //合并小文件操作 120 | var df_tmp = 
merge(spark,df,options) 121 | 122 | if(options.getOrElse("mode","").equals("")){ 123 | df_tmp.write.format("org.elasticsearch.spark.sql").options(opt).save(path) 124 | }else{ 125 | df_tmp.write.format("org.elasticsearch.spark.sql").options(opt).mode(options.getOrElse("mode","")).save(path) 126 | } 127 | 128 | } catch { 129 | case ex: Exception => { 130 | logger.error("[数据采集]:[ES]:[WRITE]:路径:" +path + "," + "[ERROR]:" + ex.getMessage.replace("\"","'")) 131 | throw ex 132 | } 133 | } 134 | 135 | } 136 | 137 | } 138 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/FlumeDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.net.InetSocketAddress 4 | import java.util.concurrent.{LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit} 5 | 6 | import com.zyc.base.util.JsonSchemaBuilder 7 | import com.zyc.zdh.{DataSources, ZdhDataSources} 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 10 | import org.apache.spark.storage.StorageLevel 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.flume.FlumeUtils 13 | import org.slf4j.LoggerFactory 14 | 15 | //Kafka spark-streaming-kafka-0-10_2.12 16 | //Flume spark-streaming-flume_2.12 17 | //Kinesis 18 | //spark-streaming-kinesis-asl_2.12 [Amazon Software License] 19 | 20 | object FlumeDataSources extends ZdhDataSources { 21 | 22 | val logger = LoggerFactory.getLogger(this.getClass) 23 | 24 | val flumeInstance = new java.util.concurrent.ConcurrentHashMap[String, StreamingContext]() 25 | 26 | //程线程池 27 | private val threadpool = new ThreadPoolExecutor( 28 | 1, // core pool size 29 | 10, // max pool size 30 | 500, // keep alive time 31 | TimeUnit.MILLISECONDS, 32 | new LinkedBlockingQueue[Runnable]() 33 | ) 34 | 35 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 36 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], 37 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 38 | logger.info("[数据采集]:输入源为[FLUME],开始匹配对应参数") 39 | val brokers = inputOptions.getOrElse("url", "") 40 | if (brokers.equals("")) { 41 | throw new Exception("[zdh],flume数据源读取:url为空") 42 | } 43 | 44 | if (!outPut.toLowerCase.equals("jdbc")) { 45 | throw new Exception("[zdh],flume数据源读取:输出数据源只支持jdbc数据源,请修改输出数据源为jdbc") 46 | } 47 | 48 | 49 | createFlumeDataSources(spark, brokers, "", "", inputOptions, inputCols,outPut, outputCols, outputOptionions, inputCondition,sql) 50 | 51 | null 52 | } 53 | 54 | def createFlumeDataSources(spark: SparkSession, brokers: String, topics: String, groupId: String, options: Map[String, String], cols: Array[String], 55 | outPut:String, 56 | outputCols: Array[Map[String, String]], outputOptions: Map[String, String], 57 | inputCondition: String,sql:String)(implicit dispatch_task_id: String): Unit = { 58 | logger.info("[数据采集]:[FLUME]:[READ]:其他参数:," + options.mkString(",") + " [FILTER]:" + inputCondition) 59 | //获取jdbc 配置 60 | if (flumeInstance.size() < 10) { 61 | 62 | threadpool.execute(new Runnable { 63 | override def run(): Unit = { 64 | import org.apache.spark.streaming._ 65 | import spark.implicits._ 66 | val ssc = new StreamingContext(spark.sparkContext, Seconds(5)) 67 | 
flumeInstance.put(dispatch_task_id, ssc) 68 | 69 | val address = brokers.split(",").map(f => new InetSocketAddress(f.split(":")(0), f.split(":")(1).toInt)).toSeq 70 | 71 | val stream = FlumeUtils.createPollingStream(ssc, address, StorageLevel.MEMORY_ONLY_SER_2) 72 | val sep = options.getOrElse("sep", ",") 73 | //判断消息类型 74 | val msgType = options.getOrElse("msgType", "csv") 75 | //多行模式 76 | val multiline = options.getOrElse("multiline", "false") 77 | 78 | val ncols = cols.zipWithIndex.map(f => trim(col("value").getItem(f._2)) as f._1) 79 | val message = stream.map(f => new String(f.event.getBody.array())) 80 | 81 | message.foreachRDD(rdd => { 82 | var tmp: DataFrame = null 83 | if (msgType.equals("csv")) { 84 | tmp = rdd.map(f=>f.split(sep)).toDF("value").select(ncols: _*) 85 | } else { 86 | val schema = JsonSchemaBuilder.getJsonSchema(cols.mkString(",")) 87 | import spark.implicits._ 88 | val outCols = outputCols.map(f => expr(f.getOrElse("column_expr", "")) as f.getOrElse("column_alias", "")) 89 | tmp = rdd.toDF("value").as[String].select(from_json(col("value").cast("string"), schema) as "data") 90 | .select($"data.*").select(outCols: _*) 91 | } 92 | 93 | if (tmp != null && !tmp.isEmpty) 94 | DataSources.outPutHandler(spark, tmp, outPut, outputOptions, outputCols, sql) 95 | }) 96 | 97 | ssc.start() 98 | ssc.awaitTermination() 99 | } 100 | }) 101 | 102 | 103 | } else { 104 | 105 | } 106 | 107 | } 108 | 109 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 110 | null 111 | } 112 | 113 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 114 | logger.info("[数据采集]:[FLUME]:[WRITE]:") 115 | throw new Exception("[数据采集]:[FLUME]:[WRITE]:[ERROR]:不支持写入flume数据源") 116 | 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/FtpDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.io.{FileWriter, PrintWriter} 4 | import java.net.URL 5 | 6 | import com.zyc.base.util.JsonSchemaBuilder 7 | import com.zyc.zdh.ZdhDataSources 8 | import org.apache.spark.SparkFiles 9 | import org.apache.spark.sql.functions.{col, from_json} 10 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 11 | import org.slf4j.LoggerFactory 12 | 13 | object FtpDataSources extends ZdhDataSources { 14 | val logger = LoggerFactory.getLogger(this.getClass) 15 | 16 | 17 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 18 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], 19 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 20 | try { 21 | logger.info("[数据采集]:[FTP]:匹配文件格式") 22 | var url=inputOptions.getOrElse("url","") 23 | var port="21" 24 | if(url.contains(":")){ 25 | port=url.split(":")(1) 26 | url=url.split(":")(0) 27 | } 28 | if(url.trim.equals("")){ 29 | throw new Exception("[zdh],ftp数据源读取:url为空") 30 | } 31 | 32 | var paths=inputOptions.getOrElse("paths","") 33 | 34 | if(paths.trim.equals("")){ 35 | throw new Exception("[zdh],ftp数据源读取:paths为空") 36 | } 37 | if(!paths.startsWith("/")){ 38 | paths = "/"+paths 39 | } 40 | 41 | val 
sep=inputOptions.getOrElse("sep",",") 42 | 43 | val username=inputOptions.getOrElse("user","") 44 | val password=inputOptions.getOrElse("password","") 45 | 46 | val fileType=inputOptions.getOrElse("fileType", "csv").toString.toLowerCase 47 | 48 | logger.info("[数据采集]:[FTP]:[CSV]:[READ]:分割符为多位" + sep + ",如果是以下符号会自动转义( )*+ -/ [ ] { } ? ^ | .") 49 | if (inputCols == null || inputCols.isEmpty) { 50 | throw new Exception("[数据采集]:[FTP]:[CSV]:[READ]:分割符为多位" + sep + ",数据结构必须由外部指定") 51 | } 52 | var sep_tmp = sep.replace("\\", "\\\\") 53 | if (sep_tmp.contains('$')) { 54 | sep_tmp = sep_tmp.replace("$", "\\$") 55 | } 56 | if (sep_tmp.contains('(') || sep_tmp.contains(')')) { 57 | sep_tmp = sep_tmp.replace("(", "\\(").replace(")", "\\)") 58 | } 59 | if (sep_tmp.contains('*')) { 60 | sep_tmp = sep_tmp.replace("*", "\\*") 61 | } 62 | if (sep_tmp.contains('+')) { 63 | sep_tmp = sep_tmp.replace("+", "\\+") 64 | } 65 | if (sep_tmp.contains('-')) { 66 | sep_tmp = sep_tmp.replace("-", "\\-") 67 | } 68 | if (sep_tmp.contains('[') || sep_tmp.contains(']')) { 69 | sep_tmp = sep_tmp.replace("[", "\\[").replace("]", "\\]") 70 | } 71 | if (sep_tmp.contains('{') || sep_tmp.contains('}')) { 72 | sep_tmp = sep_tmp.replace("{", "\\{").replace("}", "\\}") 73 | } 74 | if (sep_tmp.contains('^')) { 75 | sep_tmp = sep_tmp.replace("^", "\\^") 76 | } 77 | if (sep_tmp.contains('|')) { 78 | sep_tmp = sep_tmp.replace("|", "\\|") 79 | } 80 | 81 | var ncols = inputCols.zipWithIndex.map(f => col("value").getItem(f._2) as f._1) 82 | //ncols2=ds.columns.mkString(",").split(sep_tmp).zipWithIndex.map(f => col("value").getItem(f._2) as f._1) 83 | val schema=JsonSchemaBuilder.getJsonSchema(inputCols.mkString(",")) 84 | logger.info("[数据采集]:[FTP]:[READ]:paths:"+url+":"+port+paths) 85 | var filename = s"ftp://${username}:${password}@${url}${paths}" 86 | import spark.implicits._ 87 | spark.sparkContext.addFile(filename) 88 | 89 | var df:DataFrame = spark.emptyDataFrame 90 | var df_tmp = spark.sparkContext.textFile(SparkFiles.get(filename.split("/").last)) 91 | if(fileType.equalsIgnoreCase("csv")){ 92 | df = df_tmp.map(line=>line.split(sep_tmp)).toDF("value") 93 | .select(ncols: _*) 94 | .filter(col(inputCols.head) =!= inputCols.head) //过滤表头信息 95 | } 96 | if(fileType.equalsIgnoreCase("json")){ 97 | df = df_tmp.toDF("value") 98 | .as[String].select(from_json(col("value").cast("string"), schema) as "data") 99 | .select(col("data.*")) 100 | } 101 | 102 | 103 | filter(spark,df,inputCondition,duplicateCols) 104 | } catch { 105 | case ex: Exception => { 106 | ex.printStackTrace() 107 | logger.error("[数据采集]:[FTP]:[READ]:[ERROR]:" + ex.getMessage.replace("\"","'")) 108 | throw ex 109 | } 110 | } 111 | } 112 | 113 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 114 | try{ 115 | logger.info("[数据采集]:[FTP]:[SELECT]") 116 | logger.debug("[数据采集]:[FTP]:[SELECT]:"+select.mkString(",")) 117 | if(select==null || select.isEmpty){ 118 | logger.debug("[数据采集]:[FTP]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 119 | return df 120 | } 121 | df.select(select: _*) 122 | }catch { 123 | case ex:Exception=>{ 124 | logger.error("[数据采集]:[FTP]:[SELECT]:[ERROR]"+ex.getMessage.replace("\"","'")) 125 | throw ex 126 | } 127 | } 128 | } 129 | 130 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 131 | try { 132 | 
logger.info("[数据采集]:[FTP]:[WRITE]:[options]:"+options.mkString(",")) 133 | var url=options.getOrElse("url","") 134 | var port="21" 135 | if(url.contains(":")){ 136 | port=url.split(":")(1) 137 | url=url.split(":")(0) 138 | } 139 | 140 | var paths=options.getOrElse("paths","") 141 | val sep=options.getOrElse("sep",",") 142 | if(!paths.startsWith("/")){ 143 | paths = "/"+paths 144 | } 145 | val username=options.getOrElse("user","") 146 | val password=options.getOrElse("password","") 147 | 148 | val filtType=options.getOrElse("fileType", "csv").toString.toLowerCase 149 | 150 | //合并小文件操作 151 | var df_tmp = merge(spark,df,options) 152 | var filename = s"ftp://${username}:${password}@${url}:21${paths}" 153 | logger.info("[数据采集]:[FTP]:[WRITE]:当前只支持覆盖写入ftp文件,不支持追加") 154 | val header = df_tmp.columns.mkString(sep) 155 | df_tmp.repartition(1) 156 | .foreachPartition(rows=>{ 157 | val ftp = new URL(filename) 158 | val pw = new PrintWriter(ftp.openConnection().getOutputStream) 159 | pw.write(header+"\n") 160 | rows.foreach(row=> pw.write(row.mkString(sep)+"\n")) 161 | pw.flush() 162 | pw.close() 163 | }) 164 | df_tmp 165 | } catch { 166 | case ex: Exception => { 167 | ex.printStackTrace() 168 | logger.error("[数据采集]:[FTP]:[WRITE]:[ERROR]:" + ex.getMessage.replace("\"","'")) 169 | throw ex 170 | } 171 | } 172 | } 173 | 174 | 175 | } 176 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/GreenplumDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.util.Properties 4 | 5 | import com.zyc.zdh.ZdhDataSources 6 | import org.apache.spark.sql.types.StructField 7 | import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession} 8 | import org.slf4j.LoggerFactory 9 | 10 | /** 11 | * 使用此数据源连接所有的jdbc数据,包括hive,mysql,oracle 等 12 | */ 13 | object GreenplumDataSources extends ZdhDataSources{ 14 | 15 | val logger=LoggerFactory.getLogger(this.getClass) 16 | 17 | /** 18 | * 获取数据源schema 19 | * 20 | * @param spark 21 | * @param options 22 | * @return 23 | */ 24 | override def getSchema(spark: SparkSession, options: Map[String,String])(implicit dispatch_task_id:String): Array[StructField] = { 25 | logger.info("[数据采集]:[Greenplum]:[SCHEMA]:"+options.mkString(",")) 26 | var dbtable: String = options.getOrElse("dbtable", "").toString 27 | val paths = options.getOrElse("paths", "").toString 28 | if (paths.trim.equals("")) { 29 | throw new Exception("[zdh],Greenplum数据源读取:paths为空") 30 | } 31 | var dbschema="" 32 | if(paths.contains(".")){ 33 | dbtable=paths.split(".")(1) 34 | dbschema=paths.split(".")(0) 35 | }else{ 36 | dbtable=paths 37 | } 38 | var tmpOptions=options.+("dbtable"->dbtable,"dbschema"->dbschema) 39 | spark.read.format("greenplum").options(tmpOptions).load().schema.fields 40 | } 41 | 42 | 43 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 44 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], 45 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 46 | var tmpOptions=inputOptions 47 | try{ 48 | logger.info("[数据采集]:输入源为[Greenplum],开始匹配对应参数") 49 | val url: String = inputOptions.getOrElse("url", "").toString 50 | if(url.trim.equals("")){ 51 | throw new Exception("[zdh],Greenplum数据源读取:url为空") 52 | } 53 | var dbtable: String = 
inputOptions.getOrElse("dbtable", "").toString 54 | val paths = inputOptions.getOrElse("paths", "").toString 55 | if (paths.trim.equals("")) { 56 | throw new Exception("[zdh],Greenplum数据源读取:paths为空") 57 | } 58 | var dbschema="" 59 | if(paths.contains(".")){ 60 | dbtable=paths.split("\\.")(1) 61 | dbschema=paths.split("\\.")(0) 62 | }else{ 63 | dbtable=paths 64 | } 65 | 66 | val user: String = inputOptions.getOrElse("user", "").toString 67 | if(user.trim.equals("")){ 68 | logger.info("[zdh],Greenplum数据源读取:user为空") 69 | // throw new Exception("[zdh],jdbc数据源读取:user为空") 70 | } 71 | val password: String = inputOptions.getOrElse("password", "").toString 72 | if(password.trim.equals("")){ 73 | logger.info("[zdh],Greenplum数据源读取:password为空") 74 | // throw new Exception("[zdh],jdbc数据源读取:password为空") 75 | } 76 | // val driver: String = inputOptions.getOrElse("driver", "").toString 77 | // if(driver.trim.equals("")){ 78 | // throw new Exception("[zdh],Greenplum数据源读取:driver为空") 79 | // } 80 | 81 | tmpOptions=inputOptions.+("dbtable"->dbtable,"dbschema"->dbschema) 82 | 83 | logger.info("[数据采集]:[Greenplum]:[READ]:表名:"+tmpOptions.getOrElse("dbtable","")+","+tmpOptions.mkString(",")+" [FILTER]:"+inputCondition) 84 | //获取jdbc 配置 85 | //https://github.com/kongyew/greenplum-spark-connector/blob/master/usecase1/README.MD 86 | var format="greenplum" 87 | var df:DataFrame=spark.read.format(format).options(tmpOptions).load() 88 | 89 | filter(spark,df,inputCondition,duplicateCols) 90 | 91 | }catch { 92 | case ex:Exception=>{ 93 | logger.error("[数据采集]:[Greenplum]:[READ]:表名:"+tmpOptions.getOrElse("paths","")+"[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 94 | throw ex 95 | } 96 | } 97 | 98 | } 99 | 100 | /** 101 | * 读取数据源之后的字段映射 102 | * @param spark 103 | * @param df 104 | * @param select 105 | * @return 106 | */ 107 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id:String): DataFrame = { 108 | try{ 109 | logger.info("[数据采集]:[Greenplum]:[SELECT]") 110 | logger.debug("[数据采集]:[Greenplum]:[SELECT]:"+select.mkString(",")) 111 | if(select==null || select.isEmpty){ 112 | logger.debug("[数据采集]:[Greenplum]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 113 | return df 114 | } 115 | df.select(select: _*) 116 | }catch { 117 | case ex:Exception=>{ 118 | logger.error("[数据采集]:[Greenplum]:[SELECT]:[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 119 | throw ex 120 | } 121 | } 122 | 123 | } 124 | 125 | 126 | override def writeDS(spark: SparkSession,df:DataFrame,options: Map[String,String], sql: String)(implicit dispatch_task_id:String): Unit = { 127 | try{ 128 | logger.info("[数据采集]:[JDBC]:[WRITE]:表名:"+options.getOrElse("paths","")+","+options.mkString(",")) 129 | 130 | val paths = options.getOrElse("paths", "").toString 131 | if (paths.trim.equals("")) { 132 | throw new Exception("[zdh],Greenplum数据源读取:paths为空") 133 | } 134 | var dbtable: String = paths 135 | var dbschema="" 136 | if(paths.contains(".")){ 137 | dbtable=paths.split(".")(1) 138 | dbschema=paths.split(".")(0) 139 | } 140 | val url=options.getOrElse("url","") 141 | if(!sql.equals("")){ 142 | deleteJDBC(spark,url,options,sql) 143 | } 144 | 145 | var tmpOptions=options.+("dbtable"->dbtable,"dbschema"->dbschema) 146 | var format="greenplum" 147 | //合并小文件操作 148 | var df_tmp = merge(spark,df,options) 149 | df_tmp.write.format(format).mode(SaveMode.Append).options(tmpOptions).save() 150 | 151 | }catch { 152 | case ex:Exception=>{ 153 | ex.printStackTrace() 154 | 
logger.info("[数据采集]:[JDBC]:[WRITE]:表名:"+options.getOrElse("dbtable","")+","+"[ERROR]:"+ex.getMessage.replace("\"","'")) 155 | throw ex 156 | } 157 | } 158 | 159 | } 160 | 161 | 162 | /** 163 | * 写入jdbc 之前 清空特定数据 164 | * 165 | * @param spark 166 | * @param url 167 | * @param options 168 | * @param sql 169 | */ 170 | def deleteJDBC(spark: SparkSession, url: String, options: Map[String,String], sql: String)(implicit dispatch_task_id:String): Unit = { 171 | logger.info("[数据采集]:[Greenplum]:[CLEAR]:url:"+url+","+options.mkString(",")+",sql:"+sql) 172 | import scala.collection.JavaConverters._ 173 | val properties=new Properties() 174 | properties.putAll(options.asJava) 175 | var driver = properties.getProperty("driver", "") 176 | if (driver.equals("")) { 177 | driver = getDriver(url) 178 | } 179 | Class.forName(driver) 180 | var cn: java.sql.Connection = null 181 | var ps: java.sql.PreparedStatement = null 182 | try { 183 | cn = java.sql.DriverManager.getConnection(url, properties) 184 | ps = cn.prepareStatement(sql) 185 | ps.execute() 186 | ps.close() 187 | cn.close() 188 | } 189 | catch { 190 | case ex: Exception => { 191 | ps.close() 192 | cn.close() 193 | if(ex.getMessage.replace("\"","'").contains("doesn't exist") || ex.getMessage.replace("\"","'").contains("Unknown table")){ 194 | logger.warn("[数据采集]:[Greenplum]:[CLEAR]:[WARN]:"+ex.getMessage.replace("\"","'")) 195 | }else{ 196 | throw ex 197 | } 198 | } 199 | } 200 | } 201 | 202 | 203 | def getDriver(url: String): String = { 204 | 205 | url match { 206 | case u if u.toLowerCase.contains("jdbc:mysql") => "com.mysql.jdbc.Driver" 207 | case _ => "" 208 | } 209 | 210 | 211 | } 212 | 213 | } 214 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/HiveDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.zdh.ZdhDataSources 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.StructField 6 | import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession} 7 | import org.slf4j.LoggerFactory 8 | 9 | /** 10 | * 通过配置文件方式读写hive 11 | */ 12 | object HiveDataSources extends ZdhDataSources{ 13 | 14 | val logger=LoggerFactory.getLogger(this.getClass) 15 | 16 | override def getSchema(spark: SparkSession, options: Map[String, String])(implicit dispatch_task_id:String): Array[StructField] = { 17 | logger.info(s"获取hive表的schema信息table:${options.getOrElse("table","")},option:${options.mkString(",")}") 18 | spark.table(options.getOrElse("table","")).schema.fields 19 | } 20 | 21 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 22 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 23 | try{ 24 | logger.info("[数据采集]:输入源为[HIVE]") 25 | val tableName=inputOptions.getOrElse("tableName","").toString 26 | if(tableName.trim.equals("")){ 27 | throw new Exception("[zdh],hive数据源读取:tableName为空") 28 | } 29 | 30 | logger.info("[数据采集]:[HIVE]:[READ]:[table]:"+tableName+"[FILTER]:"+inputCondition) 31 | val df=spark.table(tableName) 32 | filter(spark,df,inputCondition,duplicateCols) 33 | 34 | }catch { 35 | case ex:Exception=>{ 36 | logger.error("[数据采集]:[HIVE]:[READ]:[ERROR]:"+ex.getMessage.replace("\"","'")) 37 | 
throw ex 38 | } 39 | } 40 | } 41 | 42 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id:String): DataFrame ={ 43 | try{ 44 | logger.info("[数据采集]:[HIVE]:[SELECT]") 45 | logger.debug("[数据采集]:[HIVE]:[SELECT]:"+select.mkString(",")) 46 | if(select==null || select.isEmpty){ 47 | logger.debug("[数据采集]:[HIVE]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 48 | return df 49 | } 50 | df.select(select: _*) 51 | }catch { 52 | case ex:Exception=>{ 53 | logger.error("[数据采集]:[HIVE]:[SELECT]:[ERROR]"+ex.getMessage.replace("\"","'")) 54 | throw ex 55 | } 56 | } 57 | 58 | } 59 | 60 | override def writeDS(spark: SparkSession,df:DataFrame,options: Map[String,String], sql: String="")(implicit dispatch_task_id:String): Unit = { 61 | try{ 62 | logger.info("[数据采集]:[HIVE]:[WRITE]:[options]:"+options.mkString(",")) 63 | 64 | //默认是append 65 | val model=options.getOrElse("model","").toString.toLowerCase match { 66 | case "overwrite"=>SaveMode.Overwrite 67 | case "append"=>SaveMode.Append 68 | case "errorifexists"=>SaveMode.ErrorIfExists 69 | case "ignore"=>SaveMode.Ignore 70 | case _=>SaveMode.Append 71 | } 72 | 73 | //如果需要建立外部表需要options中另外传入path 参数 example 外部表t1 path:/dir1/dir2/t1 74 | val format=options.getOrElse("format","orc") 75 | val tableName=options.getOrElse("paths","") 76 | val partitionBy=options.getOrElse("partitionBy","") 77 | 78 | //合并小文件操作 79 | var df_tmp = merge(spark,df,options) 80 | 81 | if(spark.catalog.tableExists(tableName)){ 82 | val cols=spark.table(tableName).columns 83 | df_tmp.select(cols.map(col(_)):_*) 84 | .write.mode(model).insertInto(tableName) 85 | }else{ 86 | if(partitionBy.equals("")){ 87 | df_tmp.write.mode(model).format(format).options(options).saveAsTable(tableName) 88 | }else{ 89 | df_tmp.write.mode(model).format(format).partitionBy(partitionBy).options(options).saveAsTable(tableName) 90 | } 91 | 92 | } 93 | }catch { 94 | case ex:Exception=>{ 95 | ex.printStackTrace() 96 | logger.error("[数据采集]:[HIVE]:[WRITE]:[ERROR]:"+ex.getMessage.replace("\"","'")) 97 | throw ex 98 | } 99 | } 100 | 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/HttpDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.zdh.ZdhDataSources 4 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 5 | import org.slf4j.LoggerFactory 6 | 7 | object HttpDataSources extends ZdhDataSources { 8 | 9 | val logger = LoggerFactory.getLogger(this.getClass) 10 | 11 | val http_source = "com.zyc.zdh.datasources.http.HttpRelationProvider" 12 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 13 | inputCondition: String, inputCols: Array[String], duplicateCols:Array[String],outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 14 | try{ 15 | logger.info("[数据采集]:输入源为[HTTP],开始匹配对应参数") 16 | val url: String = inputOptions.getOrElse("url", "").toString 17 | 18 | if(url.trim.equals("")){ 19 | throw new Exception("[zdh],http数据源读取:url为空") 20 | } 21 | 22 | val paths = inputOptions.getOrElse("paths", "").toString 23 | if (paths.trim.equals("")) { 24 | throw new Exception("[zdh],http数据源读取:paths为空") 25 | } 26 | 27 | 28 | var df=spark.read.format(http_source) 29 | 
.options(inputOptions) 30 | .option("url",url) 31 | .option("schema",inputCols.mkString(",")) 32 | .option("paths",paths) 33 | .load() 34 | 35 | filter(spark,df,inputCondition,duplicateCols) 36 | }catch { 37 | case ex:Exception=>{ 38 | logger.error("[数据采集]:[HTTP]:[READ]:[ERROR]:"+ex.getMessage.replace("\"","'")) 39 | throw ex 40 | } 41 | } 42 | 43 | 44 | } 45 | 46 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 47 | try{ 48 | logger.info("[数据采集]:[HTTP]:[SELECT]") 49 | logger.debug("[数据采集]:[HTTP]:[SELECT]:"+select.mkString(",")) 50 | if(select==null || select.isEmpty){ 51 | logger.debug("[数据采集]:[HTTP]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 52 | return df 53 | } 54 | df.select(select: _*) 55 | }catch { 56 | case ex:Exception=>{ 57 | logger.error("[数据采集]:[HTTP]:[SELECT]:[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 58 | throw ex 59 | } 60 | } 61 | } 62 | 63 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 64 | logger.info("[数据采集]:[HTTP]:[WRITE]:") 65 | throw new Exception("[数据采集]:[HTTP]:[WRITE]:[ERROR]:不支持写入solr数据源") 66 | } 67 | 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/ImageDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.zdh.ZdhDataSources 4 | import org.slf4j.LoggerFactory 5 | 6 | object ImageDataSources extends ZdhDataSources { 7 | 8 | val logger=LoggerFactory.getLogger(this.getClass) 9 | 10 | 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/KuduDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.zdh.ZdhDataSources 4 | import org.apache.kudu.client.CreateTableOptions 5 | import org.apache.kudu.spark.kudu.KuduContext 6 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 9 | import org.slf4j.LoggerFactory 10 | 11 | object KuduDataSources extends ZdhDataSources { 12 | 13 | val logger = LoggerFactory.getLogger(this.getClass) 14 | 15 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 16 | inputCondition: String, inputCols: Array[String], duplicateCols:Array[String],outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 17 | 18 | try{ 19 | logger.info("[数据采集]:[KUDU]:匹配文件格式") 20 | val url: String = inputOptions.getOrElse("url", "").toString 21 | if (url.trim.equals("")) { 22 | throw new Exception("[zdh],kudu数据源读取:url为空") 23 | } 24 | val paths = inputOptions.getOrElse("paths", "").toString 25 | 26 | if (paths.trim.equals("")) { 27 | throw new Exception("[zdh],kudu数据源读取:paths为空") 28 | } 29 | 30 | logger.info("[数据采集]:[KUDU]:[READ]:[TABLE]:"+paths+",[options]:"+inputOptions.mkString(",")) 31 | 32 | val kuduOptions: Map[String, String] = Map("kudu.table" -> paths, "kudu.master" -> url) 33 | 34 | import org.apache.kudu.spark.kudu._ 35 | val df = 
spark.read.options(kuduOptions).format("kudu").load() 36 | 37 | filter(spark,df,inputCondition,duplicateCols) 38 | 39 | }catch { 40 | case ex:Exception=>{ 41 | ex.printStackTrace() 42 | logger.error("[数据采集]:[KUDU]:[READ]:[TABLE]:[ERROR]:"+ex.getMessage.replace("\"","'")) 43 | throw ex 44 | } 45 | } 46 | 47 | } 48 | 49 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 50 | try{ 51 | logger.info("[数据采集]:[KUDU]:[SELECT]") 52 | logger.debug("[数据采集]:[KUDU]:[SELECT]:"+select.mkString(",")) 53 | if(select==null || select.isEmpty){ 54 | logger.debug("[数据采集]:[KUDU]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 55 | return df 56 | } 57 | df.select(select: _*) 58 | }catch { 59 | case ex:Exception=>{ 60 | logger.error("[数据采集]:[KUDU]:[SELECT]:[ERROR]"+ex.getMessage.replace("\"","'")) 61 | throw ex 62 | } 63 | } 64 | } 65 | 66 | 67 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 68 | 69 | try { 70 | import spark.implicits._ 71 | logger.info("[数据采集]:[KUDU]:[WRITE]:[options]:" + options.mkString(",")+",[FILTER]:"+sql+",[END]") 72 | val url: String = options.getOrElse("url", "").toString 73 | if (url.trim.equals("")) { 74 | throw new Exception("[zdh],kudu数据源写入:url为空") 75 | } 76 | val paths = options.getOrElse("paths", "").toString 77 | if (paths.trim.equals("")) { 78 | throw new Exception("[zdh],kudu数据源写入:paths为空") 79 | } 80 | 81 | if(!sql.trim.equals("")){ 82 | throw new Exception("[zdh],kudu数据源写入:暂不支持删除历史数据") 83 | } 84 | 85 | val primaryKey = options.getOrElse("primary_key", "zdh_auto_md5") 86 | .split(",",-1).toSeq 87 | val replicas = options.getOrElse("replicas", "1").toInt 88 | 89 | val kuduOptions: Map[String, String] = Map("kudu.table" -> paths, "kudu.master" -> url) 90 | 91 | val kuduContext = new KuduContext(url, spark.sparkContext) 92 | import org.apache.kudu.spark.kudu._ 93 | 94 | //合并小文件操作 95 | var df_tmp = merge(spark,df,options) 96 | 97 | var df_result=df_tmp 98 | var schema = df_result.schema 99 | if(primaryKey(0).equals("zdh_auto_md5")){ 100 | df_result=df_tmp.withColumn("zdh_auto_md5",md5(concat(rand(),current_timestamp()))) 101 | schema=df_result.schema 102 | }else{ 103 | schema=StructType(df_result.schema.fields.map(f=>if(primaryKey.contains(f.name)){ 104 | StructField(f.name,f.dataType,false,f.metadata) 105 | }else{ 106 | f 107 | })) 108 | } 109 | 110 | 111 | if (!kuduContext.tableExists(paths)) { 112 | logger.info("[数据采集]:[KUDU]:[WRITE]:写入表不存在,将自动创建表") 113 | val kuduTableOptions = new CreateTableOptions() 114 | import scala.collection.JavaConverters._ 115 | kuduTableOptions.setRangePartitionColumns(primaryKey.asJava).setNumReplicas(replicas); 116 | kuduContext.createTable(paths, schema, primaryKey, kuduTableOptions) 117 | logger.info("[数据采集]:[KUDU]:[WRITE]:完成自动创建表") 118 | } 119 | 120 | 121 | 122 | kuduContext.insertRows(df_result, paths) 123 | 124 | } catch { 125 | case ex: Exception => { 126 | ex.printStackTrace() 127 | logger.error("[数据采集]:[KUDU]:[WRITE]:[ERROR]:" + ex.getMessage.replace("\"","'")) 128 | throw ex 129 | } 130 | } 131 | 132 | } 133 | 134 | } 135 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/LocalDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.base.util.JsonSchemaBuilder 4 | import 
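// A minimal sketch of the options consumed by the Kudu writeDS above: "url" is the kudu master,
// "paths" the target table, and when primary_key is left out a zdh_auto_md5 column is generated as
// the key. The master address and table name are hypothetical.
// val kuduWriteOptions = Map(
//   "url"         -> "192.168.1.40:7051",   // kudu.master
//   "paths"       -> "demo_t_user",         // kudu.table, auto-created if it does not exist
//   "primary_key" -> "id",                  // comma separated key columns
//   "replicas"    -> "1"
// )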
com.zyc.zdh.ZdhDataSources 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.functions._ 7 | import org.slf4j.LoggerFactory 8 | 9 | object LocalDataSources extends ZdhDataSources { 10 | 11 | val logger = LoggerFactory.getLogger(this.getClass) 12 | 13 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 14 | inputCondition: String, inputCols: Array[String], duplicateCols:Array[String],outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 15 | try { 16 | logger.info("[数据采集]:输入源为[外部上传],开始匹配对应参数") 17 | 18 | var url = inputOptions.getOrElse("url", "") 19 | var port = "22" 20 | if (url.contains(":")) { 21 | port = url.split(":")(1) 22 | url = url.split(":")(0) 23 | } 24 | val sep = inputOptions.getOrElse("sep", ",") 25 | 26 | val username = inputOptions.getOrElse("user", "") 27 | val password = inputOptions.getOrElse("password", "") 28 | 29 | val paths = inputOptions.getOrElse("paths", "").toString 30 | if (paths.trim.equals("")) { 31 | throw new Exception("[zdh],外部上传数据源读取:paths为空") 32 | } 33 | 34 | val fileType = inputOptions.getOrElse("fileType", "csv").toString.toLowerCase 35 | 36 | val encoding = inputOptions.getOrElse("encoding", "") 37 | 38 | val schema = JsonSchemaBuilder.getJsonSchema(inputCols.mkString(",")) 39 | var df = spark.emptyDataFrame 40 | if (!url.equals("")) { 41 | //nginx 资源 42 | logger.error("[数据采集]:[外部上传]:[READ]:读取ftp资源") 43 | logger.error("[数据采集]:[外部上传]:[READ]:读取ftp资源,目前只支持utf-8 编码") 44 | df = spark.read. 45 | format("com.zyc.zdh.datasources.sftp.SftpSource"). 46 | schema(schema). 47 | options(inputOptions). 48 | option("inputCols",inputCols.mkString(",")). 49 | option("host", url). 50 | option("port", port). 51 | option("username", username). 52 | option("password", password). 53 | option("fileType", fileType). 54 | option("delimiter", sep). 
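// A minimal sketch of the inputOptions the "external upload" getDS above expects; the keys come
// from the code above, while host, credentials and path are hypothetical. Leaving "url" empty
// switches to the local-file branch below.
// val localInputOptions = Map(
//   "url"      -> "192.168.1.10:22",   // sftp host:port, "" means read from the local filesystem
//   "user"     -> "zdh",
//   "password" -> "******",
//   "paths"    -> "/data/upload/t_user.csv",
//   "fileType" -> "csv",
//   "sep"      -> ",",
//   "encoding" -> "utf-8"              // the sftp branch only supports utf-8
// )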
55 | load(paths) 56 | } else { 57 | //本地资源 58 | logger.error("[数据采集]:[外部上传]:[READ]:读取本地资源") 59 | if (!fileType.equals("orc") && !fileType.equals("parquet")) { 60 | df = spark.read.format(fileType).schema(schema).options(inputOptions).load("file:///" + paths) 61 | } else { 62 | df = spark.read.format(fileType).options(inputOptions).load("file:///" + paths).select(inputCols.map(col(_)): _*) 63 | } 64 | } 65 | 66 | filter(spark,df,inputCondition,duplicateCols) 67 | } catch { 68 | case ex: Exception => { 69 | logger.error("[数据采集]:[外部上传]:[READ]:[ERROR]:" + ex.getMessage.replace("\"","'")) 70 | throw ex 71 | } 72 | } 73 | } 74 | 75 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 76 | try { 77 | logger.info("[数据采集]:[外部上传]:[SELECT]") 78 | logger.debug("[数据采集]:[外部上传]:[SELECT]:" + select.mkString(",")) 79 | if(select==null || select.isEmpty){ 80 | logger.debug("[数据采集]:[外部上传]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 81 | return df 82 | } 83 | df.select(select: _*) 84 | } catch { 85 | case ex: Exception => { 86 | logger.error("[数据采集]:[外部上传]:[SELECT]:[ERROR]:" + ex.getMessage.replace("\"","'"), "error") 87 | throw ex 88 | } 89 | } 90 | } 91 | 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/MemSqlDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.util.Properties 4 | 5 | import com.zyc.zdh.ZdhDataSources 6 | import org.apache.spark.sql.types.StructField 7 | import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession} 8 | import org.slf4j.LoggerFactory 9 | 10 | /** 11 | * 使用此数据源连接所有的memsql数据,包括hive,mysql,oracle 等 12 | */ 13 | object MemSqlDataSources extends ZdhDataSources{ 14 | 15 | val logger=LoggerFactory.getLogger(this.getClass) 16 | 17 | /** 18 | * 获取数据源schema 19 | * 20 | * @param spark 21 | * @param options 22 | * @return 23 | */ 24 | override def getSchema(spark: SparkSession, options: Map[String,String])(implicit dispatch_task_id:String): Array[StructField] = { 25 | logger.info("[数据采集]:[MEMSQL]:[SCHEMA]:"+options.mkString(",")) 26 | spark.read.format("memsql").options(options).load().schema.fields 27 | } 28 | 29 | 30 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 31 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], 32 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 33 | try{ 34 | logger.info("[数据采集]:输入源为[MEMSQL],开始匹配对应参数") 35 | val url: String = inputOptions.getOrElse("url", "").toString 36 | if(url.trim.equals("")){ 37 | throw new Exception("[zdh],memsql数据源读取:url为空") 38 | } 39 | val dbtable: String = inputOptions.getOrElse("dbtable", "").toString 40 | if(dbtable.trim.equals("")){ 41 | throw new Exception("[zdh],memsql数据源读取:dbtable为空") 42 | } 43 | val user: String = inputOptions.getOrElse("user", "").toString 44 | if(user.trim.equals("")){ 45 | logger.info("[zdh],memsql数据源读取:user为空") 46 | // throw new Exception("[zdh],memsql数据源读取:user为空") 47 | } 48 | val password: String = inputOptions.getOrElse("password", "").toString 49 | if(password.trim.equals("")){ 50 | logger.info("[zdh],memsql数据源读取:password为空") 51 | // throw new Exception("[zdh],memsql数据源读取:password为空") 52 | } 53 
| val driver: String = inputOptions.getOrElse("driver", "").toString 54 | if(driver.trim.equals("")){ 55 | throw new Exception("[zdh],memsql数据源读取:driver为空") 56 | } 57 | val paths = inputOptions.getOrElse("paths", "").toString 58 | if(!paths.contains(".")){ 59 | throw new Exception("[zdh],memsql数据源读取:表名必须是database.table") 60 | } 61 | logger.info("[数据采集]:[MEMSQL]:[READ]:表名:"+paths+","+inputOptions.mkString(",")+" [FILTER]:"+inputCondition) 62 | //获取memsql 配置 63 | var format="memsql" 64 | if(inputOptions.getOrElse("url","").toLowerCase.contains("memsql:hive2:")){ 65 | format="org.apache.spark.sql.execution.datasources.hive.HiveRelationProvider" 66 | logger.info("[数据采集]:[MEMSQL]:[READ]:表名:"+inputOptions.getOrElse("dbtable","")+",使用自定义hive-memsql数据源") 67 | } 68 | if(inputOptions.getOrElse("url","").toLowerCase.contains("memsql:clickhouse:")){ 69 | format="org.apache.spark.sql.execution.datasources.clickhouse.ClickHouseRelationProvider" 70 | logger.info("[数据采集]:[MEMSQL]:[READ]:表名:"+inputOptions.getOrElse("dbtable","")+",使用自定义clickhouse-memsql数据源") 71 | } 72 | 73 | var df:DataFrame=spark.read 74 | .format(format) 75 | .option("ddlEndpoint", url) 76 | .option("user", user) 77 | .option("password",password) 78 | .options(inputOptions) 79 | .load(paths) 80 | 81 | filter(spark,df,inputCondition,duplicateCols) 82 | 83 | }catch { 84 | case ex:Exception=>{ 85 | logger.error("[数据采集]:[MEMSQL]:[READ]:表名:"+inputOptions.getOrElse("paths","")+"[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 86 | throw ex 87 | } 88 | } 89 | 90 | } 91 | 92 | /** 93 | * 读取数据源之后的字段映射 94 | * @param spark 95 | * @param df 96 | * @param select 97 | * @return 98 | */ 99 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id:String): DataFrame = { 100 | try{ 101 | logger.info("[数据采集]:[MEMSQL]:[SELECT]") 102 | logger.debug("[数据采集]:[MEMSQL]:[SELECT]:"+select.mkString(",")) 103 | if(select==null || select.isEmpty){ 104 | logger.debug("[数据采集]:[MEMSQL]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 105 | return df 106 | } 107 | df.select(select: _*) 108 | }catch { 109 | case ex:Exception=>{ 110 | logger.error("[数据采集]:[MEMSQL]:[SELECT]:[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 111 | throw ex 112 | } 113 | } 114 | 115 | } 116 | 117 | 118 | override def writeDS(spark: SparkSession,df:DataFrame,options: Map[String,String], sql: String)(implicit dispatch_task_id:String): Unit = { 119 | try{ 120 | logger.info("[数据采集]:[MEMSQL]:[WRITE]:表名:"+options.getOrElse("paths","")+","+options.mkString(",")) 121 | val url=options.getOrElse("url","") 122 | if(!sql.equals("")){ 123 | deletememsql(spark,url,options,sql) 124 | } 125 | 126 | val paths = options.getOrElse("paths", "").toString 127 | if(!paths.contains(".")){ 128 | throw new Exception("[zdh],memsql数据源写入:表名必须是database.table") 129 | } 130 | 131 | val model = options.getOrElse("model", "").toString.toLowerCase match { 132 | case "overwrite" => SaveMode.Overwrite 133 | case "append" => SaveMode.Append 134 | case "errorifexists" => SaveMode.ErrorIfExists 135 | case "ignore" => SaveMode.Ignore 136 | case _ => SaveMode.Append 137 | } 138 | 139 | //合并小文件操作 140 | var df_tmp = merge(spark,df,options) 141 | 142 | var format="memsql" 143 | df_tmp.write.format(format) 144 | .mode(model) 145 | .option("ddlEndpoint", url) 146 | .options(options).save(paths) 147 | 148 | }catch { 149 | case ex:Exception=>{ 150 | ex.printStackTrace() 151 | 
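// A minimal sketch of driving the MemSQL writeDS above, given an already built DataFrame df:
// "url" becomes the ddlEndpoint, "paths" must be database.table, "model" selects the SaveMode, and
// a non-empty sql argument is first run through deletememsql to clear old rows. Host, table name
// and dispatch id are hypothetical.
// val memsqlWriteOptions = Map(
//   "url"      -> "192.168.1.20:3306",
//   "user"     -> "root",
//   "password" -> "******",
//   "paths"    -> "demo_db.t_order",
//   "model"    -> "append"
// )
// MemSqlDataSources.writeDS(spark, df, memsqlWriteOptions,
//   "delete from demo_db.t_order where zdh_etl_date='2024-01-01'")("demo_task_001")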
logger.info("[数据采集]:[MEMSQL]:[WRITE]:表名:"+options.getOrElse("paths","")+","+"[ERROR]:"+ex.getMessage.replace("\"","'")) 152 | throw ex 153 | } 154 | } 155 | 156 | } 157 | 158 | 159 | /** 160 | * 写入memsql 之前 清空特定数据 161 | * 162 | * @param spark 163 | * @param url 164 | * @param options 165 | * @param sql 166 | */ 167 | def deletememsql(spark: SparkSession, url: String, options: Map[String,String], sql: String)(implicit dispatch_task_id:String): Unit = { 168 | logger.info("[数据采集]:[MEMSQL]:[CLEAR]:url:"+url+","+options.mkString(",")+",sql:"+sql) 169 | import scala.collection.JavaConverters._ 170 | val properties=new Properties() 171 | properties.putAll(options.asJava) 172 | var driver = properties.getProperty("driver", "org.mariadb.jdbc.Driver") 173 | Class.forName(driver) 174 | var cn: java.sql.Connection = null 175 | var ps: java.sql.PreparedStatement = null 176 | try { 177 | cn = java.sql.DriverManager.getConnection("jdbc:mariadb://"+url, properties) 178 | ps = cn.prepareStatement(sql) 179 | ps.execute() 180 | ps.close() 181 | cn.close() 182 | } 183 | catch { 184 | case ex: Exception => { 185 | ps.close() 186 | cn.close() 187 | if(ex.getMessage.replace("\"","'").contains("doesn't exist") || ex.getMessage.replace("\"","'").contains("Unknown table")){ 188 | logger.warn("[数据采集]:[MEMSQL]:[CLEAR]:[WARN]:"+ex.getMessage.replace("\"","'")) 189 | }else{ 190 | throw ex 191 | } 192 | } 193 | } 194 | } 195 | 196 | 197 | def getDriver(url: String): String = { 198 | 199 | url match { 200 | case u if u.toLowerCase.contains("jdbc:mariadb") => "org.mariadb.jdbc.Driver" 201 | case _ => "" 202 | } 203 | 204 | 205 | } 206 | 207 | } 208 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/MongoDBDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.io.StringReader 4 | import java.util 5 | 6 | import com.mongodb.{MongoClient, MongoClientURI} 7 | import com.zyc.zdh.ZdhDataSources 8 | import net.sf.jsqlparser.parser.CCJSqlParserManager 9 | import net.sf.jsqlparser.statement.delete.Delete 10 | import org.apache.spark.sql.types.StructField 11 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 12 | import org.slf4j.LoggerFactory 13 | 14 | object MongoDBDataSources extends ZdhDataSources { 15 | 16 | val logger=LoggerFactory.getLogger(this.getClass) 17 | 18 | override def getSchema(spark: SparkSession, options: Map[String, String])(implicit dispatch_task_id: String): Array[StructField] = super.getSchema(spark, options) 19 | 20 | 21 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 22 | inputCondition: String, inputCols: Array[String], duplicateCols:Array[String],outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 23 | 24 | try{ 25 | 26 | logger.info("[数据采集]:输入源为[MONGODB],开始匹配对应参数") 27 | val url: String = inputOptions.getOrElse("url", "").toString 28 | if(url.trim.equals("")){ 29 | throw new Exception("[zdh],MONGODB数据源读取:url为空") 30 | } 31 | var map=inputOptions.+("spark.mongodb.input.uri"->url) 32 | val paths=inputOptions.getOrElse("paths","").toString 33 | if(paths.trim.equals("")){ 34 | throw new Exception("[zdh],MONGODB数据源读取:paths为空") 35 | } 36 | map=map.+("spark.mongodb.input.collection"->paths) 37 | 38 | 39 | val 
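// A minimal sketch of the inputOptions for the MongoDB read above: getDS copies "url" into
// spark.mongodb.input.uri and "paths" into spark.mongodb.input.collection. The connection string
// and collection name are hypothetical.
// val mongoInputOptions = Map(
//   "url"   -> "mongodb://zdh:******@127.0.0.1:27017/demo_db",
//   "paths" -> "t_user"   // collection name
// )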
df=spark.read.format("mongo").options(map).load() 40 | filter(spark,df,inputCondition,duplicateCols) 41 | 42 | }catch { 43 | case ex:Exception=>{ 44 | logger.error("[数据采集]:[MONGODB]:[SELECT]:[ERROR]:"+ex.getMessage.replace("\"","'")) 45 | throw ex 46 | } 47 | } 48 | 49 | } 50 | 51 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 52 | try{ 53 | logger.info("[数据采集]:[MONGODB]:[SELECT]") 54 | logger.debug("[数据采集]:[MONGODB]:[SELECT]:"+select.mkString(",")) 55 | if(select==null || select.isEmpty){ 56 | logger.debug("[数据采集]:[MONGODB]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 57 | return df 58 | } 59 | df.select(select: _*) 60 | }catch { 61 | case ex:Exception=>{ 62 | logger.error("[数据采集]:[MONGODB]:[SELECT]:[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 63 | throw ex 64 | } 65 | } 66 | } 67 | 68 | 69 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 70 | 71 | try{ 72 | logger.info("[数据采集]:[MONGODB]:[WRITE]:表名:"+options.getOrElse("paths","")+","+options.mkString(",")) 73 | val url=options.getOrElse("url","") 74 | if(url.trim.equals("")){ 75 | throw new Exception("[zdh],MONGODB数据源输出:url为空") 76 | } 77 | var map=options.+("spark.mongodb.output.uri"->url) 78 | val paths=options.getOrElse("paths","").toString 79 | if(paths.trim.equals("")){ 80 | throw new Exception("[zdh],MONGODB数据源输出:paths为空") 81 | } 82 | map=map.+("spark.mongodb.output.collection"->paths) 83 | 84 | if(!sql.equals("")){ 85 | deleteJDBC(url,paths,sql) 86 | } 87 | 88 | var format="mongo" 89 | //合并小文件操作 90 | var df_tmp = merge(spark,df,options) 91 | 92 | df_tmp.write.format(format).mode(map.getOrElse("model","append")).options(map).save() 93 | }catch { 94 | case ex:Exception=>{ 95 | ex.printStackTrace() 96 | logger.error("[数据采集]:[MONGODB]:[WRITE]:[ERROR]:表名:"+options.getOrElse("paths","")+","+"[ERROR]:"+ex.getMessage.replace("\"","'")) 97 | throw ex 98 | } 99 | } 100 | 101 | } 102 | 103 | def deleteJDBC(url:String,collection:String,sql:String)(implicit dispatch_task_id: String): Unit ={ 104 | try{ 105 | 106 | logger.info("[数据采集]:[MONGODB]:[CLEAR]:url"+url+",collection:"+collection) 107 | 108 | val connectString=new MongoClientURI(url) 109 | 110 | val mongoClient=new MongoClient(connectString) 111 | 112 | import com.mongodb.client.model.Filters._ 113 | val dataBase=mongoClient.getDatabase(connectString.getDatabase) 114 | 115 | val parser=new CCJSqlParserManager(); 116 | val reader=new StringReader(sql); 117 | val list=new util.ArrayList[String](); 118 | val stmt=parser.parse(new StringReader(sql)); 119 | 120 | if(stmt.isInstanceOf[Delete]){ 121 | val whereExpr=stmt.asInstanceOf[Delete].getWhere 122 | whereExpr.toString match { 123 | 124 | case ex if (ex.contains(">=")) =>dataBase.getCollection(collection).deleteMany(gte(ex.split(">=")(0),ex.split(">=")(1))) 125 | case ex if (ex.contains("<=")) =>dataBase.getCollection(collection).deleteMany(lte(ex.split("<=")(0),ex.split("<=")(1))) 126 | case ex if (ex.contains(">")) =>dataBase.getCollection(collection).deleteMany(gt(ex.split(">")(0),ex.split(">")(1))) 127 | case ex if (ex.contains("<")) =>dataBase.getCollection(collection).deleteMany(lt(ex.split("<")(0),ex.split("<")(1))) 128 | case ex if (ex.contains("=")) =>dataBase.getCollection(collection).deleteMany(com.mongodb.client.model.Filters.eq(ex.split("=")(0),ex.split("=")(1))) 129 | } 130 | 131 | } 132 | }catch { 133 | case 
ex:Exception=>{ 134 | logger.info("[数据采集]:[MONGODB]:[CLEAR]:[ERROR]:url:"+url+",collection:"+collection+ex.getMessage.replace("\"","'")) 135 | ex.printStackTrace() 136 | } 137 | } 138 | 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/RedisDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.redislabs.provider.redis._ 4 | import com.zyc.base.util.JsonSchemaBuilder 5 | import com.zyc.zdh.ZdhDataSources 6 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 7 | import org.slf4j.LoggerFactory 8 | 9 | object RedisDataSources extends ZdhDataSources { 10 | 11 | val logger=LoggerFactory.getLogger(this.getClass) 12 | 13 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 14 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 15 | try{ 16 | logger.info("[数据采集]:输入源为[REDIS]") 17 | logger.info("[数据采集]:[REDIS]:[READ]:[paths]:"+inputOptions.getOrElse("paths", "")+"[其他参数]:"+inputOptions.mkString(",")+"[FILTER]:"+inputCondition) 18 | import spark.implicits._ 19 | 20 | //"redis://:yld@127.0.0.1:6379" 21 | val url = inputOptions.getOrElse("url", "") 22 | if(url.trim.equals("")){ 23 | throw new Exception("[zdh],redis数据源读取:url为空") 24 | } 25 | 26 | val password: String = inputOptions.getOrElse("password", "").toString 27 | //可以是表达式 也可以是具体的key 28 | val paths = inputOptions.getOrElse("paths", "") 29 | if(paths.trim.equals("")){ 30 | throw new Exception("[zdh],redis数据源读取:paths为空") 31 | } 32 | var host = url 33 | var port = "6379" 34 | if (url.contains(":")) { 35 | host = url.split(":")(0) 36 | port = url.split(":")(1) 37 | } 38 | 39 | 40 | val schema = JsonSchemaBuilder.getJsonSchema(inputCols.mkString(",")) 41 | 42 | //string,list,hash,set,table 43 | val dataType = inputOptions.getOrElse("data_type", "string").toLowerCase 44 | 45 | val redisConfig = new RedisConfig(new RedisEndpoint(host,port.toInt,password)) 46 | 47 | val df = dataType match { 48 | case "string" => spark.sparkContext.fromRedisKV(paths)(redisConfig).toDF(inputCols: _*) 49 | case "hash" => { 50 | spark.read 51 | .format("org.apache.spark.sql.redis") 52 | .option("host", host) 53 | .option("keys.pattern", paths) 54 | .option("port", port) 55 | .option("auth", password) 56 | .options(inputOptions) 57 | .schema(schema) 58 | .load() 59 | } 60 | case "list" => spark.sparkContext.fromRedisList(paths)(redisConfig).toDF(inputCols: _*) 61 | case "set" => spark.sparkContext.fromRedisSet(paths)(redisConfig).toDF(inputCols: _*) 62 | case "table"=>{ 63 | spark.read 64 | .format("org.apache.spark.sql.redis") 65 | .option("host", host) 66 | .option("table", paths) 67 | .option("port", port) 68 | .option("auth", password) 69 | .options(inputOptions) 70 | .schema(schema) 71 | .load() 72 | } 73 | } 74 | 75 | filter(spark,df,inputCondition,duplicateCols) 76 | }catch { 77 | case ex:Exception=>{ 78 | ex.printStackTrace() 79 | logger.error("[数据采集]:[REDIS]:[READ]:[ERROR]:"+ex.getMessage.replace("\"","'")) 80 | throw ex 81 | } 82 | } 83 | 84 | } 85 | 86 | 87 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = 
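// A minimal sketch of the inputOptions for the Redis read above with data_type "table", i.e. rows
// written earlier through the org.apache.spark.sql.redis format. Host, password and table name are
// hypothetical.
// val redisInputOptions = Map(
//   "url"       -> "127.0.0.1:6379",
//   "password"  -> "******",
//   "paths"     -> "t_user",    // table name, or a key pattern when data_type is "hash"
//   "data_type" -> "table"
// )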
{ 88 | logger.info("[数据采集]:[REDIS]:[SELECT]") 89 | logger.debug("[数据采集]:[REDIS]:[SELECT]:"+select.mkString(",")) 90 | if(select==null || select.isEmpty){ 91 | logger.debug("[数据采集]:[REDIS]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 92 | return df 93 | } 94 | df.select(select: _*) 95 | } 96 | 97 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 98 | try{ 99 | logger.info("[数据采集]:[REDIS]:[WRITE]:[options]:"+options.mkString(",")) 100 | import spark.implicits._ 101 | 102 | //"redis://:yld@127.0.0.1:6379" 103 | val url = options.getOrElse("url", "") 104 | if(url.trim.equals("")){ 105 | throw new Exception("[zdh],redis数据源读取:url为空") 106 | } 107 | 108 | val password: String = options.getOrElse("password", "").toString 109 | //可以是表达式 也可以是具体的key 110 | val paths = options.getOrElse("paths", "") 111 | if(paths.trim.equals("")){ 112 | throw new Exception("[zdh],redis数据源读取:paths为空") 113 | } 114 | var host = url 115 | var port = "6379" 116 | if (url.contains(":")) { 117 | host = url.split(":")(0) 118 | port = url.split(":")(1) 119 | } 120 | 121 | 122 | //string,list,hash,set,table 123 | val dataType = options.getOrElse("data_type", "string").toLowerCase 124 | 125 | val saveModel=options.getOrElse("model","append") 126 | 127 | val redisConfig = new RedisConfig(new RedisEndpoint(host,port.toInt,password)) 128 | 129 | //合并小文件操作 130 | var df_tmp = merge(spark,df,options) 131 | 132 | 133 | val redisRDD = dataType match { 134 | case "string" => spark.sparkContext.toRedisKV(df_tmp.map(row=>(row.getString(0),row.getString(1))).rdd) 135 | // case "hash" => { 136 | // df.map(row=>(row.getString(0),row.getString(1))) 137 | // 138 | // } 139 | // case "list" => spark.sparkContext.toRedisList(paths)(redisConfig).toDF(inputCols: _*) 140 | // case "set" => spark.sparkContext.fromRedisSet(paths)(redisConfig).toDF(inputCols: _*) 141 | case "table"=>{ 142 | df_tmp.write.format("org.apache.spark.sql.redis") 143 | .mode(saveModel) 144 | .option("host", host) 145 | .option("table", paths) 146 | .option("port", port) 147 | .option("auth", password) 148 | .options(options) 149 | .save() 150 | } 151 | } 152 | }catch { 153 | case ex:Exception=>{ 154 | ex.printStackTrace() 155 | logger.error("[数据采集]:[REDIS]:[WRITE]:[ERROR]:"+ex.getMessage.replace("\"","'")) 156 | throw ex 157 | } 158 | } 159 | 160 | 161 | } 162 | 163 | } 164 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/SFtpDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.base.util.JsonSchemaBuilder 4 | import com.zyc.zdh.ZdhDataSources 5 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 6 | import org.slf4j.LoggerFactory 7 | 8 | object SFtpDataSources extends ZdhDataSources { 9 | val logger = LoggerFactory.getLogger(this.getClass) 10 | 11 | 12 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 13 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], 14 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 15 | try { 16 | logger.info("[数据采集]:[SFTP]:匹配文件格式") 17 | var url=inputOptions.getOrElse("url","") 18 | var port="22" 19 | if(url.contains(":")){ 20 | port=url.split(":")(1) 21 | 
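// A minimal sketch of what the Redis writeDS above expects for data_type "string": a DataFrame
// whose first two columns are strings, used as the redis key and value. Column names, keys and the
// dispatch id are hypothetical.
// import spark.implicits._
// val kvDf = Seq(("user:1", "zhangsan"), ("user:2", "lisi")).toDF("key", "value")
// RedisDataSources.writeDS(spark, kvDf,
//   Map("url" -> "127.0.0.1:6379", "password" -> "******", "data_type" -> "string"), "")("demo_task_001")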
url=url.split(":")(0) 22 | } 23 | if(url.trim.equals("")){ 24 | throw new Exception("[zdh],ftp数据源读取:url为空") 25 | } 26 | 27 | val paths=inputOptions.getOrElse("paths","") 28 | 29 | if(paths.trim.equals("")){ 30 | throw new Exception("[zdh],ftp数据源读取:paths为空") 31 | } 32 | val sep=inputOptions.getOrElse("sep",",") 33 | 34 | val username=inputOptions.getOrElse("user","") 35 | val password=inputOptions.getOrElse("password","") 36 | 37 | val fileType=inputOptions.getOrElse("fileType", "csv").toString.toLowerCase 38 | 39 | val schema=JsonSchemaBuilder.getJsonSchema(inputCols.mkString(",")) 40 | logger.info("[数据采集]:[SFTP]:[READ]:paths:"+url+":"+port+paths) 41 | val df = spark.read. 42 | format("com.zyc.zdh.datasources.sftp.SftpSource"). 43 | schema(schema). 44 | options(inputOptions). 45 | option("inputCols",inputCols.mkString(",")). 46 | option("host", url). 47 | option("port",port). 48 | option("username", username). 49 | option("password",password). 50 | option("fileType", fileType). 51 | option("delimiter", sep). 52 | load(paths) 53 | 54 | filter(spark,df,inputCondition,duplicateCols) 55 | } catch { 56 | case ex: Exception => { 57 | ex.printStackTrace() 58 | logger.error("[数据采集]:[SFTP]:[READ]:[ERROR]:" + ex.getMessage.replace("\"","'")) 59 | throw ex 60 | } 61 | } 62 | } 63 | 64 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 65 | try{ 66 | logger.info("[数据采集]:[SFTP]:[SELECT]") 67 | logger.debug("[数据采集]:[SFTP]:[SELECT]:"+select.mkString(",")) 68 | if(select==null || select.isEmpty){ 69 | logger.debug("[数据采集]:[SFTP]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 70 | return df 71 | } 72 | df.select(select: _*) 73 | }catch { 74 | case ex:Exception=>{ 75 | logger.error("[数据采集]:[SFTP]:[SELECT]:[ERROR]"+ex.getMessage.replace("\"","'")) 76 | throw ex 77 | } 78 | } 79 | } 80 | 81 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 82 | try { 83 | logger.info("[数据采集]:[SFTP]:[WRITE]:[options]:"+options.mkString(",")) 84 | var url=options.getOrElse("url","") 85 | var port="22" 86 | if(url.contains(":")){ 87 | port=url.split(":")(1) 88 | url=url.split(":")(0) 89 | } 90 | 91 | val paths=options.getOrElse("paths","") 92 | val sep=options.getOrElse("sep",",") 93 | 94 | val username=options.getOrElse("user","") 95 | val password=options.getOrElse("password","") 96 | 97 | val filtType=options.getOrElse("fileType", "csv").toString.toLowerCase 98 | 99 | //合并小文件操作 100 | var df_tmp = merge(spark,df,options) 101 | 102 | df_tmp.write. 103 | format("com.springml.spark.sftp"). 104 | options(options). 105 | option("host", url). 106 | option("port",port). 107 | option("username", username). 108 | option("password",password). 109 | option("fileType", filtType). 110 | option("delimiter", sep). 
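// A minimal sketch of the options consumed by the SFTP writeDS above; the data is pushed to the
// remote host through the com.springml.spark.sftp connector. Host, credentials and target path are
// hypothetical.
// val sftpWriteOptions = Map(
//   "url"      -> "192.168.1.30:22",
//   "user"     -> "zdh",
//   "password" -> "******",
//   "paths"    -> "/data/export/t_user.csv",
//   "fileType" -> "csv",
//   "sep"      -> "|"
// )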
111 | save(paths) 112 | 113 | df_tmp 114 | 115 | } catch { 116 | case ex: Exception => { 117 | ex.printStackTrace() 118 | logger.error("[数据采集]:[SFTP]:[WRITE]:[ERROR]:" + ex.getMessage.replace("\"","'")) 119 | throw ex 120 | } 121 | } 122 | } 123 | 124 | 125 | } 126 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/SolrDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.zdh.ZdhDataSources 4 | import org.apache.spark.sql.{Column, DataFrame, SparkSession} 5 | import org.slf4j.LoggerFactory 6 | 7 | /** 8 | * 只支持集群,不支持单机 9 | */ 10 | object SolrDataSources extends ZdhDataSources { 11 | 12 | val logger=LoggerFactory.getLogger(this.getClass) 13 | 14 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 15 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 16 | 17 | logger.info("[数据采集]:[SOLR]:[READ]:其他参数:," + inputOptions.mkString(",") + " [FILTER]:" + inputCondition) 18 | 19 | import spark.implicits._ 20 | 21 | val zkUrl = inputOptions.getOrElse("url", "") 22 | 23 | val collection = inputOptions.getOrElse("paths", "") 24 | 25 | //参考 https://github.com/lucidworks/spark-solr 26 | val df = spark.read.format("solr") 27 | .options(inputOptions) 28 | .option("zkhost", zkUrl) 29 | .option("collection", collection) 30 | .load 31 | 32 | filter(spark,df,inputCondition,duplicateCols) 33 | 34 | } 35 | 36 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id: String): DataFrame = { 37 | try{ 38 | logger.info("[数据采集]:[SOLR]:[SELECT]") 39 | logger.debug("[数据采集]:[SOLR]:[SELECT]:"+select.mkString(",")) 40 | if(select==null || select.isEmpty){ 41 | logger.debug("[数据采集]:[SOLR]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 42 | return df 43 | } 44 | df.select(select: _*) 45 | }catch { 46 | case ex:Exception=>{ 47 | logger.error("[数据采集]:[SOLR]:[SELECT]:[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 48 | throw ex 49 | } 50 | } 51 | } 52 | 53 | override def writeDS(spark: SparkSession, df: DataFrame, options: Map[String, String], sql: String)(implicit dispatch_task_id: String): Unit = { 54 | logger.info("[数据采集]:[SOLR]:[WRITE]:") 55 | throw new Exception("[数据采集]:[SOLR]:[WRITE]:[ERROR]:不支持写入solr数据源") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/TidbDataSources.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.util.Properties 4 | 5 | import com.zyc.zdh.ZdhDataSources 6 | import org.apache.spark.sql.types.StructField 7 | import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession} 8 | import org.slf4j.LoggerFactory 9 | 10 | /** 11 | * 使用此数据源连接所有的jdbc数据,包括hive,mysql,oracle 等 12 | */ 13 | object TidbDataSources extends ZdhDataSources{ 14 | 15 | val source="TIDB" 16 | val logger=LoggerFactory.getLogger(this.getClass) 17 | 18 | /** 19 | * 获取数据源schema 20 | * 21 | * @param spark 22 | * @param options 23 | * @return 24 | */ 25 | override def getSchema(spark: SparkSession, options: Map[String,String])(implicit 
dispatch_task_id:String): Array[StructField] = { 26 | logger.info(s"[数据采集]:[${source}]:[SCHEMA]:"+options.mkString(",")) 27 | spark.read.format("jdbc").options(options).load().schema.fields 28 | } 29 | 30 | 31 | override def getDS(spark: SparkSession, dispatchOption: Map[String, Any], inPut: String, inputOptions: Map[String, String], 32 | inputCondition: String, inputCols: Array[String],duplicateCols:Array[String], outPut: String, outputOptionions: Map[String, String], 33 | outputCols: Array[Map[String, String]], sql: String)(implicit dispatch_task_id: String): DataFrame = { 34 | try{ 35 | logger.info("[数据采集]:输入源为[TIDB],开始匹配对应参数") 36 | 37 | // 检查spark.tispark.pd.addresses,spark.sql.extensions 38 | if(!spark.conf.getAll.contains("spark.tispark.pd.addresses") || !spark.conf.get("spark.sql.extensions") .equalsIgnoreCase("org.apache.spark.sql.TiExtensions")){ 39 | throw new Exception("[zdh],TIDB数据源读取:请设置spark.tispark.pd.addresses,并且set spark.sql.extensions=org.apache.spark.sql.TiExtensions,也可使用jdbc方式读取tidb") 40 | } 41 | 42 | val tableName=inputOptions.getOrElse("paths","").toString 43 | if(tableName.trim.equals("")|| !tableName.contains(".")){ 44 | throw new Exception("[数据采集]:[TIDB]:[READ]:paths参数为空,必须是database.tablename 格式") 45 | } 46 | 47 | logger.info("[数据采集]:[TIDB]:[READ]:表名:"+tableName+","+inputOptions.mkString(",")+" [FILTER]:"+inputCondition) 48 | 49 | 50 | val sql=s"select ${inputCols.mkString(",")} from ${tableName}"; 51 | 52 | logger.info(sql) 53 | var df:DataFrame=spark.sql(sql) 54 | 55 | filter(spark,df,inputCondition,duplicateCols) 56 | 57 | }catch { 58 | case ex:Exception=>{ 59 | logger.error("[数据采集]:[TIDB]:[READ]:表名:"+inputOptions.getOrElse("dbtable","")+"[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 60 | throw ex 61 | } 62 | } 63 | 64 | } 65 | 66 | /** 67 | * 读取数据源之后的字段映射 68 | * @param spark 69 | * @param df 70 | * @param select 71 | * @return 72 | */ 73 | override def process(spark: SparkSession, df: DataFrame, select: Array[Column],zdh_etl_date:String)(implicit dispatch_task_id:String): DataFrame = { 74 | try{ 75 | logger.info("[数据采集]:[TIDB]:[SELECT]") 76 | logger.debug("[数据采集]:[TIDB]:[SELECT]:"+select.mkString(",")) 77 | if(select==null || select.isEmpty){ 78 | logger.debug("[数据采集]:[TIDB]:[SELECT]:[智能识别字段]" +df.columns.mkString(",")) 79 | return df 80 | } 81 | df.select(select: _*) 82 | }catch { 83 | case ex:Exception=>{ 84 | logger.error("[数据采集]:[TIDB]:[SELECT]:[ERROR]:"+ex.getMessage.replace("\"","'"),"error") 85 | throw ex 86 | } 87 | } 88 | 89 | } 90 | 91 | 92 | override def writeDS(spark: SparkSession,df:DataFrame,options: Map[String,String], sql: String)(implicit dispatch_task_id:String): Unit = { 93 | try{ 94 | logger.info("[数据采集]:[TIDB]:[WRITE]:表名:"+options.getOrElse("dbtable","")+","+options.mkString(",")) 95 | var options_tmp=options 96 | val dbtable: String = options.getOrElse("paths", "").toString 97 | if(dbtable.trim.equals("")|| !dbtable.contains(".")){ 98 | throw new Exception("[数据采集]:[TIDB]:[WRITE]:paths参数为空,必须是database.tablename 格式") 99 | } 100 | 101 | val url=options.getOrElse("url","") 102 | if(url.trim.equals("")){ 103 | throw new Exception("[数据采集]:[TIDB]:[WRITE]:url参数为空") 104 | } 105 | var addr="127.0.0.1" 106 | var port="4000" 107 | if(url.contains(":")){ 108 | addr=url.split(":")(0) 109 | port=url.split(":")(1) 110 | }else{ 111 | addr=url 112 | } 113 | 114 | options_tmp=options_tmp.+("tidb.user"->options.getOrElse("user","root")) 115 | options_tmp=options_tmp.+("tidb.password"->options.getOrElse("password","")) 116 | 
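// A minimal sketch of the SparkSession settings the TIDB read path above checks for; without them
// getDS throws and suggests falling back to the plain jdbc reader. The PD address is hypothetical.
// import org.apache.spark.sql.SparkSession
// val tidbSpark = SparkSession.builder()
//   .appName("zdh_tidb_demo")
//   .config("spark.tispark.pd.addresses", "127.0.0.1:2379")
//   .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
//   .getOrCreate()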
options_tmp=options_tmp.+("table"->dbtable.split("\\.")(1)) 117 | options_tmp=options_tmp.+("tidb.addr"->addr) 118 | options_tmp=options_tmp.+("tidb.port"->port) 119 | options_tmp=options_tmp.+("database"->dbtable.split("\\.")(0)) 120 | 121 | if(!sql.equals("")){ 122 | deleteTIDB(spark,url,options,sql) 123 | } 124 | 125 | var format="tidb" 126 | // df.write. 127 | // format("tidb"). 128 | // option("tidb.user", "root"). 129 | // option("tidb.password", ""). 130 | // option("database", "tpch_test"). 131 | // option("table", "target_table_orders"). 132 | // mode("append"). 133 | // save() 134 | //合并小文件操作 135 | var df_tmp = merge(spark,df,options) 136 | df_tmp.write.format(format).mode(SaveMode.Append).options(options_tmp).save() 137 | 138 | }catch { 139 | case ex:Exception=>{ 140 | ex.printStackTrace() 141 | logger.info("[数据采集]:[TIDB]:[WRITE]:表名:"+options.getOrElse("dbtable","")+","+"[ERROR]:"+ex.getMessage.replace("\"","'")) 142 | throw ex 143 | } 144 | } 145 | 146 | } 147 | 148 | 149 | /** 150 | * 写入jdbc 之前 清空特定数据 151 | * 152 | * @param spark 153 | * @param url 154 | * @param options 155 | * @param sql 156 | */ 157 | def deleteTIDB(spark: SparkSession, url: String, options: Map[String,String], sql: String)(implicit dispatch_task_id:String): Unit = { 158 | val new_url="jdbc:mysql://"+url 159 | logger.info("[数据采集]:[TIDB]:[CLEAR]:url:"+new_url+","+options.mkString(",")+",sql:"+sql) 160 | import scala.collection.JavaConverters._ 161 | val properties=new Properties() 162 | properties.putAll(options.asJava) 163 | var driver = properties.getProperty("driver", "com.mysql.cj.jdbc.Driver") 164 | if (driver.equals("")) { 165 | driver = getDriver(new_url) 166 | } 167 | Class.forName(driver) 168 | var cn: java.sql.Connection = null 169 | var ps: java.sql.PreparedStatement = null 170 | try { 171 | cn = java.sql.DriverManager.getConnection(new_url, properties) 172 | ps = cn.prepareStatement(sql) 173 | ps.execute() 174 | ps.close() 175 | cn.close() 176 | } 177 | catch { 178 | case ex: Exception => { 179 | ps.close() 180 | cn.close() 181 | if(ex.getMessage.replace("\"","'").contains("doesn't exist") || ex.getMessage.replace("\"","'").contains("Unknown table")){ 182 | logger.warn("[数据采集]:[TIDB]:[CLEAR]:[WARN]:"+ex.getMessage.replace("\"","'")) 183 | }else{ 184 | throw ex 185 | } 186 | } 187 | } 188 | } 189 | 190 | 191 | def getDriver(url: String): String = { 192 | 193 | url match { 194 | case u if u.toLowerCase.contains("jdbc:mysql") => "com.mysql.cj.jdbc.Driver" 195 | case _ => "" 196 | } 197 | 198 | 199 | } 200 | 201 | } 202 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/http/HttpOptions.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources.http 2 | 3 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 4 | 5 | class HttpOptions( 6 | @transient val parameters: CaseInsensitiveMap[String]) 7 | extends Serializable { 8 | 9 | import HttpOptions._ 10 | 11 | def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) 12 | 13 | require( 14 | parameters.get(HTTP_URL).isDefined, 15 | s"Option '$HTTP_URL' is required. 
" + 16 | s"Option '$HTTP_URL' is not null.") 17 | 18 | def getSchema(): String = { 19 | parameters.getOrElse(SCHEMA, "value").toString 20 | } 21 | 22 | def getHttp_Url: String = { 23 | if (!parameters.get(HTTP_URL).get.endsWith("/") && !parameters.get(PATHS).get.startsWith("/")) 24 | parameters.get(HTTP_URL).get + "/" + parameters.get(PATHS).get 25 | else 26 | parameters.get(HTTP_URL).get + parameters.get(PATHS).get 27 | } 28 | 29 | def getMethod(): String ={ 30 | parameters.getOrElse(METHOD,"get").toString.toLowerCase 31 | } 32 | 33 | def getTimeOut():Long={ 34 | parameters.getOrElse(TIME_OUT,"5000").toLong 35 | } 36 | 37 | def getFileType(): String ={ 38 | parameters.getOrElse(FILETYPE,"json").toString.toLowerCase 39 | } 40 | def getSep(): String ={ 41 | parameters.getOrElse(SEP,",").toString 42 | } 43 | 44 | def getResultColumn(): String ={ 45 | parameters.getOrElse(RESULT_COLUMN, "").toString 46 | } 47 | 48 | } 49 | 50 | object HttpOptions { 51 | 52 | val HTTP_URL = "url" 53 | val SCHEMA = "schema" 54 | val PATHS = "paths" 55 | val METHOD="method" 56 | val TIME_OUT="time_out" 57 | val FILETYPE="fileType" 58 | val SEP="sep" 59 | val RESULT_COLUMN="result_column" 60 | 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/http/HttpRelation.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources.http 2 | 3 | import java.sql.Timestamp 4 | import java.util.concurrent.TimeUnit 5 | 6 | import org.apache.http.NameValuePair 7 | import org.apache.http.client.entity.UrlEncodedFormEntity 8 | import org.apache.http.client.methods.{HttpDelete, HttpGet, HttpPost, HttpPut} 9 | import org.apache.http.client.utils.URIBuilder 10 | import org.apache.http.entity.StringEntity 11 | import org.apache.http.impl.client.{CloseableHttpClient, HttpClientBuilder} 12 | import org.apache.http.message.BasicNameValuePair 13 | import org.apache.http.util.EntityUtils 14 | import org.apache.spark.Partition 15 | import org.apache.spark.rdd.RDD 16 | import org.apache.spark.sql.functions.col 17 | import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} 18 | import org.apache.spark.sql.types.StructType 19 | import org.apache.spark.sql.{Row, SQLContext, SparkSession} 20 | import org.json4s.{CustomSerializer, DefaultFormats, JLong} 21 | 22 | 23 | case class HttpRelation( 24 | override val schema: StructType, 25 | parts: Array[Partition], 26 | httpOptions: HttpOptions)(@transient val sparkSession: SparkSession) 27 | extends BaseRelation 28 | with PrunedFilteredScan { 29 | case object TimestampSerializer extends CustomSerializer[java.sql.Timestamp](format => ( { 30 | case _ => null 31 | }, { 32 | case ts: Timestamp =>JLong(ts.getTime) 33 | }) 34 | ) 35 | implicit val formats = DefaultFormats 36 | 37 | override def sqlContext: SQLContext =sparkSession.sqlContext 38 | 39 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { 40 | 41 | import sparkSession.implicits._ 42 | //println(httpOptions.getHttp_Url) 43 | //println(httpOptions.getSep()) 44 | //连接http 获取数据 45 | var http_result=requestUrl(httpOptions.getHttp_Url,httpOptions.parameters) 46 | import org.json4s.jackson.Serialization.read 47 | import org.json4s.jackson.Serialization.write 48 | 49 | if(!httpOptions.getResultColumn().isEmpty){ 50 | var result = read[Map[String,Any]](http_result) 51 | var tmp:Any = null 52 | var tmp2:AnyRef = null 53 | val columns = 
httpOptions.getResultColumn().split("\\.") 54 | Seq.range(0, columns.size).foreach(index=>{ 55 | if(index == 0){ 56 | tmp = result.get(columns(index)).get 57 | }else{ 58 | result = tmp.asInstanceOf[Map[String,Any]] 59 | tmp = result.get(columns(index)).get 60 | } 61 | tmp2 = tmp.asInstanceOf[AnyRef] 62 | }) 63 | http_result = write(tmp2) 64 | } 65 | 66 | if(httpOptions.getFileType().toLowerCase.equals("csv")){ 67 | val sep=httpOptions.getSep() 68 | val ncols = requiredColumns.zipWithIndex.map(f => col("value").getItem(f._2) as f._1) 69 | val result=sparkSession.sparkContext.parallelize(http_result.split("\n").toSeq) 70 | .map(f=>f.split(sep)).toDF("value") 71 | .select(ncols:_*) 72 | result.rdd 73 | }else{ 74 | sparkSession.read.schema(schema).options(httpOptions.parameters).json(Seq(http_result).toDS()).rdd 75 | } 76 | 77 | } 78 | 79 | def requestUrl(url: String, params: Map[String, String]):String = { 80 | 81 | httpOptions.getMethod() match { 82 | case "get"=>get(url,params.toSeq) 83 | case "post"=>post(url,params.toSeq) 84 | case "delete"=>delete(url,params.toSeq) 85 | case "put"=>put(url,params.toSeq) 86 | case _=>get(url,params.toSeq) 87 | } 88 | } 89 | 90 | /** 91 | * 超时时间 单位:毫秒 92 | */ 93 | def HttpClient(): CloseableHttpClient ={ 94 | val httpClient:CloseableHttpClient = HttpClientBuilder.create() 95 | .setConnectionTimeToLive(httpOptions.getTimeOut(), TimeUnit.MILLISECONDS) 96 | .build() 97 | httpClient 98 | } 99 | 100 | /** 101 | * 102 | * @param addr 接口地址 103 | * @param param 请求参数 104 | * @return 105 | */ 106 | def get(addr:String,param: Seq[(String,String)]):String={ 107 | val builder=new URIBuilder(addr) 108 | if(param.nonEmpty){ 109 | param.foreach(r=>{ 110 | if(!r._1.startsWith("header.")){ 111 | builder.addParameter(r._1,r._2) 112 | } 113 | }) 114 | } 115 | val client=HttpClient() 116 | val httpGet = new HttpGet(builder.build()) 117 | if(param.nonEmpty){ 118 | param.foreach(r=>{ 119 | if(r._1.startsWith("header.")){ 120 | httpGet.setHeader(r._1.substring(7),r._2) 121 | } 122 | }) 123 | } 124 | val httpResponse = client.execute(httpGet) 125 | val entity = httpResponse.getEntity() 126 | var content = "" 127 | if (entity != null) { 128 | content=EntityUtils.toString(entity) 129 | } 130 | client.close() 131 | content 132 | } 133 | 134 | def put(addr:String,param: Seq[(String,String)]):String={ 135 | val builder=new URIBuilder(addr) 136 | if(param.nonEmpty){ 137 | param.foreach(r=>{ 138 | if(!r._1.startsWith("header.")){ 139 | builder.addParameter(r._1,r._2) 140 | } 141 | }) 142 | } 143 | val client=HttpClient() 144 | val httpPut = new HttpPut(builder.build()) 145 | if(param.nonEmpty){ 146 | param.foreach(r=>{ 147 | if(r._1.startsWith("header.")){ 148 | httpPut.setHeader(r._1.substring(7),r._2) 149 | } 150 | }) 151 | } 152 | val httpResponse = client.execute(httpPut) 153 | val entity = httpResponse.getEntity() 154 | var content = "" 155 | if (entity != null) { 156 | content=EntityUtils.toString(entity) 157 | } 158 | client.close() 159 | content 160 | } 161 | 162 | def delete(addr:String,param: Seq[(String,String)]):String={ 163 | val builder=new URIBuilder(addr) 164 | if(param.nonEmpty){ 165 | param.foreach(r=>{ 166 | if(!r._1.startsWith("header.")){ 167 | builder.addParameter(r._1,r._2) 168 | } 169 | }) 170 | } 171 | val client=HttpClient() 172 | val httpDelete = new HttpDelete(builder.build()) 173 | if(param.nonEmpty){ 174 | param.foreach(r=>{ 175 | if(r._1.startsWith("header.")){ 176 | httpDelete.setHeader(r._1.substring(7),r._2) 177 | } 178 | }) 179 | } 180 | val 
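// A minimal sketch of how result_column is applied above: for a hypothetical response such as
// {"code":0,"data":{"rows":[{"id":1,"name":"a"}]}}, setting result_column to "data.rows" walks the
// nested maps and re-serializes only the rows array before it is parsed with the supplied schema.
// val httpOptionsExample = new HttpOptions(Map(
//   "url"           -> "http://127.0.0.1:8080",
//   "paths"         -> "api/users",
//   "schema"        -> "id,name",
//   "result_column" -> "data.rows"
// ))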
httpResponse = client.execute(httpDelete) 181 | val entity = httpResponse.getEntity() 182 | var content = "" 183 | if (entity != null) { 184 | content=EntityUtils.toString(entity) 185 | } 186 | client.close() 187 | content 188 | } 189 | 190 | def post(addr:String,param: Seq[(String,String)]):String={ 191 | val req=new HttpPost(addr) 192 | 193 | import org.json4s.jackson.Serialization.write 194 | implicit val formats = org.json4s.DefaultFormats 195 | //import scala.collection.JavaConverters._ 196 | //val entity=new UrlEncodedFormEntity(listParms.toList.asJava,"utf-8") 197 | val entity = new StringEntity(write(param.toMap)) 198 | req.setEntity(entity) 199 | val client=HttpClient() 200 | 201 | if(param.nonEmpty){ 202 | param.foreach(r=>{ 203 | if(r._1.startsWith("header.")){ 204 | req.setHeader(r._1.substring(7),r._2) 205 | } 206 | }) 207 | } 208 | 209 | val httpResponse = client.execute(req) 210 | val resEntity = httpResponse.getEntity() 211 | var content = "" 212 | if (resEntity != null) { 213 | content=EntityUtils.toString(resEntity) 214 | } 215 | client.close() 216 | content 217 | } 218 | 219 | } 220 | 221 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/http/HttpRelationProvider.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources.http 2 | 3 | import com.zyc.base.util.JsonSchemaBuilder 4 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 5 | import org.apache.spark.sql.sources.{CreatableRelationProvider, DataSourceRegister, RelationProvider} 6 | 7 | class HttpRelationProvider extends CreatableRelationProvider 8 | with RelationProvider with DataSourceRegister{ 9 | 10 | override def shortName() = "http" 11 | 12 | override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame):HttpRelation = { 13 | 14 | //println("http参数1:"+parameters.mkString(",")) 15 | createRelation(sqlContext,parameters) 16 | } 17 | 18 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]):HttpRelation = { 19 | import sqlContext.implicits._ 20 | //println("http参数2:"+parameters.mkString(",")) 21 | val httpOptions = new HttpOptions(parameters) 22 | val schema=JsonSchemaBuilder.getJsonSchema(httpOptions.getSchema()) 23 | val parts = null 24 | HttpRelation(schema, parts, httpOptions)(sqlContext.sparkSession) 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/sftp/DeleteTempFileShutdownHook.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources.sftp 2 | 3 | import org.apache.commons.io.FileUtils 4 | import java.io.File 5 | import org.apache.log4j.Logger 6 | 7 | /** 8 | * Delete the temp file created during spark shutdown 9 | */ 10 | class DeleteTempFileShutdownHook( 11 | fileLocation: String) extends Thread { 12 | 13 | private val logger = Logger.getLogger(classOf[SftpRelation]) 14 | 15 | override def run(): Unit = { 16 | logger.info("Deleting " + fileLocation ) 17 | FileUtils.deleteQuietly(new File(fileLocation)) 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/sftp/SftpRelation.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources.sftp 2 | 3 | import 
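// A minimal sketch of the "header." convention used by the get/put/delete/post helpers above: keys
// prefixed with "header." are sent as HTTP headers, the remaining keys become query parameters
// (or, for post, the JSON request body). The token value is hypothetical.
// val httpParams = Seq(
//   "header.Authorization" -> "Bearer demo-token",   // sent as the Authorization header
//   "header.Content-Type"  -> "application/json",
//   "page"                 -> "1"                    // sent as the query parameter ?page=1
// )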
org.apache.log4j.Logger 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.functions.col 6 | import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext} 7 | import org.apache.spark.sql.sources.{BaseRelation, TableScan} 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * Abstract relation class for reading data from file 12 | */ 13 | case class SftpRelation( 14 | fileLocation: String, 15 | fileType: String, 16 | inferSchema: String, 17 | header: String, 18 | delimiter: String, 19 | quote: String, 20 | escape: String, 21 | multiLine: String, 22 | rowTag: String, 23 | customSchema: StructType, 24 | parameters:Map[String,String], 25 | sqlContext: SQLContext) extends BaseRelation with TableScan { 26 | 27 | private val logger = Logger.getLogger(classOf[SftpRelation]) 28 | 29 | val df = read() 30 | 31 | private def read(): DataFrame = { 32 | import sqlContext.implicits._ 33 | var dataframeReader = sqlContext.read 34 | if (customSchema != null) { 35 | dataframeReader = dataframeReader.schema(customSchema) 36 | } 37 | 38 | var df: DataFrame = null 39 | 40 | df = fileType match { 41 | case "avro" => dataframeReader.format("avro").options(parameters).load(fileLocation) 42 | case "txt" => dataframeReader.format("text").options(parameters).load(fileLocation) 43 | case "xml" => dataframeReader.format(constants.xmlClass) 44 | .option(constants.xmlRowTag, rowTag) 45 | .load(fileLocation) 46 | case "csv" => { 47 | val sep_tmp=resolveSep(parameters.getOrElse("delimiter",",")) 48 | var ds=dataframeReader.format("csv").options(parameters).option("sep",",").load(fileLocation) 49 | var columns=Array.empty[Column] 50 | 51 | var ds1=ds.map(f => f.mkString(",").split(sep_tmp)).toDF("value") 52 | if(header.equalsIgnoreCase("true")){ 53 | columns=ds.columns.mkString(",").split(sep_tmp).zipWithIndex.map(f => col("value").getItem(f._2) as f._1) 54 | ds1=ds1.select(columns:_*) 55 | }else{ 56 | val inputCols=parameters.getOrElse("inputCols","") 57 | logger.info("[sftp:csv]:"+inputCols) 58 | if(!inputCols.trim.equals("")){ 59 | columns=inputCols.split(",").zipWithIndex.map(f => col("value").getItem(f._2) as f._1) 60 | } 61 | ds1=ds1.select(columns:_*) 62 | } 63 | ds1 64 | } 65 | case "excel"=>dataframeReader.format("com.crealytics.spark.excel").options(parameters).load(fileLocation) 66 | case _ => dataframeReader.format(fileType).load(fileLocation) 67 | } 68 | df 69 | } 70 | 71 | override def schema: StructType = { 72 | df.schema 73 | } 74 | 75 | override def buildScan(): RDD[Row] = { 76 | df.rdd 77 | } 78 | 79 | def resolveSep(sep:String): String ={ 80 | var sep_tmp = sep.replace("\\", "\\\\") 81 | if (sep_tmp.contains('$')) { 82 | sep_tmp = sep_tmp.replace("$", "\\$") 83 | } 84 | if (sep_tmp.contains('(') || sep_tmp.contains(')')) { 85 | sep_tmp = sep_tmp.replace("(", "\\(").replace(")", "\\)") 86 | } 87 | if (sep_tmp.contains('*')) { 88 | sep_tmp = sep_tmp.replace("*", "\\*") 89 | } 90 | if (sep_tmp.contains('+')) { 91 | sep_tmp = sep_tmp.replace("+", "\\+") 92 | } 93 | if (sep_tmp.contains('-')) { 94 | sep_tmp = sep_tmp.replace("-", "\\-") 95 | } 96 | if (sep_tmp.contains('[') || sep_tmp.contains(']')) { 97 | sep_tmp = sep_tmp.replace("[", "\\[").replace("]", "\\]") 98 | } 99 | if (sep_tmp.contains('{') || sep_tmp.contains('}')) { 100 | sep_tmp = sep_tmp.replace("{", "\\{").replace("}", "\\}") 101 | } 102 | if (sep_tmp.contains('^')) { 103 | sep_tmp = sep_tmp.replace("^", "\\^") 104 | } 105 | if (sep_tmp.contains('|')) { 106 | sep_tmp = sep_tmp.replace("|", "\\|") 107 | } 108 | 
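// A minimal sketch of why resolveSep escapes regex metacharacters: String.split takes a regular
// expression, so a raw "|" would be treated as alternation and split between every character.
// resolveSep("|")                   // returns "\\|"
// "a|b|c".split(resolveSep("|"))    // Array("a", "b", "c")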
sep_tmp 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/sftp/constants.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources.sftp 2 | 3 | /** 4 | * Created by bagopalan on 9/16/18. 5 | */ 6 | object constants { 7 | 8 | val xmlClass: String = "com.databricks.spark.xml" 9 | val xmlRowTag: String = "rowTag" 10 | val xmlRootTag: String = "rootTag" 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/com/zyc/zdh/datasources/sftp/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources.sftp.util 2 | 3 | import org.apache.spark.sql.DataFrameWriter 4 | 5 | 6 | object Utils { 7 | 8 | 9 | /** 10 | * [[DataFrameWriter]] implicits 11 | */ 12 | implicit class ImplicitDataFrameWriter[T](dataFrameWriter: DataFrameWriter[T]) { 13 | 14 | /** 15 | * Adds an output option for the underlying data source if the option has a value. 16 | */ 17 | def optionNoNull(key: String, optionValue: Option[String]): DataFrameWriter[T] = { 18 | optionValue match { 19 | case Some(_) => dataFrameWriter.option(key, optionValue.get) 20 | case None => dataFrameWriter 21 | } 22 | } 23 | } 24 | 25 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/clickhouse/ClickHouseDialect.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.clickhouse 19 | 20 | import java.sql.Types 21 | 22 | import org.apache.spark.sql.jdbc.JdbcDialect 23 | import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder} 24 | 25 | private case object ClickHouseDialect extends JdbcDialect { 26 | 27 | override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") 28 | 29 | override def getCatalystType( 30 | sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { 31 | if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { 32 | // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as 33 | // byte arrays instead of longs. 
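// A minimal sketch of what this dialect produces for a hypothetical table, plus how it would be
// wired in: Spark only consults it after org.apache.spark.sql.jdbc.JdbcDialects.registerDialect(
// ClickHouseDialect), which here can only be called from inside this package because the object is
// declared private.
// ClickHouseDialect.quoteIdentifier("user_name")            // "user_name" (left unquoted)
// ClickHouseDialect.getTableExistsQuery("demo_db.t_user")   // "SELECT 1 FROM demo_db.t_user LIMIT 1"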
34 | md.putLong("binarylong", 1) 35 | Option(LongType) 36 | } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { 37 | Option(BooleanType) 38 | } else None 39 | } 40 | 41 | override def quoteIdentifier(colName: String): String = { 42 | s"$colName" 43 | } 44 | 45 | override def getTableExistsQuery(table: String): String = { 46 | s"SELECT 1 FROM $table LIMIT 1" 47 | } 48 | 49 | override def isCascadingTruncateTable(): Option[Boolean] = Some(false) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/clickhouse/ClickHouseRelationProvider.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.execution.datasources.clickhouse 2 | 3 | import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode} 4 | import org.apache.spark.sql.execution.datasources.clickhouse.ClickHouseUtils._ 5 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} 6 | 7 | class ClickHouseRelationProvider extends CreatableRelationProvider 8 | with RelationProvider with DataSourceRegister{ 9 | override def shortName() = "clickhouse_jdbc" 10 | 11 | 12 | override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], 13 | df: DataFrame) = { 14 | val options = new ClickHouseOptionsInWrite(parameters) 15 | val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis 16 | 17 | val conn = ClickHouseUtils.createConnectionFactory(options)() 18 | try { 19 | val tableExists = ClickHouseUtils.tableExists(conn, options) 20 | if (tableExists) { 21 | mode match { 22 | case SaveMode.Overwrite => 23 | if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { 24 | // In this case, we should truncate table and then load. 25 | truncateTable(conn, options) 26 | val tableSchema = ClickHouseUtils.getSchemaOption(conn, options) 27 | saveTable(df, tableSchema, isCaseSensitive, options) 28 | } else { 29 | // Otherwise, do not truncate the table, instead drop and recreate it 30 | dropTable(conn, options.table, options) 31 | createTable(conn, df, options) 32 | saveTable(df, Some(df.schema), isCaseSensitive, options) 33 | } 34 | 35 | case SaveMode.Append => 36 | val tableSchema = ClickHouseUtils.getSchemaOption(conn, options) 37 | saveTable(df, tableSchema, isCaseSensitive, options) 38 | 39 | case SaveMode.ErrorIfExists => 40 | throw new AnalysisException( 41 | s"Table or view '${options.table}' already exists. " + 42 | s"SaveMode: ErrorIfExists.") 43 | 44 | case SaveMode.Ignore => 45 | // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected 46 | // to not save the contents of the DataFrame and to not change the existing data. 47 | // Therefore, it is okay to do nothing here and then just return the relation below. 
48 | } 49 | } else { 50 | createTable(conn, df, options) 51 | saveTable(df, Some(df.schema), isCaseSensitive, options) 52 | } 53 | } finally { 54 | conn.close() 55 | } 56 | 57 | createRelation(sqlContext, parameters) 58 | } 59 | 60 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]):BaseRelation = { 61 | 62 | import sqlContext.implicits._ 63 | val hiveOptions = new ClickHouseOptions(parameters) 64 | val resolver = sqlContext.conf.resolver 65 | val timeZoneId = sqlContext.conf.sessionLocalTimeZone 66 | val schema = HiveRelation.getSchema(resolver, hiveOptions) 67 | val parts = HiveRelation.columnPartition(schema, resolver, timeZoneId, hiveOptions) 68 | ClickHouseRelation(schema, parts, hiveOptions)(sqlContext.sparkSession) 69 | 70 | } 71 | 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/clickhouse/DriverRegistry.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.execution.datasources.clickhouse 2 | 3 | import java.sql.{Driver, DriverManager} 4 | 5 | import scala.collection.mutable 6 | 7 | import org.apache.spark.internal.Logging 8 | import org.apache.spark.util.Utils 9 | 10 | /** 11 | * java.sql.DriverManager is always loaded by bootstrap classloader, 12 | * so it can't load JDBC drivers accessible by Spark ClassLoader. 13 | * 14 | * To solve the problem, drivers from user-supplied jars are wrapped into thin wrapper. 15 | */ 16 | object DriverRegistry extends Logging { 17 | 18 | /** 19 | * Load DriverManager first to avoid any race condition between 20 | * DriverManager static initialization block and specific driver class's 21 | * static initialization block. e.g. PhoenixDriver 22 | */ 23 | DriverManager.getDrivers 24 | 25 | private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty 26 | 27 | def register(className: String): Unit = { 28 | val cls = Utils.getContextOrSparkClassLoader.loadClass(className) 29 | if (cls.getClassLoader == null) { 30 | logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required") 31 | } else if (wrapperMap.get(className).isDefined) { 32 | logTrace(s"Wrapper for $className already exists") 33 | } else { 34 | synchronized { 35 | if (wrapperMap.get(className).isEmpty) { 36 | val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver]) 37 | DriverManager.registerDriver(wrapper) 38 | wrapperMap(className) = wrapper 39 | logTrace(s"Wrapper for $className registered") 40 | } 41 | } 42 | } 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/clickhouse/DriverWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.clickhouse 19 | 20 | import java.sql.{Connection, Driver, DriverPropertyInfo, SQLFeatureNotSupportedException} 21 | import java.util.Properties 22 | 23 | /** 24 | * A wrapper for a JDBC Driver to work around SPARK-6913. 25 | * 26 | * The problem is in `java.sql.DriverManager` class that can't access drivers loaded by 27 | * Spark ClassLoader. 28 | */ 29 | class DriverWrapper(val wrapped: Driver) extends Driver { 30 | override def acceptsURL(url: String): Boolean = wrapped.acceptsURL(url) 31 | 32 | override def jdbcCompliant(): Boolean = wrapped.jdbcCompliant() 33 | 34 | override def getPropertyInfo(url: String, info: Properties): Array[DriverPropertyInfo] = { 35 | wrapped.getPropertyInfo(url, info) 36 | } 37 | 38 | override def getMinorVersion: Int = wrapped.getMinorVersion 39 | 40 | def getParentLogger: java.util.logging.Logger = { 41 | throw new SQLFeatureNotSupportedException( 42 | s"${this.getClass.getName}.getParentLogger is not yet implemented.") 43 | } 44 | 45 | override def connect(url: String, info: Properties): Connection = wrapped.connect(url, info) 46 | 47 | override def getMajorVersion: Int = wrapped.getMajorVersion 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/hive/DriverRegistry.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.execution.datasources.hive 2 | 3 | import java.sql.{Driver, DriverManager} 4 | 5 | import scala.collection.mutable 6 | 7 | import org.apache.spark.internal.Logging 8 | import org.apache.spark.util.Utils 9 | 10 | /** 11 | * java.sql.DriverManager is always loaded by bootstrap classloader, 12 | * so it can't load JDBC drivers accessible by Spark ClassLoader. 13 | * 14 | * To solve the problem, drivers from user-supplied jars are wrapped into thin wrapper. 15 | */ 16 | object DriverRegistry extends Logging { 17 | 18 | /** 19 | * Load DriverManager first to avoid any race condition between 20 | * DriverManager static initialization block and specific driver class's 21 | * static initialization block. e.g. 
PhoenixDriver 22 | */ 23 | DriverManager.getDrivers 24 | 25 | private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty 26 | 27 | def register(className: String): Unit = { 28 | val cls = Utils.getContextOrSparkClassLoader.loadClass(className) 29 | if (cls.getClassLoader == null) { 30 | logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required") 31 | } else if (wrapperMap.get(className).isDefined) { 32 | logTrace(s"Wrapper for $className already exists") 33 | } else { 34 | synchronized { 35 | if (wrapperMap.get(className).isEmpty) { 36 | val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver]) 37 | DriverManager.registerDriver(wrapper) 38 | wrapperMap(className) = wrapper 39 | logTrace(s"Wrapper for $className registered") 40 | } 41 | } 42 | } 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/hive/DriverWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.hive 19 | 20 | import java.sql.{Connection, Driver, DriverPropertyInfo, SQLFeatureNotSupportedException} 21 | import java.util.Properties 22 | 23 | /** 24 | * A wrapper for a JDBC Driver to work around SPARK-6913. 25 | * 26 | * The problem is in `java.sql.DriverManager` class that can't access drivers loaded by 27 | * Spark ClassLoader. 28 | */ 29 | class DriverWrapper(val wrapped: Driver) extends Driver { 30 | override def acceptsURL(url: String): Boolean = wrapped.acceptsURL(url) 31 | 32 | override def jdbcCompliant(): Boolean = wrapped.jdbcCompliant() 33 | 34 | override def getPropertyInfo(url: String, info: Properties): Array[DriverPropertyInfo] = { 35 | wrapped.getPropertyInfo(url, info) 36 | } 37 | 38 | override def getMinorVersion: Int = wrapped.getMinorVersion 39 | 40 | def getParentLogger: java.util.logging.Logger = { 41 | throw new SQLFeatureNotSupportedException( 42 | s"${this.getClass.getName}.getParentLogger is not yet implemented.") 43 | } 44 | 45 | override def connect(url: String, info: Properties): Connection = wrapped.connect(url, info) 46 | 47 | override def getMajorVersion: Int = wrapped.getMajorVersion 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/hive/HiveDialect.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.hive 19 | 20 | import java.sql.Types 21 | 22 | import org.apache.spark.sql.jdbc.JdbcDialect 23 | import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder} 24 | 25 | private case object HiveDialect extends JdbcDialect { 26 | 27 | override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") 28 | 29 | override def getCatalystType( 30 | sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { 31 | if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { 32 | // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as 33 | // byte arrays instead of longs. 34 | md.putLong("binarylong", 1) 35 | Option(LongType) 36 | } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { 37 | Option(BooleanType) 38 | } else None 39 | } 40 | 41 | override def quoteIdentifier(colName: String): String = { 42 | s"$colName" 43 | } 44 | 45 | override def getTableExistsQuery(table: String): String = { 46 | s"SELECT 1 FROM $table LIMIT 1" 47 | } 48 | 49 | override def isCascadingTruncateTable(): Option[Boolean] = Some(false) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/hive/HiveRelationProvider.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.execution.datasources.hive 2 | 3 | import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode} 4 | import org.apache.spark.sql.execution.datasources.hive.HiveUtils._ 5 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} 6 | 7 | class HiveRelationProvider extends CreatableRelationProvider 8 | with RelationProvider with DataSourceRegister{ 9 | override def shortName() = "hive_jdbc" 10 | 11 | 12 | override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], 13 | df: DataFrame) = { 14 | val options = new HiveOptionsInWrite(parameters) 15 | val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis 16 | 17 | val conn = HiveUtils.createConnectionFactory(options)() 18 | try { 19 | val tableExists = HiveUtils.tableExists(conn, options) 20 | if (tableExists) { 21 | mode match { 22 | case SaveMode.Overwrite => 23 | if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { 24 | // In this case, we should truncate table and then load. 
25 | truncateTable(conn, options) 26 | val tableSchema = HiveUtils.getSchemaOption(conn, options) 27 | saveTable(df, tableSchema, isCaseSensitive, options) 28 | } else { 29 | // Otherwise, do not truncate the table, instead drop and recreate it 30 | dropTable(conn, options.table, options) 31 | createTable(conn, df, options) 32 | saveTable(df, Some(df.schema), isCaseSensitive, options) 33 | } 34 | 35 | case SaveMode.Append => 36 | val tableSchema = HiveUtils.getSchemaOption(conn, options) 37 | saveTable(df, tableSchema, isCaseSensitive, options) 38 | 39 | case SaveMode.ErrorIfExists => 40 | throw new AnalysisException( 41 | s"Table or view '${options.table}' already exists. " + 42 | s"SaveMode: ErrorIfExists.") 43 | 44 | case SaveMode.Ignore => 45 | // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected 46 | // to not save the contents of the DataFrame and to not change the existing data. 47 | // Therefore, it is okay to do nothing here and then just return the relation below. 48 | } 49 | } else { 50 | createTable(conn, df, options) 51 | saveTable(df, Some(df.schema), isCaseSensitive, options) 52 | } 53 | } finally { 54 | conn.close() 55 | } 56 | 57 | createRelation(sqlContext, parameters) 58 | } 59 | 60 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]):BaseRelation = { 61 | 62 | import sqlContext.implicits._ 63 | val hiveOptions = new HiveOptions(parameters) 64 | val resolver = sqlContext.conf.resolver 65 | val timeZoneId = sqlContext.conf.sessionLocalTimeZone 66 | val schema = HiveRelation.getSchema(resolver, hiveOptions) 67 | val parts = HiveRelation.columnPartition(schema, resolver, timeZoneId, hiveOptions) 68 | HiveRelation(schema, parts, hiveOptions)(sqlContext.sparkSession) 69 | 70 | } 71 | 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/test/resources/datasources.propertites: -------------------------------------------------------------------------------- 1 | enable=true 2 | #url=jdbc:mysql://10.136.1.43/airflow?autoReconnect=true&failoverreadonly=false 3 | url=jdbc:mysql://127.0.0.1:3306/mydb?serverTimezone=GMT%2B8&useSSL=true 4 | driver=com.mysql.cj.jdbc.Driver 5 | username=zyc 6 | password=123456 -------------------------------------------------------------------------------- /src/test/resources/rules/rules.drl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaoyachao/zdh_server/15797e7577f4b73bea0d8db7a8a8710966a049c7/src/test/resources/rules/rules.drl -------------------------------------------------------------------------------- /src/test/scala/com/zyc/AppTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc 2 | 3 | import org.junit._ 4 | import Assert._ 5 | 6 | @Test 7 | class AppTest { 8 | 9 | @Test 10 | def testOK() = assertTrue(true) 11 | 12 | // @Test 13 | // def testKO() = assertTrue(false) 14 | 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/TEST_TRAIT2.scala: -------------------------------------------------------------------------------- 1 | package com.zyc 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | trait TEST_TRAIT2 { 7 | val spark = getSparkSession() 8 | 9 | val date_s = "20191108" 10 | 11 | 12 | def getSparkSession(): SparkSession = { 13 | 14 | val sparkConf = new SparkConf() 15 | 
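// Local test configuration: a small local[5] session with Kryo serialization, dynamic partition overwrite and Hive support enabled below.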
sparkConf.setMaster("local[5]") 16 | sparkConf.setAppName("Spark demo") 17 | sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 18 | sparkConf.set("spark.executor.instances", "13") 19 | sparkConf.set("spark.executor.cores", "3") 20 | sparkConf.set("spark.repartition.min.num", "2") 21 | sparkConf.set("spark.repartition.normal.num", "5") 22 | sparkConf.set("spark.repartition.big.num", "10") 23 | sparkConf.set("spark.repartition.huge.num", "50") 24 | sparkConf.set("spark.sql.orc.impl", "native") 25 | sparkConf.set("spark.sql.orc.enableVectorizedReader", "true") 26 | sparkConf.set("spark.sql.crossJoin.enabled", "true") 27 | sparkConf.set("spark.sql.shuffle.partitions", "20") 28 | sparkConf.set("spark.default.parallelism", "3") 29 | //sparkConf.set("spark.sql.codegen.maxFields", "1000") 30 | //sparkConf.set("spark.sql.codegen.fallback", "true") 31 | //sparkConf.set("spark.sql.codegen.hugeMethodLimit",(65535*2).toString) 32 | sparkConf.set("spark.sql.sources.partitionOverwriteMode","dynamic") 33 | // sparkConf.set("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/log4j.properties") 34 | // sparkConf.set("spark.sql.extensions","org.apache.spark.sql.TiExtensions") 35 | // sparkConf.set("spark.tispark.pd.addresses","192.168.110.10:2379") 36 | // sparkConf.registerKryoClasses(Array(classOf[Hold])) 37 | val sparkSession = SparkSession 38 | .builder() 39 | .appName("Spark Zdh Report") 40 | .config(sparkConf) 41 | .enableHiveSupport() 42 | .getOrCreate() 43 | //sparkSession.sparkContext.setLogLevel("INFO") 44 | sparkSession 45 | } 46 | 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/CassandraDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import com.zyc.zdh.datasources.CassandraDataSources 5 | import org.junit.Test 6 | @Test 7 | class CassandraDataSourcesTest extends TEST_TRAIT2{ 8 | @Test 9 | def testGetDS { 10 | 11 | val inputOptions=Map( 12 | "url"->"localhost:9042", 13 | "paths"->"ks_test.tb1" 14 | ) 15 | CassandraDataSources.getDS(spark,null,null,inputOptions,null,null,null,null,null,null,null)("").show() 16 | //spark.conf.set("spark.cassandra.connection.host","localhost:9042") 17 | 18 | 19 | // spark.range(0,100).select(col("id"),col("id") as "name",lit("man") as "sex") 20 | // .write 21 | // .format("org.apache.spark.sql.cassandra") 22 | // .mode("overwrite") 23 | // .options(Map( "table" -> "tb1","keyspace"->"ks_test")) 24 | // .option("confirm.truncate","true") 25 | // .save() 26 | 27 | 28 | // val df = spark 29 | // .read 30 | // .format("org.apache.spark.sql.cassandra") 31 | // .options(Map( "table" -> "tb1","keyspace"->"ks_test")) 32 | // .load() 33 | // 34 | // df.show() 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/HbaseDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import com.zyc.zdh.datasources.HbaseDataSources 5 | import org.junit.Test 6 | //import org.apache.hadoop.hbase.spark.datasources.HBaseTableCatalog 7 | import org.apache.spark.sql.functions._ 8 | import org.scalatest.FunSuite 9 | 10 | @Test 11 | class HbaseDataSourcesTest extends TEST_TRAIT2{ 12 | @Test 13 | def testGetDS { 14 | 15 | val table="t1" 16 | val 
cols=Array("cf1:name","cf1:age") 17 | val map=Map("url"->"127.0.0.1","paths"->"t1") 18 | HbaseDataSources.getDS(spark,null,"",map,"0,1",cols,null,null,null,Array.empty[Map[String,String]],"")("1").show(false) 19 | 20 | } 21 | @Test 22 | def testHbaseSHC { 23 | 24 | val table="t1" 25 | val cols=Array("cf1:name") 26 | // HbaseDataSources.hbaseSHC(spark,table,"",null,cols,"rowkey=002")("1") 27 | 28 | 29 | val colStr = cols.map(col => { 30 | val cf = col.split(":")(0) 31 | val colName = col.split(":")(1) 32 | s"""|"$colName":{"cf":"$cf","col":"$colName","type":"string"} """ 33 | }).mkString(",") 34 | val catalog = 35 | s"""{ 36 | |"table":{"namespace":"default", "name":"$table"}, 37 | |"rowkey":"key", 38 | |"columns":{ 39 | |"rowkey":{"cf":"rowkey", "col":"key", "type":"string"}, 40 | ${colStr} 41 | |} 42 | |}""".stripMargin 43 | 44 | // spark.read 45 | // .options(Map(HBaseTableCatalog.tableCatalog -> catalog)) 46 | // .format("org.apache.spark.sql.execution.datasources.hbase") 47 | // .load() 48 | // .show() 49 | 50 | 51 | 52 | } 53 | @Test 54 | def testHbaseNerdammer { 55 | 56 | val table="t1" 57 | val cols=Array("cf1:name") 58 | // // HbaseDataSources.hbaseNerdammer(spark,table,"",null,cols,"rowkey=002")("1") 59 | // 60 | // import it.nerdammer.spark.hbase._ 61 | // 62 | // import spark.implicits._ 63 | // 64 | // spark.conf.set("spark.hbase.host", "192.168.65.10") //e.g. 192.168.1.1 or localhost or your hostanme 65 | // 66 | // // For Example If you have an HBase Table as 'Document' with ColumnFamily 'SMPL' and qualifier as 'DocID, Title' then: 67 | // 68 | // val docRdd = spark.sparkContext.hbaseTable[(Option[String], Option[String])](table) 69 | // .select("cf1:name","cf1:name") 70 | // .withStartRow("0") 71 | // .withStopRow("0") 72 | // docRdd.map(f=>f._1.get).toDF().show() 73 | 74 | } 75 | @Test 76 | def testGetDS2 { 77 | 78 | val table="t1" 79 | val cols=Array("cf1:name","cf2:age","cf1:sex") 80 | val map=Map("url"->"192.168.65.10:2181") 81 | // HbaseDataSources.getDS(spark,table,map,cols,"0,1")("1").show(false) 82 | 83 | } 84 | @Test 85 | def loadHFile { 86 | val options=Map("paths"->"t1") 87 | 88 | 89 | val df=spark.range(0,100).select(concat(col("id"),lit("a")) as "row_key",lit("1a") as "cf1:index",lit("zyc") as "cf1:name",lit("zhaoyachao") as "cf2:user") 90 | 91 | HbaseDataSources.writeHFile(spark,df,options)("001") 92 | 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/KafKaDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import com.zyc.zdh.datasources.KafKaDataSources 5 | import org.apache.spark.sql.functions._ 6 | import org.junit.Test 7 | import org.scalatest.FunSuite 8 | @Test 9 | class KafKaDataSourcesTest extends TEST_TRAIT2{ 10 | @Test 11 | def testGetDS { 12 | val outputOptions=Map( "url"->"jdbc:mysql://127.0.0.1:3306/mydb?serverTimezone=GMT%2B8", 13 | "driver"->"com.mysql.cj.jdbc.Driver", 14 | "dbtable"->"z10", 15 | "user"->"zyc", 16 | "password"->"123456") 17 | spark.sparkContext.setLogLevel("error") 18 | KafKaDataSources.createKafkaDataSources(spark,"localhost:9092","topic1","g1",Map("msgType"->"csv"),Array("name","age"),"",null,outputOptions,"","")("001") 19 | 20 | 21 | while (true){ 22 | Thread.sleep(10000) 23 | print("============") 24 | } 25 | } 26 | @Test 27 | def writeKafka{ 28 | 29 | val outputOptions=Map( "url"->"localhost:9092", 30 | "paths"->"t1", 31 | 
"dbtable"->"z10", 32 | "user"->"zyc", 33 | "password"->"123456") 34 | 35 | val df=spark.range(0,10).select(col("id") as "key",col("id") as "value") 36 | KafKaDataSources.writeDS(spark,df,outputOptions,"")("001") 37 | 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/MongoDBDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import com.zyc.zdh.datasources.MongoDBDataSources 5 | import org.junit.Test 6 | import org.scalatest.FunSuite 7 | @Test 8 | class MongoDBDataSourcesTest extends TEST_TRAIT2{ 9 | @Test 10 | def testGetDS { 11 | 12 | val dispatch=null 13 | val inputOptions=Map("url"->"mongodb://localhost:27017/admin","paths"->"zyc") 14 | val df=MongoDBDataSources.getDS(spark,dispatch,"mongodb",inputOptions,null,null,null,null,null,null, 15 | "delete from t1 where item='canvas'")("001").drop("_id") 16 | 17 | df.show(false) 18 | 19 | val outputOptions=Map("url"->"mongodb://localhost:27017/admin","paths"->"zyc1","model"->"append") 20 | val df1=MongoDBDataSources.writeDS(spark,df,outputOptions,"delete from t1 where item='canvas'")("001") 21 | // val readConfig=ReadConfig(Map("uri"->"mongodb://localhost:27017","database"->"admin","collection"->"zyc")) 22 | // MongoSpark.loadAndInferSchema(spark,readConfig).show(false) 23 | // val struct=JsonSchemaBuilder.getJsonSchema("_id,item,qty,size.h,size.w,size.uom,tags") 24 | // val outputOptions=Map("spark.mongodb.output.uri"->"mongodb://localhost:27017/admin","spark.mongodb.output.collection"->"zyc1") 25 | // df.drop("_id").write.format("mongo").mode(SaveMode.Append).options(outputOptions).save() 26 | 27 | 28 | 29 | 30 | 31 | 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/RedisDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import com.zyc.zdh.datasources.RedisDataSources 5 | import org.junit.Test 6 | import org.scalatest.FunSuite 7 | @Test 8 | class RedisDataSourcesTest extends TEST_TRAIT2{ 9 | @Test 10 | def testGetDS { 11 | import spark.implicits._ 12 | val options=Map("url"->"127.0.0.1:6379", 13 | "paths"->"persion", 14 | "data_type"->"hash", 15 | "password"->"yld" 16 | ) 17 | val inputCols=Array("name","age") 18 | // spark.conf.set("spark.redis.host", "localhost") 19 | // spark.conf.set("spark.redis.port", "6379") 20 | // spark.conf.set("spark.redis.auth", "yld") 21 | 22 | //RedisDataSources.getDS(spark,null,null,options,null,inputCols,null,null,null,null)("").show() 23 | 24 | 25 | val options2=Map("url"->"127.0.0.1:6379", 26 | "paths"->"persion", 27 | "data_type"->"table", 28 | "password"->"yld", 29 | "key.column"->"id" 30 | ) 31 | 32 | val df=Seq((1,"zyc",20),(2,"abc",30)).toDF("id","name","age") 33 | RedisDataSources.writeDS(spark,df,options2,"")("") 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/ESDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import org.apache.spark.sql.functions._ 5 | import org.junit.Test 6 | import org.scalatest.FunSuite 7 | @Test 8 | class ESDataSourcesTest extends TEST_TRAIT2{ 9 | 10 | @Test 11 
| def testGetDS { 12 | val options=Map("url"->"localhost:9200", 13 | "paths"->"persion3" 14 | ) 15 | 16 | ESDataSources.getDS(spark,null,null,options,null,null,null,null,null,null,null)("001").show(false) 17 | 18 | 19 | } 20 | 21 | @Test 22 | def testWriteDS{ 23 | val df=spark.range(0,10).select(col("id"),concat(lit("name"),col("id")) as "name",lit(25) as "age") 24 | val options=Map("url"->"localhost:9200", 25 | "paths"->"persion3" 26 | // "es.write.operation"->"update" 27 | ) 28 | 29 | ESDataSources.writeDS(spark,df,options,"")("001") 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/FlumeDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import org.junit.Test 5 | import org.scalatest.FunSuite 6 | @Test 7 | class FlumeDataSourcesTest extends TEST_TRAIT2{ 8 | 9 | @Test 10 | def testGetDS { 11 | 12 | val inputOptions=Map("url"->"localhost:9999") 13 | val inputCols=Array("name","age") 14 | 15 | val output="jdbc" 16 | 17 | val outputOptions=Map( "url"->"jdbc:mysql://127.0.0.1:3306/mydb?serverTimezone=GMT%2B8", 18 | "driver"->"com.mysql.cj.jdbc.Driver", 19 | "dbtable"->"flume_t1", 20 | "user"->"zyc", 21 | "password"->"123456") 22 | 23 | 24 | 25 | FlumeDataSources.getDS(spark,null,"",inputOptions,"",inputCols,null,output,outputOptions,null,null)("001") 26 | 27 | while(true){ 28 | 29 | } 30 | 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/FtpDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import org.junit.Test 5 | import org.scalatest.FunSuite 6 | @Test 7 | class FtpDataSourcesTest extends TEST_TRAIT2{ 8 | 9 | @Test 10 | def testGetDS { 11 | val inputOptions=Map("sep"->"|", 12 | "url"->"10.136.1.217", 13 | "paths"->"/app/zyc/zyc.txt", 14 | "user"->"zyc", 15 | "password"->"123456", 16 | "header"->"false" 17 | ) 18 | val inputCols=Array("name","sex","age") 19 | val df=SFtpDataSources.getDS(spark,null,"ftp",inputOptions,"",inputCols,null,null,null,null,null)("") 20 | df.show() 21 | val options=Map("sep"->",", 22 | "url"->"10.136.1.217", 23 | "paths"->"/app/zyc/zyc1.txt", 24 | "user"->"zyc", 25 | "password"->"123456", 26 | "header"->"false" 27 | ) 28 | 29 | SFtpDataSources.writeDS(spark,df,options,"")("") 30 | 31 | } 32 | 33 | @Test 34 | def testGetDSCSV { 35 | val inputOptions=Map("sep"->"|", 36 | "url"->"192.168.110.10", 37 | "paths"->"/a.txt", 38 | "user"->"zyc", 39 | "sep"-> "|", 40 | "password"->"123456", 41 | "header"->"false", 42 | "fileType"->"csv" 43 | ) 44 | val inputCols=Array("name","age") 45 | val df=FtpDataSources.getDS(spark,null,"ftp",inputOptions,"",inputCols,null,null,null,null,null)("") 46 | df.show() 47 | 48 | 49 | } 50 | 51 | @Test 52 | def testWriteDSCSV { 53 | val inputOptions=Map("sep"->"|", 54 | "url"->"192.168.110.10", 55 | "paths"->"/a.txt", 56 | "user"->"zyc", 57 | "sep"-> "|", 58 | "password"->"123456", 59 | "header"->"false", 60 | "fileType"->"csv" 61 | ) 62 | val outputOptions=Map( 63 | "url"->"192.168.110.10", 64 | "paths"->"/a1.txt", 65 | "user"->"zyc", 66 | "sep"-> "||", 67 | "password"->"123456", 68 | "header"->"false", 69 | "fileType"->"csv" 70 | ) 71 | val inputCols=Array("name","age") 72 | val 
df=FtpDataSources.getDS(spark,null,"ftp",inputOptions,"",inputCols,null,null,null,null,null)("") 73 | 74 | FtpDataSources.writeDS(spark,df,outputOptions,null)("") 75 | 76 | 77 | } 78 | 79 | @Test 80 | def testGetDSJSON { 81 | val inputOptions=Map("sep"->"|", 82 | "url"->"192.168.110.10", 83 | "paths"->"b.json", 84 | "user"->"zyc", 85 | "sep"-> "|", 86 | "password"->"123456", 87 | "header"->"false", 88 | "fileType"->"json" 89 | ) 90 | val inputCols=Array("name","age") 91 | val df=FtpDataSources.getDS(spark,null,"ftp",inputOptions,"",inputCols,null,null,null,null,null)("") 92 | df.show() 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/GreenplumDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import org.junit.Test 5 | 6 | 7 | class GreenplumDataSourcesTest extends TEST_TRAIT2 { 8 | 9 | @Test 10 | def getDS(): Unit = { 11 | implicit val id="001" 12 | val options =Map( 13 | "url"->"jdbc:postgresql://192.168.110.10:5432/postgres", 14 | //"dbschema"-> "public", 15 | "dbtable"-> "t1", 16 | "user"-> "zyc", 17 | "password"-> "123456") 18 | val df=spark.read.format("greenplum") 19 | .options(options).load() 20 | 21 | df.show(false) 22 | 23 | } 24 | } -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/HdfsDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import java.util 4 | 5 | import com.zyc.TEST_TRAIT2 6 | import org.apache.hudi.DataSourceWriteOptions 7 | import org.apache.hudi.config.HoodieWriteConfig 8 | import org.apache.spark.sql.{Row, RowFactory, SaveMode} 9 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 10 | import org.junit.Test 11 | import org.scalatest.FunSuite 12 | 13 | import scala.collection.JavaConversions 14 | 15 | @Test 16 | class HdfsDataSourcesTest extends TEST_TRAIT2{ 17 | 18 | @Test 19 | def testWriteDS { 20 | 21 | val dt=Seq( 22 | Row("1","a", Row("zhaoyachao","man")), 23 | Row("2","b", Row("zhaoyachao","man")) 24 | ) 25 | val schema = StructType(Seq(StructField("id", StringType, nullable = true), StructField("tag", StringType, nullable = true), 26 | StructField("other", StructType(Seq(StructField("name", StringType, nullable = true),StructField("sex", StringType, nullable = true))), nullable = true) 27 | )) 28 | import spark.implicits._ 29 | 30 | val df =spark.createDataFrame(spark.sparkContext.parallelize(dt), schema) 31 | 32 | val options = Map( 33 | "precombine_field_opt_key"->"id", 34 | "recordkey_field_opt_key"->"id" 35 | 36 | ) 37 | HdfsDataSources.writeDS(spark, df, "hudi", "/data/hudi/t11", SaveMode.Overwrite, options, "")("001") 38 | 39 | } 40 | 41 | @Test 42 | def testWriteDS2 { 43 | 44 | val dt=Seq( 45 | Row("1","a", "zhaoyachao","man"), 46 | Row("2","b", "zhaoyachao","man") 47 | ) 48 | val schema = StructType(Seq(StructField("id", StringType, nullable = true), StructField("tag", StringType, nullable = true), 49 | StructField("name", StringType, nullable = true),StructField("sex", StringType, nullable = true) 50 | )) 51 | import spark.implicits._ 52 | 53 | val df =spark.createDataFrame(spark.sparkContext.parallelize(dt), schema) 54 | 55 | var options = Map( 56 | "precombine_field_opt_key"->"id", 57 | "recordkey_field_opt_key"->"id" 58 | 59 
| ) 60 | val path="/data/hudi/t13" 61 | val basePath=path.substring(0,path.lastIndexOf("/")) 62 | val tableName=path.substring(path.lastIndexOf("/")+1) 63 | // TABLENAME 64 | val cols=df.columns 65 | if(options.getOrElse("precombine_field_opt_key","").toString.equalsIgnoreCase("") && !cols.contains("ts")){ 66 | throw new Exception("[Data Acquisition]:[HDFS]:[WRITE]:[ERROR]: a primary key must be specified when writing Hudi data files; please set the precombine_field_opt_key primary-key field in the ETL task") 67 | } 68 | if(options.getOrElse("recordkey_field_opt_key","").toString.equalsIgnoreCase("") && !cols.contains("uuid")){ 69 | throw new Exception("[Data Acquisition]:[HDFS]:[WRITE]:[ERROR]: a primary key must be specified when writing Hudi data files; please set the recordkey_field_opt_key primary-key field in the ETL task") 70 | } 71 | val recordkey_field_opt_key=options.getOrElse("recordkey_field_opt_key","") 72 | val precombine_field_opt_key=options.getOrElse("precombine_field_opt_key","") 73 | val operation_opt_key=options.getOrElse("operation_opt_key","upsert") 74 | 75 | options=options.+( HoodieWriteConfig.TABLE_NAME->tableName).+( 76 | DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY->precombine_field_opt_key, 77 | DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY->recordkey_field_opt_key, 78 | DataSourceWriteOptions.OPERATION_OPT_KEY->operation_opt_key) 79 | 80 | df.write.format("hudi").mode(SaveMode.Overwrite).options(options).save(path) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/IcebergDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.iceberg 5 | import org.apache.iceberg.PartitionSpec 6 | import org.apache.iceberg.catalog.{Namespace, TableIdentifier} 7 | import org.apache.iceberg.hadoop.HadoopCatalog 8 | import org.apache.iceberg.types.Types 9 | import org.junit.Test 10 | import org.scalatest.FunSuite 11 | @Test 12 | class IcebergDataSourcesTest { 13 | 14 | @Test 15 | def testWriteDS { 16 | 17 | val config = new Configuration() 18 | val catalog = new HadoopCatalog(config,"/data/iceberg"); 19 | val schema=new iceberg.Schema( 20 | Types.NestedField.required(1, "id", Types.IntegerType.get()), 21 | Types.NestedField.required(2, "user_name", Types.StringType.get()), 22 | Types.NestedField.required(3, "user_password", Types.StringType.get()), 23 | Types.NestedField.required(4, "eamil", Types.StringType.get()), 24 | Types.NestedField.required(5, "is_use_email", Types.StringType.get()), 25 | Types.NestedField.required(6, "phone", Types.StringType.get()), 26 | Types.NestedField.required(7, "is_use_phone", Types.StringType.get()) 27 | ) 28 | var name: TableIdentifier = TableIdentifier.of(Namespace.of("test"),"account_info") 29 | if(!catalog.tableExists(name)){ 30 | var spec:PartitionSpec=PartitionSpec.unpartitioned() 31 | 32 | if(!catalog.namespaceExists(Namespace.of("test"))){ 33 | catalog.createNamespace(Namespace.of("test")) 34 | } 35 | catalog.createTable(name, schema, spec) 36 | } 37 | 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/JdbcDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import org.junit.Test 5 | import org.scalatest.FunSuite 6 | @Test 7 | class JdbcDataSourcesTest extends TEST_TRAIT2{ 8 | 9 | @Test 10 | def testGetDS { 11 | 12 | implicit val id="001" 13 | val
dispatchOption = null 14 | val inPut = "jdbc" 15 | val inputOptions=Map( 16 | "driver"->"com.github.housepower.jdbc.ClickHouseDriver", 17 | "url"->"jdbc:clickhouse://192.168.110.10:9000", 18 | "dbtable"->"datasets.z1", 19 | "user"->"default", 20 | "password"->"", 21 | "numPartitions"->"1", 22 | "isolationLevel"->"NONE" 23 | ) 24 | val df=JdbcDataSources.getDS(spark,dispatchOption,inPut,inputOptions,"",null,null,null,null,null,null ) 25 | 26 | df.show(false) 27 | 28 | // var pro = new java.util.Properties 29 | // pro.put("driver","com.github.housepower.jdbc.ClickHouseDriver") 30 | // val format="org.apache.spark.sql.hive_jdbc.datasources.clickhouse.ClickHouseRelationProvider" 31 | // df.write.format(format).mode("append").option("batchsize", "50000").option("isolationLevel", "NONE").option("numPartitions", "1").options(inputOptions) 32 | // .save() 33 | 34 | 35 | } 36 | @Test 37 | def testWrteDS { 38 | implicit val id="001" 39 | val dispatchOption = null 40 | val inPut = "jdbc" 41 | val inputOptions=Map( 42 | "driver"->"com.github.housepower.jdbc.ClickHouseDriver", 43 | "url"->"jdbc:clickhouse://192.168.110.10:9000", 44 | "dbtable"->"datasets.z1", 45 | "user"->"default", 46 | "password"->"", 47 | "numPartitions"->"1", 48 | "isolationLevel"->"NONE" 49 | ) 50 | import org.apache.spark.sql.functions._ 51 | val df=spark.range(10).select(concat(lit("zhaoyachao"),col("id")) as "name",lit(123) as "age",lit("woman") as "sex",lit(100.5f).cast("float") as "money") 52 | JdbcDataSources.writeDS(spark,df,inputOptions,"alter table datasets.z1 delete where 1=1") 53 | } 54 | 55 | @Test 56 | def getGreenplum{ 57 | 58 | implicit val id="001" 59 | val dispatchOption = null 60 | val inPut = "jdbc" 61 | val inputOptions=Map( 62 | "driver"->"com.pivotal.jdbc.GreenplumDriver", 63 | "url"->"jdbc:pivotal:greenplum://192.168.110.10:5432;DatabaseName=postgres", 64 | "dbtable"->"t1", 65 | "user"->"zyc", 66 | "password"->"123456", 67 | "numPartitions"->"1", 68 | "isolationLevel"->"NONE" 69 | ) 70 | import org.apache.spark.sql.functions._ 71 | val dt=spark.range(10).select(col("id")) 72 | //JdbcDataSources.writeDS(spark,dt,inputOptions,"") 73 | 74 | val df=JdbcDataSources.getDS(spark,dispatchOption,inPut,inputOptions,"",null,null,null,null,null,null ) 75 | 76 | df.show(false) 77 | 78 | } 79 | @Test 80 | def getGreenplum2 { 81 | 82 | implicit val id="001" 83 | val dispatchOption = null 84 | val inPut = "jdbc" 85 | val options =Map( 86 | "url"->"jdbc:postgresql://192.168.110.10:5432/postgres", 87 | "delimiter"-> "\t", 88 | "dbschema"-> "public", 89 | "dbtable"-> "t1", 90 | "user"-> "zyc", 91 | "password"-> "123456") 92 | val df=spark.read.format("greenplum") 93 | .options(options).load() 94 | 95 | df.show(false) 96 | 97 | } 98 | 99 | @Test 100 | def testUpdateJDBC { 101 | 102 | val opt=Map( 103 | "driver"->"com.mysql.cj.jdbc.Driver", 104 | "url"->"jdbc:mysql://127.0.0.1:3306/mydb?serverTimezone=GMT%2B8", 105 | "dbtable"->"task_log_instance", 106 | "user"->"zyc", 107 | "password"->"123456" 108 | ) 109 | //JdbcDataSources.updateJDBC(spark,null,opt,"")("001") 110 | 111 | } 112 | 113 | 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/KuduDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import org.apache.kudu.client.CreateTableOptions 5 | import org.apache.kudu.spark.kudu.KuduContext 6 | import 
org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 8 | import org.junit.Test 9 | import org.scalatest.FunSuite 10 | @Test 11 | class KuduDataSourcesTest extends TEST_TRAIT2{ 12 | 13 | @Test 14 | def testGetDS { 15 | val inputOptions=Map("url"->"192.168.65.10:7051","paths"->"k1") 16 | 17 | val kuduContext = new KuduContext(inputOptions.getOrElse("url",""), spark.sparkContext) 18 | 19 | val kuduTableSchema = StructType( 20 | StructField("name", StringType, false) :: 21 | StructField("sex", StringType, true) :: 22 | StructField("age", IntegerType, true) :: Nil) 23 | val kuduTableOptions = new CreateTableOptions() 24 | import scala.collection.JavaConverters._ 25 | kuduTableOptions.setRangePartitionColumns(List("name").asJava).setNumReplicas(1); 26 | val kuduPrimaryKey = Seq("name") 27 | if(!kuduContext.tableExists(inputOptions.getOrElse("paths",""))){ 28 | kuduContext.createTable(inputOptions.getOrElse("paths","").toString, kuduTableSchema,kuduPrimaryKey,kuduTableOptions) 29 | } 30 | 31 | val df=KuduDataSources.getDS(spark,null,"kudu",inputOptions,null,null,null,null,null,null,null)("") 32 | 33 | df.show() 34 | 35 | } 36 | @Test 37 | def writeDS { 38 | val options=Map("url"->"192.168.65.10:7051","paths"->"k2") 39 | 40 | val kuduContext = new KuduContext(options.getOrElse("url",""), spark.sparkContext) 41 | 42 | val df =spark.range(0,100).select(col("id").cast("string") as "name",col("id").cast("string") as "sex", 43 | col("id").cast("int") as "age") 44 | 45 | KuduDataSources.writeDS(spark,df,options,"")("") 46 | 47 | 48 | 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/scala/com/zyc/zdh/datasources/TidbDataSourcesTest.scala: -------------------------------------------------------------------------------- 1 | package com.zyc.zdh.datasources 2 | 3 | import com.zyc.TEST_TRAIT2 4 | import org.junit.Test 5 | import org.scalatest.FunSuite 6 | @Test 7 | class TidbDataSourcesTest extends TEST_TRAIT2{ 8 | 9 | @Test 10 | def getDS { 11 | 12 | var tiDBOptions=Map[String,String]("tidb.user"->"root", 13 | "tidb.password"->"", 14 | "tidb.addr"->"192.168.110.10", 15 | "tidb.port" -> "4000" 16 | ) 17 | 18 | //spark.conf.set("spark.sql.extensions","org.apache.spark.sql.TiExtensions") 19 | //spark.conf.set("spark.tispark.pd.addresses","192.168.110.10:2379") 20 | val df=spark.sql("select * from d1.t1") 21 | 22 | // val df=spark.read.format("tidb") 23 | // .options(tiDBOptions) 24 | // .option("database","d1") 25 | // .option("table","t1") 26 | // .load() 27 | 28 | df.show() 29 | 30 | df.write. 31 | format("tidb"). 32 | option("tidb.user", "root"). 33 | option("tidb.password", ""). 34 | option("database", "d1"). 35 | option("table", "t2"). 36 | options(tiDBOptions). 37 | mode("overwrite"). 38 | save() 39 | 40 | } 41 | 42 | @Test 43 | def jdbc { 44 | 45 | var tiDBOptions=Map[String,String]("tidb.user"->"root", 46 | "tidb.password"->"", 47 | "tidb.addr"->"192.168.110.10", 48 | "tidb.port" -> "4000" 49 | ) 50 | val inputOptions=Map( 51 | "driver"->"com.mysql.cj.jdbc.Driver", 52 | "url"->"jdbc:mysql://192.168.110.10:4000", 53 | "dbtable"->"d1.t1", 54 | "user"->"root", 55 | "password"->"", 56 | "numPartitions"->"1", 57 | "isolationLevel"->"NONE", 58 | "useSSL"->"false" 59 | ) 60 | 61 | val df=spark.read.format("jdbc").options(inputOptions).load() 62 | 63 | df.write. 64 | format("jdbc"). 65 | options(inputOptions). 66 | option("dbtable","d1.t3"). 67 | mode("overwrite"). 
68 | save() 69 | 70 | } 71 | } 72 | --------------------------------------------------------------------------------
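For reference, the relation providers above expose the short names clickhouse_jdbc and hive_jdbc through DataSourceRegister. A minimal read sketch, assuming the compiled provider is on the Spark classpath and that ClickHouseOptions accepts the same connection keys exercised in JdbcDataSourcesTest; the endpoint, table and credentials below are placeholders, not values guaranteed by this repository:

// Sketch only: read through the custom ClickHouse relation provider.
// If the short name is not registered via META-INF/services, pass the fully
// qualified class name org.apache.spark.sql.execution.datasources.clickhouse.ClickHouseRelationProvider to format() instead.
val clickhouseOptions = Map(
  "driver" -> "com.github.housepower.jdbc.ClickHouseDriver",
  "url" -> "jdbc:clickhouse://127.0.0.1:9000",
  "dbtable" -> "datasets.z1",
  "user" -> "default",
  "password" -> ""
)
val df = spark.read.format("clickhouse_jdbc").options(clickhouseOptions).load()
df.show(false)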