├── .gitignore ├── .gitlab-ci.yml ├── README.md ├── assembly.xml ├── bin ├── profile.sh └── sql_runner.sh ├── docs ├── DataQuality_PartitionRule.md ├── Data_Check.md ├── External_Sources.md ├── Index_Column.md ├── Trouble_Shooting.md ├── UDF.md └── images │ ├── architecture.png │ ├── dq2_bollinger_model.png │ ├── dq2_ewma_model.png │ ├── dq2_row_number.png │ ├── dq_bollinger_model.png │ ├── dq_ewma_model.png │ └── dq_row_number.png ├── pom.xml ├── scalastyle-config.xml └── src ├── main ├── java │ └── one │ │ └── profiler │ │ ├── AsyncProfiler.java │ │ ├── AsyncProfilerMXBean.java │ │ ├── Counter.java │ │ ├── Events.java │ │ └── ProfileAgent.java ├── resources │ ├── log4j.properties │ └── metrics.properties_template └── scala │ └── org │ └── apache │ └── spark │ └── sql │ ├── SqlRunnerCatalogEventListener.scala │ ├── execution │ └── datasources │ │ ├── jdbc │ │ ├── JDBCCatalog.scala │ │ ├── JDBCDataWriter.scala │ │ ├── JDBCScanBuilder.scala │ │ ├── JDBCTable.scala │ │ ├── JDBCWriteBuilder.scala │ │ ├── MyJDBCOptions.scala │ │ └── MyJDBCUtils.scala │ │ └── kafka │ │ ├── KafkaCatalog.scala │ │ ├── KafkaDataWriter.scala │ │ ├── KafkaOptions.scala │ │ └── KafkaTable.scala │ ├── hive │ ├── SparkSqlRunner.scala │ └── SqlRunnerMetrics.scala │ ├── optimizer │ ├── CollectValueRule.scala │ ├── DataQualityRule.scala │ ├── ExternalSinkRule.scala │ ├── ExternalTableRule.scala │ ├── InsightExtensions.scala │ ├── PartitionScanLimitRule.scala │ ├── RepartitionRule.scala │ └── SqlRunnerSessionStateBuilder.scala │ ├── plugin │ ├── AsyncProfilePlugin.scala │ ├── ProfilePlugin.scala │ └── YourkitPlugin.scala │ ├── runner │ ├── Alert.scala │ ├── ArgParser.scala │ ├── JobRunner.scala │ ├── callback │ │ ├── ArrayValueCollector.scala │ │ ├── DataCallBack.scala │ │ ├── DataCallBackFactory.scala │ │ ├── DataCheckCallBack.scala │ │ ├── EmailSink.scala │ │ ├── QueryResult.scala │ │ ├── SingleValueCollector.scala │ │ └── Sink.scala │ ├── command │ │ ├── BaseCommand.scala │ │ ├── BlockCommentCommand.scala │ │ ├── CommandFactory.scala │ │ ├── ElseCommand.scala │ │ ├── FiCommand.scala │ │ ├── IfCommand.scala │ │ ├── LineCommentCommand.scala │ │ ├── SetCommand.scala │ │ ├── SourceChars.scala │ │ └── SqlCommand.scala │ ├── config │ │ ├── ApolloClient.scala │ │ └── VariableSubstitution.scala │ ├── container │ │ ├── CollectorContainer.scala │ │ ├── ConfigContainer.scala │ │ └── ContainerTrait.scala │ └── metrics │ │ ├── GraphiteReporter.scala │ │ └── ReporterTrait.scala │ ├── udf │ ├── DateFormatUDF.scala │ └── UDFFactory.scala │ └── util │ ├── ConfigUtil.scala │ ├── DQUtil.scala │ ├── GenericAvroSchema.scala │ ├── JdbcConnector.scala │ ├── JobIdUtil.scala │ ├── Logging.scala │ ├── NextIterator.scala │ ├── OptimizerUtil.scala │ ├── ReflectUtils.scala │ ├── StringUtil.scala │ └── SystemVariables.scala └── test └── scala └── org └── apache └── spark └── sql ├── InsightCatalogEventListenerSuite.scala ├── SQLRunnerSuiteUtils.scala ├── SparkSqlRunnerBase.scala ├── optimizer ├── CollectValueRuleSuite.scala ├── ExternalTableRuleSuite.scala └── PartitionScanLimitRuleSuite.scala ├── runner ├── ArgParserSuite.scala ├── command │ └── CommandSuite.scala └── config │ └── VariableSubstitutionSuite.scala ├── udf └── DateFormatUDFSuite.scala └── util ├── ConfigUtilSuite.scala └── JobIdUtilSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.jar 4 | *.iml 5 | *.war 6 | *.DS_Store 7 | *.project 8 | *.classpath 9 | *.settings 10 | *.factorypath 11 | 
*dependency-reduced-pom.xml 12 | .idea 13 | target 14 | /local-test/ 15 | config-cache/ 16 | *.ipr 17 | *.iws 18 | !bin 19 | lib 20 | conf 21 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - deploy 3 | 4 | .build-deploy: 5 | script: 6 | - 'mvn clean package' 7 | - 'tar zxvf ./target/sql-runner-2.1-bin.tar.gz -C ./target' 8 | - 'rm -rf /data/ws/sql-runner-2.1' 9 | - 'mv ./target/sql-runner-2.1 /data/ws/sql-runner-2.1' 10 | stage: deploy 11 | when: manual 12 | 13 | uat-build-deploy: 14 | extends: .build-deploy 15 | tags: 16 | - uat-nuc1 17 | 18 | stg-build-deploy: 19 | extends: .build-deploy 20 | tags: 21 | - stg-gw1 22 | 23 | prd-build-deploy: 24 | extends: .build-deploy 25 | tags: 26 | - prd-gw1 27 | 28 | pre-build-deploy: 29 | extends: .build-deploy 30 | tags: 31 | - pre-gw1 32 | 33 | nta-build-deploy: 34 | extends: .build-deploy 35 | tags: 36 | - nta-gw1 37 | 38 | prg-build-deploy: 39 | extends: .build-deploy 40 | tags: 41 | - prg-gw1 42 | 43 | sth-build-deploy: 44 | extends: .build-deploy 45 | tags: 46 | - sth-gw1 47 | 48 | jdc-build-deploy: 49 | extends: .build-deploy 50 | tags: 51 | - jdc-gw1 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | sql_runner 是一个以Spark SQL为内核,以SQL为主体,扩展支持了数据质量告警,支持多种外部数据源,支持数据处理流程控制的数据处理引擎。 2 | 3 | ![架构图](docs/images/architecture.png) 4 | 5 | 用户通过sql_runner命令就可以执行一个包含各种扩展SQL的sql任务。 6 | 运行命令:`sql_runner [job_file.sql]` 7 | 8 | # Quick Start 9 | 10 | 编写SQL文件: 11 | 12 | ```sql 13 | /************************************************ 14 | 15 | author: kun.wan 16 | period: day 17 | run_env: PRD 18 | describe: 基础数据处理脚本 19 | 20 | ************************************************/ 21 | 22 | INSERT OVERWRITE TABLE trade.dws_trade partition(dt='${date|yyyyMMdd}') 23 | SELECT * 24 | FROM trade.dwd_trade t 25 | WHERE t.dt = '${date|yyyyMMdd}'; 26 | ``` 27 | 28 | 通过`sql_runner [job_file]` 命令就可以实现将表`trade.dwd_trade`的数据清洗到表`trade.dws_trade`。 29 | 30 | 运行说明: 31 | * 程序的第一部分为任务注释,注释中必须要包含 `author`, `period`, `run_env`, `describe` 这几个字段,主要是基于大型项目中的任务管理考虑,在之后的一些Demo中会将这部分头注释做省略。 32 | * 第二部分是我们需要运行的SQL命令,后面对系统当前支持的命令再详细介绍。 33 | * 在SQL中有看到`${date|yyyyMMdd}` 这样的特殊参数,这个会参考系统的参数管理章节。 34 | 35 | # 系统命令 36 | 37 | 当前系统支持如下命令 38 | * 单行注释命令 39 | * 多行注释命令 40 | * SET参数命令 41 | * IF命令 42 | * SQL命令 43 | 44 | ## 单行注释命令 45 | 46 | 以`--` 作为单行注释开始,系统执行的时候会忽略单行注释 47 | 48 | ## 多行注释命令 49 | 50 | 以`/**` 作为多行注释开始,以 `*/`作为多行注释结束,系统执行的时候会忽略多行注释 51 | 52 | ## SET参数命令 53 | 54 | 以`!set` 作为SET命令开始,以`;` 作为命令结束符, 命令格式: `!set [key]=[value];`, 系统执行的时候解析该参数为系统参数 55 | 56 | ## IF命令 57 | 58 | 以`!if` 作为IF命令开始,以`!fi`作为命令结束符,命令支持`!else`语句分支,命令格式: 59 | ``` 60 | !if ([条件判断语句]) 61 | [命令1] 62 | [命令2] 63 | [命令3] 64 | !else 65 | [命令4] 66 | [命令5] 67 | !fi 68 | ``` 69 | 命令正在执行的时候会对上述条件判断语句进行判断,如果条件为真,执行IF下面的命令,如果条件为假,执行ELSE下面的命令。 70 | 71 | 使用示例1: 对运行环境参数进行判断,来选择IF分支命令的选择执行 72 | 73 | ```sql 74 | !set user = "kun.wan"; 75 | !if (user = 'kun.wan') 76 | select 'if command'; 77 | !else 78 | select 'else command'; 79 | !fi 80 | ``` 81 | 82 | 使用示例2: 根据之前的SQL执行结果进行判断,来选择IF分支命令的选择执行 83 | 84 | ```sql 85 | SELECT /*+ COLLECT_VALUE('row_count', 'c') */ count(1) as c; 86 | SELECT /*+ COLLECT_VALUE('row_count2', 'd') */ count(1) as d; 87 | 88 | !if (row_count = row_count2 and row_count = 1) 89 | select 'row count is 1'; 90 | !else 91 | select 'row count is not 1'; 92 | 
!fi 93 | ``` 94 | 95 | ## SQL命令 96 | 97 | 除去以上命令,其他的代码会被解析为SQL命令,以`;` 作为命令结束符;每个SQL会由SQL引擎解析执行 98 | 99 | 100 | # 参数管理 101 | 102 | 系统执行过程中会有很多运行以来参数,包括时间参数, 系统参数和Set命令参数。 103 | 系统通过set命令,apollo配置等方式进行参数定义,在程序中使用`${variable}`的格式引用参数。 104 | 通过 `${variable, 'DEFAULT_VALUE'}`格式引用参数时,如果没有找到`variable`参数,则返回`DEFAULT_VALUE` 105 | 106 | ## 时间参数 107 | 108 | 时间参数是一个特殊类型的参数,表示任务运行的批次时间,如没有其他参数影响,系统时间由如下决定: 109 | 110 | * 如果是`period=month`, 系统时间表示当前时间的上一个月的1日0点0分0秒 111 | * 如果是`period=day`, 系统时间表示当前时间的前一天的0点0分0秒 112 | * 如果是`period=hour`, 系统时间表示当前时间的上一个小时的的0分0秒 113 | 114 | 说明: 115 | * 时间参数以date 开头,date表示当前job的运行批次时间。 116 | * 时间可以通过`+`和`-`来进行时间的加减运行 117 | * 在做时间运算的时候以数字和时间单位表示加减的时间窗口,时间单位中,Y表示年,M表示月, D表示天,H表示小时,m表示分钟,S表示秒。 118 | * 输出的时间格式默认为 `yyyyMMdd`,可以通过 `|` 后连接自定义的时间格式来自定义输出时间格式。时间格式为Java 默认的时间解析格式。 119 | 120 | 示例: `${date-3d|yyyyMMdd}` 121 | 122 | ## 系统参数 123 | 124 | 为了方便程序运行,程序启动的时候已经设置了一些系统参数,用于辅助程序运行。 125 | 126 | ## 系统环境参数 127 | 128 | 系统启动的时候会读取`env.xml` 中的配置作为系统参数,另外一些数据库中的配置系统会从Apollo中进行获取。 129 | 130 | 另外系统还支持在命令行中修改一些系统参数: 131 | * --dates : 手工指定系统的运行批次时间,后面可以添加多个日期参数,通过逗号分隔。每个日期参数格式:`--dates 2021-01-01T00:00:00,2021-01-03T00:00:00` 132 | * --dateRange : 手工指定系统的运行批次时间,后续跟批量运行的开始日期和结束日志,参数格式:`--dateRange 2021-01-01T00:00:00 2021-01-03T00:00:00`, 默认会每一个时间单位(天级任务就是一天,小时任务就是一小时)运行一次,可以通过 `--dateRangeStep` 参数修改多少个时间单位运行一次。 133 | * --test : 单次执行该任务,此时任务会把执行日志屏幕输出;如果程序运行错误,不会进行告警。在开发模式和补跑数据时可以添加该参数运行。 134 | * --dryrun : 空跑模式, 此模式会空跑指定任务中的SQL,并在屏幕上输出日志,可用于检查编写的程序正确性。 135 | 136 | ## Set命令参数 137 | 138 | 这个是在任务运行过程中修改系统参数 139 | 140 | # 参数的使用 141 | 142 | 对于上说的各种参数,可以通过`${variable}`格式,在SQL中引用,系统在执行的时候会自动进行参数替换。 143 | 此外,参数还支持嵌套参数引用,即 `${variable1 ${variable2 ${variable3}} }` 144 | 145 | # 高级功能说明 146 | 147 | * [外部数据源的读写](docs/External_Sources.md) 148 | * [数据质量告警](./docs/Data_Check.md) 149 | * [Hive表数据写入时排序索引](./docs/Index_Column.md) 150 | * [Trouble Shooting](./docs/Trouble_Shooting.md) 151 | * [UDF函数](./docs/UDF.md) 152 | 153 | # 日志管理 154 | 155 | * 如果程序在开发环境运行,需要传入参数 `--test`,程序日志直接在命令行输出;如果程序运行出错,不进行告警; 156 | * 如果程序在生产环境运行,程序运行日志输出在目录下 `/tmp/{USER}/${yyyyMMdd}`,程序运行完毕后会将日志归档到HDFS目录 `/metadata/logs/insight/${yyyyMMdd}`;如果程序运行出错,自动进行钉钉告警; 157 | 158 | # Contributing 159 | 160 | 开启本地调试模式 161 | 162 | * 注释掉pom中的 `hive-cli` 和 `hive-exec`两个依赖的provide依赖 163 | * 在resource目录下补充hdfs,yarn,hive的访问配置文件 164 | * 启动 `org.apache.sql.runner.JobRunner` 程序 165 | -------------------------------------------------------------------------------- /assembly.xml: -------------------------------------------------------------------------------- 1 | 4 | bin 5 | 6 | tar.gz 7 | 8 | 9 | 10 | ${project.basedir} 11 | / 12 | 13 | README* 14 | LICENSE* 15 | 16 | 17 | 18 | ${project.basedir}/bin 19 | /bin 20 | 21 | /** 22 | 23 | unix 24 | 0777 25 | 0755 26 | 27 | 28 | ${project.basedir}/src/main/resources 29 | /conf 30 | 31 | /** 32 | 33 | 34 | 35 | 36 | 37 | 38 | true 39 | /lib 40 | false 41 | runtime 42 | 43 | mysql:mysql-connector-java 44 | io.confluent:kafka-avro-serializer 45 | org.apache.kafka:kafka-clients 46 | io.confluent:kafka-schema-registry-client 47 | io.confluent:common-config 48 | io.confluent:common-utils 49 | io.dropwizard.metrics:* 50 | org.glassfish.jersey.core:* 51 | org.glassfish.jersey.containers:* 52 | org.glassfish.jersey.inject:* 53 | com.fasterxml.jackson.core:* 54 | org.apache.thrift:* 55 | org.apache.parquet:* 56 | org.apache.orc:* 57 | org.apache.avro:* 58 | org.apache.hadoop:* 59 | org.apache.hive:* 60 | org.apache.spark:* 61 | 62 | 63 | 64 | 65 | 
-------------------------------------------------------------------------------- /bin/profile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )" 6 | export JAVA_HOME=/usr/java/latest 7 | 8 | if [ $# == 0 ]; then 9 | echo "miss parameter for profile shell!" 10 | exit 1 11 | fi 12 | 13 | cmd=$1 14 | shift 15 | if [ "${cmd}" == "executor" ];then 16 | exec "${JAVA_HOME}"/bin/java -cp ./sql-runner-2.1.jar one.profiler.ProfileAgent 17 | exit 0 18 | elif [ "${cmd}" == "upload" ]; then 19 | sleep 1s 20 | if [ $# == 2 -a -f $1 ]; then 21 | profile_file=$1 22 | hdfs_file=$2 23 | 24 | lastTime=$(stat -c %Y "$profile_file") 25 | now=$(date +%s) 26 | stop=$(( now + 120 )) 27 | while [ ${now} -lt ${stop} ]; do 28 | sleep 1s 29 | if [ ${lastTime} -eq $(stat -c %Y "$profile_file") ];then 30 | HADOOP_USER_NAME=schedule hdfs dfs -put "${profile_file}" "${hdfs_file}" 31 | rm "${profile_file}" 32 | exit 0 33 | fi 34 | done 35 | rm "${profile_file}" 36 | fi 37 | fi 38 | -------------------------------------------------------------------------------- /bin/sql_runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )" 4 | export JAVA_HOME=/usr/java/latest 5 | 6 | if [ ! $# -ge 1 ]; then 7 | echo "job config file must be provided!" 8 | exit 1 9 | fi 10 | 11 | jobFile=$1 12 | if [ ! -f ${jobFile} ];then 13 | jobFile="${BASEDIR}/${jobFile}" 14 | if [ ! -f ${jobFile} ];then 15 | echo "没有找到job文件: "${jobFile} 16 | exit 17 | fi 18 | fi 19 | jobFile=$(readlink -f ${jobFile}) 20 | shift 21 | 22 | cd ${BASEDIR} 23 | 24 | export SPARK_HOME=/opt/spark 25 | export CLASSPATH=$(echo ${SPARK_HOME}/jars/*.jar | tr ' ' ':'):${CLASSPATH} 26 | export CLASSPATH=$(echo ${BASEDIR}/lib/*.jar | tr ' ' ':'):${CLASSPATH} 27 | 28 | export HADOOP_CONF_DIR=/etc/hadoop/conf 29 | export SPARK_CONF_DIR=${SPARK_HOME}/conf 30 | export CLASSPATH=${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${CLASSPATH} 31 | 32 | export JAVA_OPTS="-Xmx2048m -Xms256m -server -XX:+UseG1GC" 33 | jobFileName=$(basename ${jobFile}) 34 | 35 | export CLASSPATH=${BASEDIR}/conf:${CLASSPATH} 36 | stdoutFile="/tmp/$USER/$(date +%Y%m%d)/${jobFileName%%.*}_$(date +%Y%m%d_%H%M%S).stdout" 37 | mkdir -p "/tmp/$USER/$(date +%Y%m%d)" 38 | if [[ "$*" =~ "--test" ]];then 39 | export JAVA_OPTS="${JAVA_OPTS} -Dinsight.root.logger=INFO,CA,FA -Dinsight.file.stdout=${stdoutFile}" 40 | elif [[ "$*" =~ "--dryrun" ]]; then 41 | export JAVA_OPTS="${JAVA_OPTS} -Dinsight.root.logger=INFO,CA,FA -Dinsight.file.stdout=${stdoutFile}" 42 | else 43 | export JAVA_OPTS="${JAVA_OPTS} -Dinsight.root.logger=INFO,FA -Dinsight.file.stdout=${stdoutFile}" 44 | fi 45 | 46 | # echo "Using CLASSPATH:"$CLASSPATH 47 | export HADOOP_USER_NAME=schedule 48 | if [[ "$*" =~ "--test" ]];then 49 | "${JAVA_HOME}"/bin/java ${JAVA_OPTS} org.apache.sql.runner.JobRunner ${jobFile} $@ 50 | elif [[ "$*" =~ "--dryrun" ]]; then 51 | "${JAVA_HOME}"/bin/java ${JAVA_OPTS} org.apache.sql.runner.JobRunner ${jobFile} $@ 52 | else 53 | "${JAVA_HOME}"/bin/java ${JAVA_OPTS} org.apache.sql.runner.JobRunner ${jobFile} $@ 2>>"${stdoutFile}" 54 | fi 55 | if [ $? 
-ne 0 ];then 56 | "${JAVA_HOME}"/bin/java org.apache.sql.runner.Alert ${jobFile} $@ 57 | fi 58 | -------------------------------------------------------------------------------- /docs/DataQuality_PartitionRule.md: -------------------------------------------------------------------------------- 1 | # Spark分区数据质量检查 2 | 3 | 本系统扩展了Spark的ExternalCatalogEventListener类,增加了对分区表在进行分区变化时的监听。 4 | 对监听到发生变化的分区进行进行表分析。 5 | 获取当前表中的其他历史分区的记录条数,建立记录数预估模型,如果当前分区的记录数不在预估模型内,则进行告警。 6 | 7 | 目前数据预估模型有两类:布林模型(Bollinger)和指数加权移动平均模型(EWMA) 8 | 9 | ## 布林模型(Bollinger) 10 | 11 | MA=最近10天记录数的绝对平均值 12 | MD=最近10天记录数的标准差 13 | (MA, UP, DN) = (MA, MA + 2 * MD, MA - 2 * MD) 14 | 15 | ## 指数加权移动平均模型(EWMA) 16 | 17 | MA=最近10天记录数的指数加权移动平均值 18 | MD=最近10天记录数和当日MA的标准差 19 | (MA, UP, DN) = (MA, MA + 2 * MD, MA - 2 * MD) 20 | 21 | ## 模型对比 22 | 23 | ### 数据对比模型1 24 | 25 | 表的记录数 26 | ![表的记录数](images/dq_row_number.png) 27 | 28 | Bollinger模型效果 29 | ![Bollinger模型效果](images/dq_bollinger_model.png) 30 | 31 | EWMA模型效果 32 | ![EWMA模型效果](images/dq_ewma_model.png) 33 | 34 | ### 数据对比模型2 35 | 36 | 表的记录数 37 | ![表的记录数](images/dq2_row_number.png) 38 | 39 | Bollinger模型效果 40 | ![Bollinger模型效果](images/dq2_bollinger_model.png) 41 | 42 | EWMA模型效果 43 | ![EWMA模型效果](images/dq2_ewma_model.png) -------------------------------------------------------------------------------- /docs/Data_Check.md: -------------------------------------------------------------------------------- 1 | # 数据质量检查 2 | 3 | ## DATA_CHECK([ALERT_MESSAGE], [CHECK_EXPRESSION]) 4 | 5 | 功能说明: 6 | 系统扩展了一个名为 DATA_CHECK的 SQL Hint,用户可以自定义数据检查表达式,如果该Boolean表达式计算结果返回false,会进行钉钉告警。 7 | 8 | 参数说明: 9 | 10 | * ALERT_MESSAGE: 数据检查失败时,告警信息 11 | * CHECK_EXPRESSION: 数据检查Boolean表达式。表达式中可以使用当前SQL中可以访问的任意列数据。 12 | 13 | 使用示例: 14 | 15 | 以下SQL会对比trade表中最近两天的店铺数量,如果差值大于100,则进行钉钉告警。 16 | 17 | ```sql 18 | !set diff_num = 100; 19 | 20 | WITH raw AS ( 21 | SELECT 22 | count(DISTINCT store_id) AS stores 23 | FROM trade 24 | WHERE dt in ('${date-1d|yyyyMMdd}', '${date|yyyyMMdd}') 25 | GROUP BY dt) 26 | SELECT /*+DATA_CHECK('交易表今日与昨日店铺数量差值大于${diff_num}', 'diff < ${diff_num}') */ 27 | max(stores) - min(stores) AS diff 28 | FROM raw; 29 | ``` 30 | -------------------------------------------------------------------------------- /docs/External_Sources.md: -------------------------------------------------------------------------------- 1 | # 读写外部表 2 | 3 | 目前系统支持对JDBC数据源的读写,对Kafka的自动数据写入。在写SQL之前需要配置好相关系统参数和外部表参数。系统参数一般只JDBC连接信息,Kafka Broker地址等共用信息,不需要每个任务都进行配置。外部表参数指Mysql中的表名,表的主键等需要具体配置的参数。 4 | 5 | ## 读写JDBC表 6 | 7 | 功能说明: 8 | 系统支持将JDBC数据源中的一个表或者一个JDBC查询作为一个Spark中表进行读写。设置好JDBC表的相关参数后,通过 `jdbc.[NAMESPACE].[TABLE_NAME]` 名称就可以进行读写了。 9 | 10 | 使用参数说明: 11 | NAMESPACE: JDBC 连接数据库的标识 12 | TABLE_NAME: Spark SQL中注册的表名 13 | 14 | 系统参数说明(一般由系统统一配置,多个任务共享该参数): 15 | * [NAMESPACE].url : JDBC连接的地址 16 | * [NAMESPACE].username : JDBC连接的用户名 17 | * [NAMESPACE].password : JDBC连接的密码 18 | 19 | 读取外部表数据需要配置的参数: 20 | * [NAMESPACE].[TABLE_NAME].numPartitions : 外部表查询的并发数 21 | * [NAMESPACE].[TABLE_NAME].partitionColumn : 外部表并发查询的数据分区字段 22 | * [NAMESPACE].[TABLE_NAME].query : 可选参数,允许将一个JDBC 查询的视图,作为Spark的外部表进行查询 23 | 24 | 计算结果写入外部表需要配置的参数: 25 | * [NAMESPACE].[TABLE_NAME].uniqueKeys : 外部表的数据更新主键,因为数据写入JDBC表使用的是upsert方式,所以必须提供upsert操作的数据主键 26 | 27 | 使用示例: 28 | 29 | Spark读取 bi 数据库中 stores表数据示例: 30 | 31 | ```sql 32 | !set bi.stores.numPartitions = 2; 33 | !set bi.stores.partitionColumn = id; 34 | // query 为可选参数, 如果没有该参数,将直接查询 bi数据库中的stores表,如果有该参数,会查询query的查询结果视图 35 | !set bi.stores.query = """(select * from stores where store_id >10) as subq"""; 36 | 37 | SELECT 
store_id, store_name 38 | FROM jdbc.bi.stores 39 | WHERE store_id < 50; 40 | ``` 41 | 42 | Spark 写入 bi 数据库中 stores表数据示例: 43 | ```sql 44 | !set bi.stores.uniqueKeys = id; 45 | 46 | INSERT INTO jdbc.bi.stores 47 | SELECT 100 as store_id, "store_100" as store_name; 48 | ``` 49 | 50 | ## 数据写入Kafka 51 | 52 | 功能说明: 53 | 将SQL的计算结果插入到Kafka中。目前支持将结果自动转换为 avro 和 json 两种数据格式。 54 | 设置好Kafka的相关参数后,通过 `kafka.[TABLE_NAME]` 名称就可以向Kafka写数据了。 55 | 56 | 使用参数说明: 57 | TABLE_NAME: Spark SQL中注册的表名 58 | 59 | 系统参数说明(一般由系统统一配置,多个任务共享该参数): 60 | * kafka.bootstrap.servers : Kafka集群的Broker地址 61 | * kafka.schema.registry.url : 可选参数,如果Kafka 集群是Confluent版本的Kafka,可以管理Avro格式kafka数据,avro的schema由schema registry进行集中管理。可以配置上对应的schema registry地址。 62 | 63 | 写入json格式数据需要配置的参数: 64 | * kafka.[TABLE_NAME].kafkaTopic: kafka的topic名称 65 | * kafka.[TABLE_NAME].recordType: 填写 json 66 | * kafka.[TABLE_NAME].maxRatePerPartition : 可选参数,每个spark executor写入kafka的每秒消息数。数据结果数据集比较大,一定要加上速度限制,否则会把kafka写爆掉。 67 | 68 | 写入avro格式数据需要配置的参数: 69 | * kafka.[TABLE_NAME].kafkaTopic: kafka的topic名称 70 | * kafka.[TABLE_NAME].recordType: 填写 avro 71 | * kafka.[TABLE_NAME].avro.forceCreate : 默认为false, 如果为true,会强制使用计算结果dataframe schema作为kafka avro schema,如果schema registry上已经存在schema则会报错。如果为false,会先从Schema Registry上获取topic的Schema(此时其他avro参数无需配置),如果获取失败,再使用计算结果dataframe schema作为kafka avro schema。 72 | * kafka.[TABLE_NAME].avro.name : 可选参数,如果 `forceCreate` = true, 则必须提供创建avro数据schema需要的 name。 73 | * kafka.[TABLE_NAME].avro.namespace : 可选参数,如果 `forceCreate` = true, 则必须提供创建avro数据schema需要的 namespace。 74 | * kafka.[TABLE_NAME].maxRatePerPartition : 可选参数,每个spark executor写入kafka的每秒消息数。数据结果数据集比较大,一定要加上速度限制,否则会把kafka写爆掉。 75 | 76 | 使用示例: 77 | 78 | 向kafka写入json数据示例: 79 | 80 | ```sql 81 | !set kafka.test_topic.recordType = json; 82 | !set kafka.test_topic.kafkaTopic = test_topic; 83 | 84 | INSERT INTO kafka.test_topic 85 | SELECT 100 as id, "user_100" as name; 86 | ``` 87 | 88 | 向kafka写入avro数据示例: 89 | 90 | ```sql 91 | !set kafka.test_topic2.recordType = avro; 92 | !set kafka.test_topic2.kafkaTopic = test_topic2; 93 | -- 根据计算结果DDL自动生成Avro Schema 94 | !set kafka.test_topic2.avro.forceCreate = true; 95 | !set kafka.test_topic2.avro.name = student; 96 | !set kafka.test_topic2.avro.namespace = com.wankun; 97 | 98 | INSERT INTO kafka.test_topic2 99 | SELECT 1 as id1, 'wankun' as name1, '男' as sex1, 'PRD' env1; 100 | ``` 101 | 102 | 注意: 103 | Avro 1.8.* 版本对于Enum类型支持有问题。比较trick的解决办法是直接将Spark安装环境下的avro包删除掉. 
104 | 因为hive-exec-1.1.0-cdh5.13.3.jar包 assemb 了 avro 的 1.7.6-cdh5.13.3的包,所以运行完全没问题。 105 | 106 | ## DINGDING_SINK(DING_BOT) 107 | 108 | 功能说明: 109 | 将SQL的程序结果通过钉钉机器人发送到钉钉群。 110 | 111 | 参数说明: 112 | 113 | * DING_BOT: 钉钉机器人名称 114 | 115 | 辅助参数说明: 116 | !data_alert.title=chatlog白名单店铺没有拉取到chatlog数据; 117 | !data_alert.pattern={store_id}: {store_name}; 118 | 119 | * ${DING_BOT} : 钉钉机器人Token 120 | * ${DING_BOT}.title : 钉钉群信息Title 121 | * ${DING_BOT}.pattern : 钉钉信息格式 122 | 123 | 使用示例: 124 | 125 | ```sql 126 | !data_alert.title=trade信息告警; 127 | !data_alert.pattern={store_id}: {store_name}; 128 | 129 | select /*+ DINGDING_SINK(data_alert) */ 130 | distinct a.store_id, a.store_name 131 | FROM trade a 132 | WHERE a.dt='${date|yyyyMMdd}'; 133 | ``` 134 | 135 | ## EMAIL_SINK(EMAIL_BOT) 136 | 137 | 功能说明: 138 | 将SQL的程序结果发送邮件。 139 | 140 | 参数说明: 141 | 142 | * EMAIL_BOT: Email发送机器人名称 143 | 144 | 辅助参数说明: 145 | 146 | * ${EMAIL_BOT} : Email 机器人标识 147 | * ${EMAIL_BOT}.columns : 需要取结果数据中的哪些字段 148 | * ${EMAIL_BOT}.columnNames : 结果数据中字段对应的中文名 149 | * ${EMAIL_BOT}.subject : 邮件标题 150 | * ${EMAIL_BOT}.email-to : 邮件接收人地址,多个地址使用逗号分割 151 | * ${EMAIL_BOT}.email-cc : 邮件抄送人地址,多个地址使用逗号分割 152 | 153 | 使用示例: 154 | 155 | ```sql 156 | !set email.columns={store_id}, {store_name}; 157 | !set email.columnNames=ID,名称; 158 | !set email.subject = 测试邮件; 159 | !set email.email-to = test-to@abc.com; 160 | !set email.email-cc = test-cc@abc.com; 161 | 162 | select /*+ EMAIL_SINK(email) */ 163 | distinct a.store_id, a.store_name 164 | FROM trade a 165 | WHERE a.dt='${date|yyyyMMdd}'; 166 | ``` -------------------------------------------------------------------------------- /docs/Index_Column.md: -------------------------------------------------------------------------------- 1 | # Hive表数据写入时排序索引 2 | 3 | 如果程序是将计算结果插入到hive表,运行前会先进行判断hive表是否有 `index_column` 属性,如果有,结果数据会根据这个列进行数据排序。 4 | 后续对该表的数据查询,如果带有`index_column`列的查询条件,数据会进行非常多的文件级过滤。 5 | 6 | hive表设置索引列方法: 7 | 8 | `alter table t set tblproperties('index_column'='col');` -------------------------------------------------------------------------------- /docs/Trouble_Shooting.md: -------------------------------------------------------------------------------- 1 | # Trouble Shooting 2 | 3 | ## 常见问题整理 4 | 5 | * Executor OOM 6 | 7 | ``` 8 | 19:50:23.535 Executor task launch worker for task 3 ERROR org.apache.spark.executor.Executor: Exception in task 0.3 in stage 0.0 (TID 3) 9 | java.lang.OutOfMemoryError: Java heap space 10 | ``` 11 | 默认SparkExecutor 12 | 目前给Spark的Driver和Executor都配置了2G内存。如果出现上述OOM错误,可以尝试增加Executor内存 13 | ```xml 14 | 15 | 16 | spark.executor.memory 17 | 4g 18 | 19 | 20 | ``` 21 | 22 | * 读取parquet文件出现很多读取空文件的Task 23 | 24 | 这个是Spark FileSourceStrategy在将LogicalPlan 转换为 FileSourceScanExec(DataSourceScanExec的子类)时的BUG。 25 | 虽然spark在计算每个split时会按照比较理想的参数去计算split,但是物理执行时对应的FileFormat(ParquetFileFormat)读取文件时可能会读到空数据。 26 | ``` 27 | def maxSplitBytes( 28 | sparkSession: SparkSession, 29 | selectedPartitions: Seq[PartitionDirectory]): Long = { 30 | val defaultMaxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes 31 | val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes 32 | val defaultParallelism = sparkSession.sparkContext.defaultParallelism 33 | val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum 34 | val bytesPerCore = totalBytes / defaultParallelism 35 | 36 | Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore)) 37 | } 38 | ``` 39 | 40 | 可以通过调大openCostInBytes参数,也可以关闭向FileSourceScanExec算子的转化来处理。 41 | 
```xml 42 | 43 | spark.sql.hive.convertMetastoreParquet 44 | false 45 | 46 | ``` 47 | 48 | * 复杂Streaming任务Driver端OOM 49 | 50 | Spark UI默认会保留最近1000个executions的执行内容,供用户查看。但是如果每个execution的解析计划比较大,就比较容易造成driver端OOM。 51 | 例如,我遇到的一个Streaming任务,每个任务的Spark Plan内存占用约4M,很危险。通过减小`spark.sql.ui.retainedExecutions` 参数后,系统恢复稳定。 52 | 53 | ## Spark 任务profile 54 | 55 | 系统内置了async-profile工具用于对Spark程序运行过程进行Profile分析。 56 | 因为async-profile工具是基于 perf_events 进行程序采样分析的,所以要求集群机器上开启对应的系统参数。 57 | ``` 58 | sysctl -w kernel.perf_event_paranoid=1 59 | sysctl -w kernel.kptr_restrict=0 60 | ``` 61 | 62 | 在开启executor profile过程中,程序会占用额外的内存资源,有可能会被NodeManager以内存用超而Kill掉。 63 | 为了能够将profile结果上传到HDFS,需要修改NodeManager参数`yarn.nodemanager.sleep-delay-before-sigkill.ms=60000` 64 | 65 | Spark任务执行过程profile目前提供如下三种方式: 66 | 67 | ### 直接查看堆栈法 68 | 69 | 对于简单的任务,可以直接进行spark job管理页面,查看运行慢的executor对应Thread Dump,分析具体那个Thread运行慢导致 70 | 71 | ### 自动Profile Executor法 72 | 73 | 对于运行时间较短,但是运行比较慢的任务,可以通过 `--profile` 参数开启对executor进程的Profile。 74 | 75 | profile有如下三种profile结果,默认生成JFR文件: 76 | * 通过`--config spark.profile.type=jfr`来指定生成JFR文件 77 | * 通过`--config spark.profile.type=svg`来指定生成SVG火焰图 78 | * 通过`--config spark.profile.type=yourkit`来指定生成Yourkit snapshot文件 79 | 80 | executor运行完毕后会将生成的JFR文件上传到HDFS的`/metadata/logs/profile/${applicationId}/${attemptId}/` 路径。 81 | 82 | ### 手动Profile Executor法 83 | 84 | 对于运行时间较长,不需要权量进行Profile的Executor可以可以通过 `--profile --config spark.profile.manualprofile=true` 参数手动开启profile。 85 | 此时可以进入executor执行的节点的对应进程启动目录,执行 `profile.sh executor`,再依次输入`start` 和`stop` 命令,profile结束后会在当前机器生成火焰图文件 `/tmp/executor_${PID}.svg` 。 86 | 87 | PS: 程序内部使用MXBean 进行Profile管理,并提供了Shell工具进行外部管理。如果有JS好的同学可以直接修改Spark Executor页面,增加开始和结束并直接查看火焰图是最方便的了。 -------------------------------------------------------------------------------- /docs/UDF.md: -------------------------------------------------------------------------------- 1 | # UDF函数 2 | 3 | ## UDF函数分类: 4 | 5 | * 不包含业务数据处理逻辑的UDF: 建议直接写入本项目内,这样方便UDF的代码复用 6 | * 包含业务处理逻辑的UDF: 这类代码一般不可复用,但是代码仍然要做到统一管理,所以建议和公司内一样,使用独立的repo来管理代码。 7 | 接下来说的扩展UDF函数开发流程,指的就是这类UDF函数。 8 | 9 | ## 扩展UDF函数开发流程 10 | 11 | ### 编写UDF开发函数 12 | 13 | UDF函数开发规范 14 | 15 | * 代码类以`com.wankun.[业务模块].[主业务逻辑]UDF`命名,类名上需要添加`UDFDescription`注解。 16 | * 注解中的`name`属性表示注册的函数名 17 | * 注解中的`returnType`属性表示函数的函数数据类型 18 | * 注解中的`description`属性表示函数的说明 19 | * 原则上所有业务逻辑必须要有明确`description`说明,主函数需要有UT测试类 20 | 21 | ### 部署UDF函数 22 | 23 | 编译项目,并上传结果jar到hdfs目录:`/deploy/config/biz-udfs-1.0.jar` (PS: 一般CI可以做到自动化) 24 | 25 | ``` 26 | mvn clean package 27 | hdfs dfs -put -f ./target/biz-udfs-1.0.jar /deploy/config/biz-udfs-1.0.jar 28 | ``` 29 | 30 | # 开发例子说明 31 | 32 | 下文以开发推荐归因数据的order转换的UDF函数为例,说明开发和使用步骤。 33 | 34 | ## 开发UDF 函数,并实现UDF4的call方法,实现传入4个参数,输出一个参数的UDF函数 35 | 36 | ```java 37 | package com.wankun.udfs.recommend; 38 | 39 | @UDFDescription( 40 | name = "attribution_orders", 41 | returnType = "array>", 43 | description = "在对推荐归因计算时,转换原始trade中的orders数据") 44 | public class AttributionOrdersUDF 45 | implements UDF4, String, WrappedArray, WrappedArray> { 46 | 47 | @Override 48 | public WrappedArray call(String abTarget, 49 | WrappedArray priorSpuIds, 50 | String dispatchSpuId, 51 | WrappedArray originOrders) throws Exception { 52 | 53 | } 54 | } 55 | ``` 56 | 57 | ## 使用函数 58 | 59 | ```sql 60 | 61 | !set spark.sql.externalUdfClasses = com.wankun.udfs.recommend.AttributionOrdersUDF; 62 | 63 | SELECT attribution_orders(ab_target, prior_spu_id, spu_id, orders) as orders 64 | FROM trade; 65 | ``` 
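
补充一个不含业务逻辑、仅用于演示上述开发规范的最小示例(包名、类名与函数名均为假设的演示用名称,并非本项目已有代码;`UDFDescription` 注解的 `name`、`returnType`、`description` 属性即上文约定的三个属性):

```java
// 演示用示例;UDFDescription 为本项目自带注解,其 import 路径与具体项目有关,此处省略
package com.wankun.udfs.demo;

import org.apache.spark.sql.api.java.UDF1;

@UDFDescription(
    name = "normalize_store_name",                 // 注册到 Spark SQL 的函数名
    returnType = "string",                         // 函数返回值对应的 Spark SQL 数据类型
    description = "去除店铺名称首尾空格并转为小写")     // 函数功能说明
public class NormalizeStoreNameUDF implements UDF1<String, String> {

  @Override
  public String call(String storeName) throws Exception {
    // 对 null 输入直接返回 null,避免函数在缺失数据上抛出异常
    return storeName == null ? null : storeName.trim().toLowerCase();
  }
}
```

该类打包部署后,同样通过 `spark.sql.externalUdfClasses` 参数注册,即可在 SQL 中以 `normalize_store_name(store_name)` 方式调用。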
-------------------------------------------------------------------------------- /docs/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/architecture.png -------------------------------------------------------------------------------- /docs/images/dq2_bollinger_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq2_bollinger_model.png -------------------------------------------------------------------------------- /docs/images/dq2_ewma_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq2_ewma_model.png -------------------------------------------------------------------------------- /docs/images/dq2_row_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq2_row_number.png -------------------------------------------------------------------------------- /docs/images/dq_bollinger_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq_bollinger_model.png -------------------------------------------------------------------------------- /docs/images/dq_ewma_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq_ewma_model.png -------------------------------------------------------------------------------- /docs/images/dq_row_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq_row_number.png -------------------------------------------------------------------------------- /src/main/java/one/profiler/AsyncProfilerMXBean.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Andrei Pangin 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package one.profiler; 18 | 19 | /** 20 | * AsyncProfiler interface for JMX server. 21 | * How to register AsyncProfiler MBean: 22 | * 23 | *
<pre>{@code
24 |  *     ManagementFactory.getPlatformMBeanServer().registerMBean(
25 |  *             AsyncProfiler.getInstance(),
26 |  *             new ObjectName("one.profiler:type=AsyncProfiler")
27 |  *     );
28 |  * }</pre>
29 | */ 30 | public interface AsyncProfilerMXBean { 31 | void start(String event, long interval) throws IllegalStateException; 32 | void resume(String event, long interval) throws IllegalStateException; 33 | void stop() throws IllegalStateException; 34 | 35 | long getSamples(); 36 | String getVersion(); 37 | 38 | String execute(String command) throws IllegalArgumentException, java.io.IOException; 39 | 40 | String dumpCollapsed(Counter counter); 41 | String dumpTraces(int maxTraces); 42 | String dumpFlat(int maxMethods); 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/one/profiler/Counter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Andrei Pangin 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package one.profiler; 18 | 19 | /** 20 | * Which metrics to use when generating profile in collapsed stack traces format. 21 | */ 22 | public enum Counter { 23 | SAMPLES, 24 | TOTAL 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/one/profiler/Events.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Andrei Pangin 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package one.profiler; 18 | 19 | /** 20 | * Predefined event names to use in {@link AsyncProfiler#start(String, long)} 21 | */ 22 | public class Events { 23 | public static final String CPU = "cpu"; 24 | public static final String ALLOC = "alloc"; 25 | public static final String LOCK = "lock"; 26 | public static final String WALL = "wall"; 27 | public static final String ITIMER = "itimer"; 28 | } 29 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file core/target/unit-tests.log 19 | insight.root.logger=INFO,CA 20 | insight.file.stdout=/tmp/stdout 21 | log4j.rootLogger=${insight.root.logger} 22 | 23 | #Console Appender 24 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 25 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.CA.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %p %c: %m%n 27 | log4j.appender.CA.Threshold = TRACE 28 | log4j.appender.CA.follow = true 29 | 30 | #File Appender 31 | log4j.appender.FA=org.apache.log4j.FileAppender 32 | log4j.appender.FA.append=false 33 | log4j.appender.FA.file=${insight.file.stdout} 34 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout 35 | log4j.appender.FA.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %p %c: %m%n 36 | 37 | # Set the logger level of File Appender to WARN 38 | log4j.appender.FA.Threshold = TRACE 39 | 40 | # Some packages are noisy for no good reason. 41 | log4j.additivity.parquet.hadoop.ParquetRecordReader=false 42 | log4j.logger.parquet.hadoop.ParquetRecordReader=OFF 43 | 44 | log4j.additivity.parquet.hadoop.ParquetOutputCommitter=false 45 | log4j.logger.parquet.hadoop.ParquetOutputCommitter=OFF 46 | 47 | log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false 48 | log4j.logger.org.apache.hadoop.hive.serde2.lazy.LazyStruct=OFF 49 | 50 | log4j.additivity.org.apache.hadoop.hive.metastore.RetryingHMSHandler=false 51 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=OFF 52 | 53 | log4j.additivity.hive.ql.metadata.Hive=false 54 | log4j.logger.hive.ql.metadata.Hive=OFF 55 | 56 | # Parquet related logging 57 | log4j.logger.parquet.hadoop=WARN 58 | log4j.logger.org.apache.spark.sql.parquet=WARN 59 | 60 | log4j.logger.org.spark_project.jetty=ERROR 61 | log4j.logger.org.apache.spark=WARN 62 | log4j.logger.org.apache.spark.deploy.yarn=INFO 63 | log4j.logger.org.apache.hadoop.hive.ql=INFO 64 | log4j.logger.org.apache.hadoop.hive.metastore=WARN 65 | log4j.logger.org.apache.hadoop.hive.ql.log.PerfLogger=WARN 66 | log4j.logger.org.apache.hadoop.mapreduce.lib=INFO 67 | log4j.logger.org.apache.spark.sql=INFO 68 | 69 | log4j.logger.BlockManagerMasterEndpoint=ERROR 70 | 71 | log4j.logger.org.apache.spark.sql.execution.datasources.FileSourceStrategy=WARN 72 | 73 | # to enable RuleExecutor log in Spark2 74 | #log4j.logger.org.apache.spark.sql.hive=TRACE 75 | #log4j.logger.org.apache.spark.sql.hive.client=INFO 76 | #log4j.logger.org.apache.spark.sql.hive.HiveMetastoreCatalog=DEBUG 77 | #log4j.logger.org.apache.spark.sql.execution.FileSourceScanExec=DEBUG 78 | 79 | # to enable RuleExecutor log in Spark3, set this configuration in spark_default.xml 80 | #spark.sql.optimizer.planChangeLog.level=INFO 81 | -------------------------------------------------------------------------------- /src/main/resources/metrics.properties_template: -------------------------------------------------------------------------------- 1 | # USING : --files metrics.properties 2 | *.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink 3 | 
*.sink.graphite.host=graphite_host 4 | *.sink.graphite.port=2003 5 | *.sink.graphite.prefix=java 6 | master.source.jvm.class=org.apache.spark.metrics.source.JvmSource 7 | worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource 8 | driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource 9 | executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCCatalog.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table} 21 | import org.apache.spark.sql.util.Logging 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-04-08. 26 | */ 27 | class JDBCCatalog extends DelegatingCatalogExtension with Logging { 28 | 29 | override def name(): String = "JDBC" 30 | 31 | override def loadTable(ident: Identifier): Table = JDBCTable(ident) 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCDataWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import java.sql.Connection 21 | 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} 24 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.createConnectionFactory 25 | import org.apache.spark.sql.execution.datasources.jdbc.MyJDBCUtils._ 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.sql.util.Logging 28 | 29 | /** 30 | * @author kun.wan, 31 | * @date 2021-04-07. 32 | */ 33 | class JDBCDataWriter(schema: StructType, options: MyJDBCOptions) 34 | extends DataWriter[InternalRow] with Logging { 35 | 36 | val table = options.tableOrQuery 37 | val uniqueKeys: Set[String] = 38 | options.uniqueKeys.split(",").map(_.trim.toLowerCase).toSet 39 | 40 | val conn: Connection = createConnectionFactory(options)() 41 | conn.setAutoCommit(false) 42 | val (upsertSql, affectColumns, updateColumns) = upsertSqlAndColumns(conn, options) 43 | val stmt = conn.prepareStatement(upsertSql) 44 | 45 | val nameToIndex = schema.names.map(_.toLowerCase).zipWithIndex.toMap 46 | val setters = 47 | (affectColumns ++ updateColumns).zipWithIndex.map { case (column, pos) => 48 | val fieldIndex = nameToIndex(column.toLowerCase) 49 | makeSetter(fieldIndex, pos + 1, schema.fields(fieldIndex).dataType) 50 | } 51 | 52 | var rowCount = 0 53 | val batchSize = options.batchSize 54 | 55 | override def write(row: InternalRow): Unit = { 56 | try { 57 | setters.map(_.apply(stmt, row)) 58 | } catch { 59 | case e: Exception => 60 | logError(s"fail to fill prepare statement params. Row=($row), statement=$stmt") 61 | throw e 62 | } 63 | 64 | stmt.addBatch() 65 | rowCount += 1 66 | if (rowCount % batchSize == 0) { 67 | val updateCounts = stmt.executeBatch().length 68 | // upsertCount.add(updateCounts) 69 | logInfo(s"commit JDBC PreparedStatement,affected rows = ${updateCounts}, " + 70 | s"statement counter = ${rowCount}") 71 | 72 | rowCount = 0 73 | } 74 | } 75 | 76 | override def commit(): WriterCommitMessage = { 77 | val updateCounts = stmt.executeBatch().length 78 | // upsertCount.add(updateCounts) 79 | logInfo(s"commit JDBC PreparedStatement,affected rows = ${updateCounts}, " + 80 | s"statement counter = ${rowCount}") 81 | conn.commit() 82 | new WriterCommitMessage() {} 83 | } 84 | 85 | override def abort(): Unit = { 86 | conn.rollback() 87 | } 88 | 89 | override def close(): Unit = { 90 | stmt.close() 91 | conn.close() 92 | } 93 | 94 | def upsertSqlAndColumns(conn: Connection, 95 | options: JDBCOptions): (String, Array[String], Array[String]) = { 96 | val tableSchema = JdbcUtils.getSchemaOption(conn, options) 97 | assert(tableSchema.isDefined, s"Fail to get $table in db, maybe $table does not exist") 98 | val tableColumnNames = tableSchema.get.fieldNames 99 | val rddSchemaNames = schema.names.map(_.toLowerCase) 100 | val affectColumns = tableColumnNames.filter(col => rddSchemaNames.contains(col.toLowerCase)) 101 | val updateColumns = affectColumns.filter(col => !uniqueKeys.contains(col.toLowerCase)) 102 | tableColumnNames.filterNot(affectColumns.contains) 103 | .foreach(col => logWarning(s"row schema doesn't contains column : {${col} }")) 104 | 105 | val upsertSql = 106 | s""" 107 | |INSERT INTO ${table} (${affectColumns.mkString(", ")}) 108 | |VALUES ( ${affectColumns.map(_ => "?").mkString(", ")} ) 109 | |ON DUPLICATE KEY UPDATE ${updateColumns.map(_ + "= ?").mkString(", ")} 110 | |""".stripMargin 111 | logInfo(s"upsert sql : 
$upsertSql") 112 | (upsertSql, affectColumns, updateColumns) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import java.sql.Connection 21 | import java.util 22 | 23 | import scala.collection.JavaConverters._ 24 | import scala.collection.mutable 25 | 26 | import org.apache.spark.sql.connector.catalog._ 27 | import org.apache.spark.sql.connector.read.ScanBuilder 28 | import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} 29 | import org.apache.spark.sql.runner.container.ConfigContainer 30 | import org.apache.spark.sql.types.StructType 31 | import org.apache.spark.sql.util.{CaseInsensitiveStringMap, Logging} 32 | 33 | /** 34 | * @author kun.wan, 35 | * @date 2021-04-07. 
36 | * 37 | * 一般的实现里,会有一个Source类,继承 RelationProvider 和 TableProvider,负责提供Relation 和 Table对象。 38 | * 然后调用 DataSourceV2Utils.getTableFromProvider() 方法,从Provider 获取table实例的方法,但是我感觉这样 39 | * 还不如直接new 一个Table实例方便,那样做反而更绕了~~ 40 | */ 41 | case class JDBCTable(ident: Identifier) extends Table 42 | with SupportsRead 43 | with SupportsWrite 44 | with Logging { 45 | 46 | import MyJDBCOptions._ 47 | 48 | val namespace = ident.namespace()(0) 49 | val relationName = ident.name() 50 | 51 | val tableOrQuery = 52 | ConfigContainer.getOrElse(s"$namespace.$relationName.query", ident.name()) 53 | 54 | val jdbcOptions = { 55 | val parameters = mutable.Map( 56 | JDBC_URL -> ConfigContainer.get(s"$namespace.url"), 57 | "user" -> ConfigContainer.get(s"$namespace.username"), 58 | "password" -> ConfigContainer.get(s"$namespace.password"), 59 | JDBC_TABLE_NAME -> tableOrQuery 60 | ) 61 | Seq( 62 | JDBC_PARTITION_COLUMN, 63 | JDBC_NUM_PARTITIONS, 64 | JDBC_QUERY_TIMEOUT, 65 | JDBC_BATCH_FETCH_SIZE, 66 | JDBC_PUSHDOWN_PREDICATE, 67 | JDBC_UNIQUE_KEYS 68 | ).map(optionName => optionName -> s"$namespace.$relationName.$optionName") 69 | .filter(option => ConfigContainer.contains(option._2)) 70 | .foreach { option => parameters += (option._1 -> ConfigContainer.get(option._2)) } 71 | 72 | // 读数据使用新的分区算法,JDBC_PARTITION_COLUMN 为必须参数,JDBC_LOWER_BOUND, JDBC_UPPER_BOUND 传入伪参数 73 | if (parameters.contains(JDBC_PARTITION_COLUMN)) { 74 | parameters += (JDBC_LOWER_BOUND -> "0") 75 | parameters += (JDBC_UPPER_BOUND -> "0") 76 | } 77 | 78 | // JDBC 更新数据时需要准备好更新的表的数据主键 79 | new MyJDBCOptions(parameters.toMap) 80 | } 81 | 82 | override def name(): String = ident.toString 83 | 84 | /** 85 | * JDBC表写的时候,schema通过child Plan自动解析生成 86 | * JDBC表读的时候,进行schema自动推测 87 | * @return 88 | */ 89 | override def schema(): StructType = { 90 | if (ConfigContainer.contains(s"${ident.toString}.schemaDDL")) { 91 | StructType.fromDDL(ConfigContainer.get(s"${ident.toString}.schemaDDL")) 92 | } else { 93 | val conn: Connection = MyJDBCUtils.createConnectionFactory(jdbcOptions)() 94 | try { 95 | JdbcUtils.getSchemaOption(conn, jdbcOptions).get 96 | } finally { 97 | conn.close() 98 | } 99 | } 100 | } 101 | 102 | override def capabilities(): util.Set[TableCapability] = 103 | Set(TableCapability.BATCH_READ, 104 | TableCapability.BATCH_WRITE).asJava 105 | 106 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { 107 | Seq( 108 | JDBC_URL, 109 | "user", 110 | "password", 111 | JDBC_TABLE_NAME, 112 | JDBC_PARTITION_COLUMN, 113 | JDBC_NUM_PARTITIONS 114 | ).foreach { option => 115 | require(jdbcOptions.parameters.contains(option), 116 | s"parameter $option is needed in JDBC read") 117 | } 118 | 119 | new JDBCScanBuilder(schema, jdbcOptions) 120 | } 121 | 122 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { 123 | Seq( 124 | JDBC_URL, 125 | "user", 126 | "password", 127 | JDBC_TABLE_NAME, 128 | JDBC_UNIQUE_KEYS 129 | ).foreach { option => 130 | require(jdbcOptions.parameters.contains(option), 131 | s"parameter $option is needed in JDBC write") 132 | } 133 | 134 | new JDBCWriteBuilder(schema, jdbcOptions) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCWriteBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import org.apache.spark.sql.catalyst.InternalRow 21 | import org.apache.spark.sql.connector.write._ 22 | import org.apache.spark.sql.types.StructType 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2021-04-07. 27 | */ 28 | class JDBCWriteBuilder(schema: StructType, options: MyJDBCOptions) extends WriteBuilder { 29 | 30 | override def buildForBatch(): BatchWrite = new JDBCBatchWrite(schema, options) 31 | 32 | } 33 | 34 | class JDBCBatchWrite(schema: StructType, options: MyJDBCOptions) extends BatchWrite { 35 | 36 | override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = 37 | new JDBCDataWriterFactory(schema, options) 38 | 39 | override def commit(messages: Array[WriterCommitMessage]): Unit = {} 40 | 41 | override def abort(messages: Array[WriterCommitMessage]): Unit = {} 42 | } 43 | 44 | class JDBCDataWriterFactory(schema: StructType, options: MyJDBCOptions) extends DataWriterFactory { 45 | 46 | override def createWriter(partitionId: Int, taskId: Long): DataWriter[InternalRow] = 47 | new JDBCDataWriter(schema, options) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/MyJDBCOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 23 | 24 | /** 25 | * @author kun.wan, 26 | * 27 | * @date 2021-04-08. 
28 | * 29 | * Spark内置的JDBCOptions 不会序列化用户传入的自定义属性,所以直接自己干 30 | */ 31 | case class MyJDBCOptions(@transient override val parameters: CaseInsensitiveMap[String]) 32 | extends JDBCOptions(parameters) { 33 | 34 | import JDBCOptions._ 35 | 36 | def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) 37 | 38 | def this(url: String, table: String, parameters: Map[String, String]) = { 39 | this(CaseInsensitiveMap(parameters ++ Map( 40 | JDBCOptions.JDBC_URL -> url, 41 | JDBCOptions.JDBC_TABLE_NAME -> table))) 42 | } 43 | 44 | require( 45 | parameters.get(JDBC_TABLE_NAME).isDefined, 46 | s"Option '$JDBC_TABLE_NAME' is required. " + 47 | s"Option '$JDBC_QUERY_STRING' is not applicable while writing.") 48 | 49 | val uniqueKeys = parameters.getOrElse(MyJDBCOptions.JDBC_UNIQUE_KEYS, "") 50 | 51 | var filterWhereClause = parameters.getOrElse(MyJDBCOptions.JDBC_FILTER_WHERE_CLAUSE, "") 52 | 53 | } 54 | 55 | object MyJDBCOptions { 56 | 57 | private val jdbcOptionNames = collection.mutable.Set[String]() 58 | 59 | private def newOption(name: String): String = { 60 | jdbcOptionNames += name.toLowerCase(Locale.ROOT) 61 | name 62 | } 63 | 64 | val JDBC_URL = newOption("url") 65 | val JDBC_TABLE_NAME = newOption("dbtable") 66 | val JDBC_QUERY_STRING = newOption("query") 67 | val JDBC_DRIVER_CLASS = newOption("driver") 68 | val JDBC_PARTITION_COLUMN = newOption("partitionColumn") 69 | val JDBC_LOWER_BOUND = newOption("lowerBound") 70 | val JDBC_UPPER_BOUND = newOption("upperBound") 71 | val JDBC_NUM_PARTITIONS = newOption("numPartitions") 72 | val JDBC_QUERY_TIMEOUT = newOption("queryTimeout") 73 | val JDBC_BATCH_FETCH_SIZE = newOption("fetchsize") 74 | val JDBC_TRUNCATE = newOption("truncate") 75 | val JDBC_CASCADE_TRUNCATE = newOption("cascadeTruncate") 76 | val JDBC_CREATE_TABLE_OPTIONS = newOption("createTableOptions") 77 | val JDBC_CREATE_TABLE_COLUMN_TYPES = newOption("createTableColumnTypes") 78 | val JDBC_CUSTOM_DATAFRAME_COLUMN_TYPES = newOption("customSchema") 79 | val JDBC_BATCH_INSERT_SIZE = newOption("batchsize") 80 | val JDBC_TXN_ISOLATION_LEVEL = newOption("isolationLevel") 81 | val JDBC_SESSION_INIT_STATEMENT = newOption("sessionInitStatement") 82 | val JDBC_PUSHDOWN_PREDICATE = newOption("pushDownPredicate") 83 | val JDBC_UNIQUE_KEYS = newOption("uniqueKeys") 84 | val JDBC_FILTER_WHERE_CLAUSE = newOption("filterWhereClause") 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/kafka/KafkaCatalog.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.kafka 19 | 20 | import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table} 21 | import org.apache.spark.sql.util.Logging 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-04-08. 26 | */ 27 | class KafkaCatalog extends DelegatingCatalogExtension with Logging { 28 | 29 | override def name(): String = "KAFKA" 30 | 31 | override def loadTable(ident: Identifier): Table = KafkaTable(ident) 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/kafka/KafkaOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.kafka 19 | 20 | import java.util.Properties 21 | 22 | import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} 23 | import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer} 24 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig} 25 | import org.apache.kafka.common.serialization.StringSerializer 26 | 27 | import scala.reflect.ClassTag 28 | 29 | import scala.collection.JavaConverters._ 30 | 31 | /** 32 | * @author kun.wan, 33 | * @date 2020-07-13. 
34 | */ 35 | case class KafkaOptions(name: String, config: Map[String, String]) extends Serializable { 36 | val bootstrapServers = config(s"kafka.bootstrap.servers") 37 | val schemaRegistryUrl = config.getOrElse(s"kafka.schema.registry.url", "") 38 | 39 | val topic = config(s"kafka.${name}.kafkaTopic") 40 | val recordType: String = config(s"kafka.${name}.recordType") 41 | val avroName = config.getOrElse(s"kafka.${name}.avro.name", "") 42 | val avroNamespace = config.getOrElse(s"kafka.${name}.avro.namespace", "") 43 | val fieldMapping = config.getOrElse(s"kafka.${name}.avro.fieldMapping", "") 44 | val avroForceCreate = config.getOrElse(s"kafka.${name}.avro.forceCreate", "false") 45 | 46 | val maxRatePerPartition = config.getOrElse(s"kafka.${name}.maxRatePerPartition", "10000000").toInt 47 | 48 | lazy val fieldMappingMap = { 49 | val objectMapper = new ObjectMapper 50 | if (fieldMapping != "") { 51 | objectMapper.readTree(fieldMapping) 52 | .asScala 53 | .map(f => f.path("name").textValue() -> f) 54 | .toMap 55 | } else { 56 | Map[String, JsonNode]() 57 | } 58 | } 59 | 60 | lazy val serialClass: Class[_] = recordType match { 61 | case JSON_TYPE => 62 | classOf[StringSerializer] 63 | case AVRO_TYPE => 64 | classOf[KafkaAvroSerializer] 65 | } 66 | 67 | val JSON_TYPE: String = "json" 68 | val AVRO_TYPE: String = "avro" 69 | 70 | def initProducer[T: ClassTag](): KafkaProducer[String, T] = { 71 | val properties = new Properties 72 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers) 73 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer]) 74 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serialClass) 75 | properties.put(ProducerConfig.ACKS_CONFIG, "all") 76 | properties.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl) 77 | new KafkaProducer[String, T](properties) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/kafka/KafkaTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.kafka 19 | 20 | import java.util 21 | 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.connector.catalog.{Identifier, SupportsWrite, Table, TableCapability} 24 | import org.apache.spark.sql.connector.write._ 25 | import org.apache.spark.sql.types.StructType 26 | import scala.collection.JavaConverters._ 27 | 28 | import org.apache.spark.sql.runner.container.ConfigContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2021-04-06. 
33 | */ 34 | case class KafkaTable(ident: Identifier) extends Table with SupportsWrite { 35 | 36 | override def name(): String = ident.toString 37 | 38 | override def schema(): StructType = 39 | StructType.fromDDL(ConfigContainer.get(s"${ident.toString}.schemaDDL")) 40 | 41 | override def capabilities(): util.Set[TableCapability] = 42 | Set(TableCapability.BATCH_WRITE).asJava 43 | 44 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = 45 | new KafkaWriteBuilder(ident.name(), schema()) 46 | } 47 | 48 | class KafkaWriteBuilder(name: String, schema: StructType) extends WriteBuilder { 49 | 50 | override def buildForBatch(): BatchWrite = new KafkaBatchWrite(name, schema) 51 | 52 | } 53 | 54 | class KafkaBatchWrite(name: String, schema: StructType) extends BatchWrite { 55 | 56 | override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = 57 | new KafkaDataWriterFactory(name, schema) 58 | 59 | override def commit(messages: Array[WriterCommitMessage]): Unit = {} 60 | 61 | override def abort(messages: Array[WriterCommitMessage]): Unit = {} 62 | } 63 | 64 | class KafkaDataWriterFactory(name: String, schema: StructType) extends DataWriterFactory { 65 | 66 | val kafkaOption: KafkaOptions = KafkaOptions(name, ConfigContainer.valueMap.get()) 67 | 68 | override def createWriter(partitionId: Int, taskId: Long): DataWriter[InternalRow] = 69 | new KafkaDataWriter(kafkaOption, schema) 70 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hive/SqlRunnerMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.execution.SparkPlan 22 | import org.apache.spark.sql.execution.command.DataWritingCommandExec 23 | import org.apache.spark.sql.execution.metric.SQLMetric 24 | import org.apache.spark.sql.hive.execution.{HiveTableScanExec, InsertIntoHiveTable} 25 | 26 | /** 27 | * @author kun.wan, 28 | * @date 2020-04-29. 
29 | */ 30 | object SqlRunnerMetrics extends Logging { 31 | 32 | def logSparkPlanMetrics(plan: SparkPlan): Unit = plan match { 33 | case HiveTableScanExec(_, relation, _) => 34 | logInfo(s"source ${relation.nodeName}(${relation.tableMeta.identifier}) metrics : ${formatMetrics(plan.metrics)}") 35 | case DataWritingCommandExec(cmd: InsertIntoHiveTable, _) => 36 | logInfo(s"Insert table ${cmd.table.identifier} metrics : ${formatMetrics(plan.metrics)}") 37 | 38 | case _ => 39 | } 40 | 41 | def formatMetrics(metrics: Map[String, SQLMetric]): Map[String, Long] = metrics.map { 42 | case (name: String, metric: SQLMetric) => 43 | name -> metric.value 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/CollectValueRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnresolvedHint} 23 | import org.apache.spark.sql.catalyst.rules.Rule 24 | import org.apache.spark.sql.util.OptimizerUtil.parseHintParameter 25 | 26 | import org.apache.spark.sql.runner.callback.{ArrayValueCollector, DataCallBackFactory, SingleValueCollector} 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2020-09-15. 31 | */ 32 | object CollectValueRule extends Rule[LogicalPlan] { 33 | 34 | override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { 35 | case hint@UnresolvedHint(hintName, parameters, child) => hintName.toUpperCase(Locale.ROOT) match { 36 | case "COLLECT_VALUE" => 37 | val name: String = parseHintParameter(parameters(0)) 38 | val columnName: String = parseHintParameter(parameters(1)) 39 | DataCallBackFactory.registerDataCallBack(SingleValueCollector(name, columnName)) 40 | 41 | child 42 | 43 | case "COLLECT_ARRAY" => 44 | val name: String = parseHintParameter(parameters(0)) 45 | val columnName: String = parseHintParameter(parameters(1)) 46 | DataCallBackFactory.registerDataCallBack(ArrayValueCollector(name, columnName)) 47 | 48 | child 49 | 50 | case _ => hint 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/DataQualityRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.catalyst.expressions.Literal 24 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 25 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, UnresolvedHint} 26 | import org.apache.spark.sql.catalyst.rules.Rule 27 | import org.apache.spark.sql.{Column, SparkSession} 28 | import org.apache.spark.util.IdGenerator 29 | 30 | import org.apache.spark.sql.runner.callback.{DataCallBackFactory, DataCheckCallBack} 31 | 32 | /** 33 | * @author kun.wan, 34 | * @date 2021-02-20. 35 | */ 36 | case class DataQualityRule(spark: SparkSession) extends Rule[LogicalPlan] { 37 | 38 | import DataQualityRule._ 39 | 40 | override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { 41 | case hint @ UnresolvedHint(hintName, parameters, child) => hintName.toUpperCase(Locale.ROOT) match { 42 | case "DATA_CHECK" => 43 | val checkTitle: String = parameters.head.toString 44 | val dataCheckExpressions = 45 | parameters.tail map { case literal: Literal => 46 | val expression = literal.toString() 47 | val checkResultColumn = generateDataCheckColumnName() 48 | val column = Column.apply(CatalystSqlParser.parseExpression(expression)).as(checkResultColumn) 49 | column.named.children.head.children.find { expr => child.output.contains(expr) } match { 50 | case Some(originColumnExpr) => 51 | DataCallBackFactory.registerDataCallBack( 52 | DataCheckCallBack(checkTitle, 53 | child.output.find( p => p == originColumnExpr).get.name, 54 | checkResultColumn, 55 | expression)) 56 | column.named 57 | 58 | case _ => 59 | throw new RuntimeException("Data check column not matched!") 60 | } 61 | } 62 | 63 | Project(child.output ++ dataCheckExpressions, child) 64 | 65 | case _ => hint 66 | } 67 | } 68 | } 69 | 70 | object DataQualityRule extends Logging { 71 | private val ID_GENERATOR = new IdGenerator 72 | 73 | def generateDataCheckColumnName(): String = { 74 | s"__DATA_CHECK_${ID_GENERATOR.next}__" 75 | } 76 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/ExternalSinkRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnresolvedHint} 24 | import org.apache.spark.sql.catalyst.rules.Rule 25 | import org.apache.spark.sql.util.OptimizerUtil.parseHintParameter 26 | 27 | import org.apache.spark.sql.runner.callback.{DataCallBackFactory, EmailSink} 28 | import org.apache.spark.sql.runner.container.ConfigContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-09-15. 33 | */ 34 | case class ExternalSinkRule(spark: SparkSession) extends Rule[LogicalPlan] { 35 | 36 | override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { 37 | case hint@UnresolvedHint(hintName, parameters, child) => hintName.toUpperCase(Locale.ROOT) match { 38 | case "EMAIL_SINK" => 39 | val name = parseHintParameter(parameters(0)) 40 | DataCallBackFactory.registerDataCallBack(EmailSink(name, ConfigContainer.valueMap.get())) 41 | child 42 | 43 | case _ => hint 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/ExternalTableRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, UnresolvedRelation} 22 | import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, LogicalPlan, With} 23 | import org.apache.spark.sql.catalyst.rules.Rule 24 | import org.apache.spark.sql.execution.QueryExecution 25 | import org.apache.spark.sql.runner.container.ConfigContainer 26 | 27 | /** 28 | * @author kun.wan, 29 | * @date 2021-04-07. 
30 | */ 31 | case class ExternalTableRule(spark: SparkSession) extends Rule[LogicalPlan] { 32 | 33 | import spark.sessionState.analyzer._ 34 | 35 | // from Analyzer 36 | private def isResolvingView: Boolean = AnalysisContext.get.catalogAndNamespace.nonEmpty 37 | 38 | // If we are resolving relations insides views, we need to expand single-part relation names with 39 | // the current catalog and namespace of when the view was created. 40 | private def expandRelationName(nameParts: Seq[String]): Seq[String] = { 41 | if (!isResolvingView) return nameParts 42 | 43 | if (nameParts.length == 1) { 44 | AnalysisContext.get.catalogAndNamespace :+ nameParts.head 45 | } else if (spark.sessionState.catalogManager.isCatalogRegistered(nameParts.head)) { 46 | nameParts 47 | } else { 48 | AnalysisContext.get.catalogAndNamespace.head +: nameParts 49 | } 50 | } 51 | 52 | def setSchemaDDL(u: UnresolvedRelation, child: LogicalPlan): Unit = { 53 | expandRelationName(u.multipartIdentifier) match { 54 | case NonSessionCatalogAndIdentifier(catalog, ident) => 55 | val schemaDDL = new QueryExecution(spark, child).analyzed.schema.toDDL 56 | ConfigContainer :+ (s"${ident.toString}.schemaDDL" -> schemaDDL) 57 | 58 | case _ => 59 | } 60 | } 61 | 62 | override def apply(plan: LogicalPlan): LogicalPlan = { 63 | plan match { 64 | case InsertIntoStatement(u: UnresolvedRelation, _, _, query: LogicalPlan, _, _) => 65 | setSchemaDDL(u, query) 66 | 67 | case With(InsertIntoStatement(u: UnresolvedRelation, _, _, query: LogicalPlan, _, _), cteRelations) => 68 | setSchemaDDL(u, With(query, cteRelations)) 69 | 70 | case _ => 71 | } 72 | plan 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/InsightExtensions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SparkSessionExtensions 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-04-17. 26 | */ 27 | class InsightExtensions extends (SparkSessionExtensions => Unit) with Logging { 28 | def apply(e: SparkSessionExtensions): Unit = { 29 | e.injectOptimizerRule(RepartitionRule) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/PartitionScanLimitRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation} 22 | import org.apache.spark.sql.catalyst.expressions._ 23 | import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan} 24 | import org.apache.spark.sql.catalyst.rules.Rule 25 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} 26 | import org.apache.spark.sql.{AnalysisException, SparkSession} 27 | 28 | import scala.collection.mutable.ArrayBuffer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-07-28. 33 | */ 34 | case class PartitionScanLimitRule(spark: SparkSession) extends Rule[LogicalPlan] with Logging { 35 | 36 | val partitionScanLimitEnable: Boolean = 37 | spark.conf.get("spark.partition.scan.limit.enable", "true").toBoolean 38 | 39 | def conditionCheck(partitionColNames: Seq[String], 40 | filters: ArrayBuffer[Expression], 41 | tableMeta: CatalogTable): Unit = { 42 | val filteredAttributes = filters.flatMap(_.references.map(_.name.toLowerCase)) 43 | if ((partitionColNames.map(_.toLowerCase) intersect filteredAttributes).isEmpty) { 44 | val table = tableMeta.identifier 45 | throw new AnalysisException( 46 | s"""No partition column filter condition found for table $table 47 | |partitionColNames : ${partitionColNames.mkString(", ")} 48 | |filteredAttributes : $filteredAttributes 49 | |""".stripMargin) 50 | } 51 | } 52 | 53 | def checkRelationFilters(plan: LogicalPlan, filters: ArrayBuffer[Expression]): Unit = 54 | plan match { 55 | case Filter(condition, child) if condition.deterministic => 56 | checkRelationFilters(child, filters :+ condition) 57 | 58 | case HiveTableRelation(catalogTable, _, partitionCols, _, _) 59 | if partitionCols.nonEmpty => 60 | val partitionColNames = partitionCols.map(_.name) 61 | conditionCheck(partitionColNames, filters, catalogTable) 62 | 63 | case LogicalRelation(relation: HadoopFsRelation, _, catalogTableOpt, _) => 64 | relation.partitionSchemaOption.foreach { partitionSchema => 65 | val partitionColNames = partitionSchema.fieldNames 66 | conditionCheck(partitionColNames, filters, catalogTableOpt.get) 67 | } 68 | 69 | case Join(left, right, _, _, _) => 70 | checkRelationFilters(left, ArrayBuffer[Expression]()) 71 | checkRelationFilters(right, ArrayBuffer[Expression]()) 72 | 73 | case _ => 74 | plan.children.foreach(checkRelationFilters(_, filters)) 75 | } 76 | 77 | override def apply(plan: LogicalPlan): LogicalPlan = { 78 | if (partitionScanLimitEnable) { 79 | checkRelationFilters(plan, ArrayBuffer[Expression]()) 80 | } 81 | plan 82 | } 83 | } 84 | -------------------------------------------------------------------------------- 
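In practical terms, when `spark.partition.scan.limit.enable` is left at its default of `true`, any query that scans a partitioned table without a filter on at least one partition column fails analysis with the `AnalysisException` raised in `conditionCheck` above. A minimal SQL sketch of both cases, using a hypothetical table `demo.events` partitioned by `dt` (the table is illustrative, not part of this repository):

```sql
-- Rejected by PartitionScanLimitRule: the scan carries no filter on the partition column dt
SELECT count(1) FROM demo.events;

-- Accepted: the filter references the partition column, so only one partition is scanned
SELECT count(1)
FROM demo.events e
WHERE e.dt = '20210401';
```

Setting `spark.partition.scan.limit.enable` to `false` in the Spark configuration disables the check entirely.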
/src/main/scala/org/apache/spark/sql/optimizer/RepartitionRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics 21 | import org.apache.hadoop.hive.common.StatsSetupConst 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTablePartition} 25 | import org.apache.spark.sql.catalyst.dsl.expressions._ 26 | import org.apache.spark.sql.catalyst.expressions.SortOrder 27 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Repartition, RepartitionByExpression, _} 28 | import org.apache.spark.sql.catalyst.rules.Rule 29 | import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand 30 | import org.apache.spark.sql.hive.execution.InsertIntoHiveTable 31 | import org.apache.spark.sql.util.SystemVariables.INDEX_COLUMN_NAME 32 | 33 | /** 34 | * @author kun.wan, 35 | * @date 2020-04-17. 
36 | */ 37 | case class RepartitionRule(spark: SparkSession) extends Rule[LogicalPlan] with Logging { 38 | 39 | val DEFAULT_PARTITION_SIZE = 64 * 1024 * 1024L 40 | val SAMPLING_PARTITIONS = 10 41 | 42 | val analyzer = spark.sessionState.analyzer 43 | val catalog = SparkSession.active.sessionState.catalog 44 | 45 | override def apply(plan: LogicalPlan): LogicalPlan = { 46 | val newPlan = plan transform { 47 | case InsertIntoHiveTable(table, partition, query, overwrite, partitionExists, outputCols) 48 | if table.partitionColumnNames.size > 0 && checkQueryType(query) => 49 | 50 | val newQuery: LogicalPlan = transformQuery(table, query) 51 | InsertIntoHiveTable(table, partition, newQuery, overwrite, partitionExists, outputCols) 52 | 53 | case InsertIntoHadoopFsRelationCommand(outputPath, staticPartitions, ifPartitionNotExists, 54 | partitionColumns, bucketSpec, fileFormat, options, query, mode, catalogTable, fileIndex, 55 | outputColumnNames) 56 | if catalogTable.isDefined && (staticPartitions.size + partitionColumns.size) > 0 57 | && checkQueryType(query) => 58 | val newQuery = 59 | transformQuery(catalogTable.get, query) 60 | 61 | InsertIntoHadoopFsRelationCommand( 62 | outputPath, 63 | staticPartitions, 64 | ifPartitionNotExists, 65 | partitionColumns, 66 | bucketSpec, 67 | fileFormat, 68 | options, 69 | newQuery, 70 | mode, 71 | catalogTable, 72 | fileIndex, 73 | outputColumnNames) 74 | } 75 | if (!newPlan.fastEquals(plan)) { 76 | logDebug(s"plan after RepartitionRule:\n$newPlan") 77 | } 78 | newPlan 79 | } 80 | 81 | private def checkQueryType(query: LogicalPlan): Boolean = { 82 | !query.isInstanceOf[Sort] && !query.isInstanceOf[Repartition] && 83 | !query.isInstanceOf[RepartitionByExpression] 84 | } 85 | 86 | private def transformQuery(table: CatalogTable, query: LogicalPlan): LogicalPlan = { 87 | val tableName = table.identifier 88 | val sortExprsOpt: Option[Seq[SortOrder]] = 89 | table.properties.get(INDEX_COLUMN_NAME).map(indexColumn => { 90 | val order = Symbol(indexColumn).attr.asc 91 | Seq(analyzer.resolveExpressionBottomUp(order, query).asInstanceOf[SortOrder]) 92 | }) 93 | 94 | val numPartitionsOpt = repartitionNumbers(catalog.listPartitions(tableName)) 95 | (sortExprsOpt, numPartitionsOpt) match { 96 | case (Some(sortExprs), Some(numPartitions)) => 97 | RepartitionByExpression(sortExprs, query, numPartitions) 98 | 99 | case (Some(sortExprs), None) => Sort(sortExprs, true, query) 100 | case (None, Some(numPartitions)) => Repartition(numPartitions, true, query) 101 | case (None, None) => query 102 | } 103 | } 104 | 105 | /** 106 | * 1. 根据分区创建时间倒排序,取最近创建的分区 107 | * 2. 
sample 采样10个分区元数据来计算分区个数,取结果中位数 108 | * @param partitions 109 | * @return 110 | */ 111 | def repartitionNumbers(partitions: Seq[CatalogTablePartition]): Option[Int] = { 112 | 113 | val stats = new DescriptiveStatistics 114 | if (log.isDebugEnabled) { 115 | partitions.foreach(p => logDebug(s"got partition ${p.simpleString}")) 116 | } 117 | partitions.filter(_.parameters.contains(StatsSetupConst.TOTAL_SIZE)) 118 | .sortWith((p1, p2) => p1.createTime > p2.createTime) 119 | .slice(0, SAMPLING_PARTITIONS) 120 | .foreach { p => 121 | stats.addValue(p.parameters.get(StatsSetupConst.TOTAL_SIZE).get.toLong 122 | / DEFAULT_PARTITION_SIZE) 123 | } 124 | if (stats.getPercentile(50).isNaN) { 125 | None 126 | } else { 127 | val number = stats.getPercentile(50).toInt + 1 128 | if (number > 0) { 129 | Some(number) 130 | } else { 131 | None 132 | } 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/plugin/AsyncProfilePlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.plugin 19 | 20 | import java.lang.management.ManagementFactory 21 | 22 | import javax.management.ObjectName 23 | import one.profiler.AsyncProfiler 24 | 25 | /** 26 | * @author kun.wan, 27 | * @date 2020-05-14. 28 | */ 29 | class AsyncProfilePlugin extends ProfilePlugin { 30 | 31 | var profiler: AsyncProfiler = _ 32 | 33 | override def init0(): Unit = { 34 | profileFile = s"${logDir}/${containerId}.${profileType}" 35 | 36 | profiler = AsyncProfiler.getInstance() 37 | ManagementFactory.getPlatformMBeanServer().registerMBean( 38 | profiler, 39 | new ObjectName("one.profiler:type=AsyncProfiler") 40 | ) 41 | if (!manualProfile) { 42 | logInfo(profiler.execute(s"start,${profileType},file=${profileFile}")) 43 | } 44 | } 45 | 46 | override def shutdown0(): Unit = { 47 | logInfo(profiler.execute(s"stop,file=${profileFile}")) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/plugin/ProfilePlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.plugin 19 | 20 | import java.util.{Map => JMap} 21 | 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.util.SignalUtils 27 | import org.apache.spark.SparkConf 28 | import org.apache.spark.api.plugin.{ExecutorPlugin, PluginContext} 29 | 30 | import scala.reflect.io.File 31 | 32 | 33 | /** 34 | * @author kun.wan, 35 | * @date 2020-05-26. 36 | */ 37 | abstract class ProfilePlugin extends ExecutorPlugin with Logging { 38 | 39 | val pluginName = this.getClass.getName.stripSuffix("$") 40 | 41 | var conf: SparkConf = _ 42 | var manualProfile: Boolean = _ 43 | var profileType: String = _ 44 | 45 | val logDir = System.getProperty("spark.yarn.app.container.log.dir") 46 | val containerId = YarnSparkHadoopUtil.getContainerId 47 | val applicationAttemptId = containerId.getApplicationAttemptId 48 | val applicationId = applicationAttemptId.getApplicationId 49 | 50 | var profileFile: String = _ 51 | 52 | val fs = FileSystem.get(new Configuration()) 53 | var shutdownFlag = false 54 | 55 | def init0(): Unit = {} 56 | 57 | def shutdown0(): Unit = {} 58 | 59 | override def init(ctx: PluginContext, extraConf: JMap[String, String]): Unit = { 60 | conf = ctx.conf() 61 | manualProfile = conf.getBoolean("spark.profile.manualprofile", false) 62 | profileType = conf.get("spark.profile.type", "jfr") 63 | 64 | init0() 65 | logInfo(s"init ProfileExecutorPlugin") 66 | 67 | // Handle SIGTERM from NodeManager 68 | Seq("TERM", "HUP", "INT").foreach { sig => 69 | SignalUtils.register(sig) { 70 | log.error("Executor RECEIVED SIGNAL " + sig) 71 | while(!shutdownFlag) { 72 | Thread sleep 100 73 | log.error("Executor shutdown loopback. SIGNAL " + sig) 74 | } 75 | log.error("ProfilePlugin Shutdown loop end. SIGNAL " + sig) 76 | false 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * 1. Shutdown method is already a ShutdownHook. 83 | * 2. Executor may be killed by NodeManager before the shutdown method is finished. 84 | * The default wait time is 250ms defined by sleepDelayBeforeSigKill in ContainerLaunch Service. 
85 | */ 86 | override def shutdown(): Unit = { 87 | if (!manualProfile) { 88 | logInfo(s"shutdown ${pluginName}") 89 | shutdown0() 90 | 91 | logInfo("begin upload executor profile file.") 92 | 93 | val srcPath = new Path(profileFile) 94 | val dstPath = new Path(s"/metadata/logs/profile/${applicationId}/" + 95 | s"${applicationAttemptId.getAttemptId}/${containerId}.${profileType}") 96 | logInfo(s"profileFile :${srcPath} hdfs path : ${dstPath}") 97 | fs.copyFromLocalFile(true, true, srcPath, dstPath) 98 | File(profileFile).delete() 99 | } 100 | logInfo(s"end ${pluginName}") 101 | shutdownFlag = true 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/plugin/YourkitPlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.plugin 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2020-05-14. 23 | */ 24 | class YourkitPlugin extends ProfilePlugin { 25 | 26 | override def shutdown0(): Unit = { 27 | val controllerCls = Class.forName("com.yourkit.api.Controller") 28 | val controller = controllerCls.newInstance() 29 | 30 | val displayNameMethod = controllerCls.getMethod("capturePerformanceSnapshot") 31 | profileFile = displayNameMethod.invoke(controller).asInstanceOf[String] 32 | 33 | val stopCpuProfilingMethod = controllerCls.getMethod("stopCpuProfiling") 34 | stopCpuProfilingMethod.invoke(controller) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/Alert.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.util.{DQUtil, SystemVariables} 22 | import org.apache.spark.sql.runner.container.ConfigContainer 23 | 24 | /** 25 | * Send an alert when a job run fails and is not a test or dry run. 26 | * 27 | * @author kun.wan, 28 | * @date 2020-02-26. 29 | */ 30 | object Alert extends ArgParser with Logging { 31 | def main(args: Array[String]): Unit = { 32 | if (!args.contains("--test") && !args.contains("--dryrun")) { 33 | parseArgument(args) 34 | val env = ConfigContainer.getOrElse(SystemVariables.ENV, SystemVariables.DEFAULT_ENV) 35 | 36 | val alertMessage = s"$env : 程序 ${args(0)} 运行失败,请检查!" 37 | logError(alertMessage) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/ArgParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import java.time.LocalDateTime 21 | 22 | import org.apache.commons.io.FilenameUtils 23 | import org.apache.commons.lang3.StringUtils 24 | 25 | import org.apache.spark.sql.util.SystemVariables 26 | 27 | import scala.collection.mutable.ArrayBuffer 28 | import scala.io.Source 29 | 30 | import org.apache.spark.sql.runner.command.{BaseCommand, BlockCommentCommand, CommandFactory} 31 | import org.apache.spark.sql.runner.config.ApolloClient 32 | import org.apache.spark.sql.runner.container.ConfigContainer 33 | 34 | /** 35 | * @author kun.wan, 36 | * @date 2020-06-03. 
37 | */ 38 | class ArgParser { 39 | 40 | var batchTimesOpt: Option[Seq[LocalDateTime]] = None 41 | var startDate: Option[LocalDateTime] = None 42 | var endDate: Option[LocalDateTime] = None 43 | var dateRangeStep: Int = 1 44 | var jobFile: String = _ 45 | var commands: Array[BaseCommand] = _ 46 | 47 | def parseArgument(args: Array[String]): Unit = { 48 | if (args.length < 1) { 49 | println("job configuration file must be found!") 50 | System.exit(-1) 51 | } 52 | 53 | val leftArgs = new ArrayBuffer[String]() 54 | var argv = args.toList 55 | 56 | 57 | while (!argv.isEmpty) { 58 | argv match { 59 | case "--dateRange" :: startDateStr :: endDateStr :: tail => 60 | startDate = Some(LocalDateTime.parse(startDateStr)) 61 | endDate = Some(LocalDateTime.parse(endDateStr)) 62 | argv = tail 63 | case "--dates" :: dates :: tail => 64 | batchTimesOpt = Some(dates.split(",").map(LocalDateTime.parse(_)).toSeq) 65 | argv = tail 66 | case "--config" :: value :: tail => 67 | val tup = value.split("=") 68 | ConfigContainer :+ (tup(0) -> tup(1)) 69 | argv = tail 70 | case "--profile" :: tail => 71 | ConfigContainer :+ ("spark.profile" -> "true") 72 | argv = tail 73 | case "--dryrun" :: tail => 74 | ConfigContainer :+ ("dryrun" -> "true") 75 | argv = tail 76 | case "--dateRangeStep" :: dateRangeStepStr :: tail => 77 | dateRangeStep = dateRangeStepStr.toInt 78 | argv = tail 79 | case head :: tail if head != null => 80 | leftArgs.append(head) 81 | argv = tail 82 | } 83 | } 84 | 85 | jobFile = leftArgs(0) 86 | 87 | ConfigContainer :+ (SystemVariables.JOB_NAME -> FilenameUtils.getBaseName(jobFile)) 88 | 89 | if (StringUtils.isNotBlank(System.getenv(SystemVariables.APOLLO_META))) { 90 | ConfigContainer :+ (SystemVariables.APOLLO_META -> System.getenv(SystemVariables.APOLLO_META)) 91 | } 92 | 93 | commands = CommandFactory.parseCommands(Source.fromFile(jobFile).mkString) 94 | require(commands.length > 0 && commands(0).isInstanceOf[BlockCommentCommand], 95 | "sql job must start with job description!") 96 | checkHeader(commands(0).asInstanceOf[BlockCommentCommand]) 97 | 98 | // pull variables from apollo 99 | ApolloClient.pollVariablesFromApollo() 100 | } 101 | 102 | def checkHeader(cmd: BlockCommentCommand): Unit = { 103 | val keys = Set("author", "period", "run_env", "describe") 104 | val headerMap: Map[String, String] = 105 | cmd.comment.split('\n') 106 | .filter(_.contains(":")) 107 | .map { line => 108 | val splits = line.split(":") 109 | splits(0).trim -> splits(1).trim 110 | }.toMap 111 | 112 | val notExistsKeys = keys.filterNot(headerMap.contains(_)) 113 | assert(notExistsKeys.isEmpty, s"Header 中缺少 ${notExistsKeys.mkString(", ")} 参数!") 114 | for ((key, value) <- headerMap) { 115 | ConfigContainer :+ (key -> value) 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/JobRunner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import java.time.LocalDateTime 21 | import java.time.temporal.ChronoUnit 22 | 23 | import scala.reflect.io.File 24 | 25 | import org.apache.spark.sql.plugin.{AsyncProfilePlugin, YourkitPlugin} 26 | import org.apache.spark.sql.runner.command.SqlCommand 27 | import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer} 28 | import org.apache.spark.sql.util.SystemVariables._ 29 | import org.apache.spark.sql.util.{Logging, SystemVariables} 30 | 31 | /** 32 | * @author kun.wan, 33 | * @date 2019-12-05. 34 | */ 35 | object JobRunner extends ArgParser with Logging { 36 | def main(args: Array[String]): Unit = { 37 | parseArgument(args) 38 | logInfo(s"submit job for ${jobFile}") 39 | 40 | prepareRuntimeParameter() 41 | 42 | batchTimesOpt.getOrElse(Seq[LocalDateTime]()).map { batchTime => 43 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> batchTime) 44 | logInfo(s"submitting job(batchTime = $batchTime)") 45 | if (ConfigContainer.contains("dryrun")) { 46 | commands.foreach(_.dryrun()) 47 | } else { 48 | commands.foreach(_.run()) 49 | } 50 | } 51 | SqlCommand.stop() 52 | 53 | logInfo(s"end job") 54 | } 55 | 56 | def prepareRuntimeParameter(): Unit = { 57 | // prepare for spark mode 58 | val distJars = Seq(PROJECT_JAR_NAME).map(jar => s"lib/${jar}").mkString(",") 59 | ConfigContainer :+ ("spark.yarn.dist.jars" -> distJars) 60 | if (!ConfigContainer.contains("spark.yarn.queue")) { 61 | ConfigContainer :+ ("spark.yarn.queue" -> s"root.${File(jobFile).parent.name}") 62 | } 63 | 64 | if (ConfigContainer.getOrElse("spark.profile", "false").toBoolean) { 65 | val profileShell = "hdfs:///deploy/config/profile.sh" 66 | val yourkitAgent = "hdfs:///deploy/config/libyjpagent.so" 67 | 68 | ConfigContainer.getOrElse("spark.profile.type", "jfr") match { 69 | case "yourkit" => 70 | ConfigContainer :+ ("spark.profile.type" -> "snapshot") 71 | ConfigContainer :+ ("spark.yarn.dist.files" -> s"${profileShell},${yourkitAgent}") 72 | ConfigContainer :+ ("spark.yarn.dist.jars" -> s"${distJars},hdfs:///deploy/config/yjp-controller-api-redist.jar") 73 | ConfigContainer :+ ("spark.executor.extraJavaOptions" -> "-agentpath:libyjpagent.so=logdir=,async_sampling_cpu") 74 | ConfigContainer :+ ("spark.executor.plugins" -> classOf[YourkitPlugin].getName) 75 | 76 | case _ => 77 | ConfigContainer :+ ("spark.yarn.dist.archives" -> 78 | "hdfs:///deploy/config/async-profiler/async-profiler.zip#async-profiler") 79 | ConfigContainer :+ ("spark.yarn.dist.files" -> profileShell) 80 | ConfigContainer :+ ("spark.executor.extraLibraryPath" -> "./async-profiler/build/") 81 | ConfigContainer :+ ("spark.executor.plugins" -> classOf[AsyncProfilePlugin].getName) 82 | } 83 | } 84 | 85 | // 如果日期参数为空,时间设置为上一个执行周期 86 | if (startDate != None && endDate != None) { 87 | batchTimesOpt = ConfigContainer.get("period") match { 88 | case "minute" => 89 | val rangeSize = ChronoUnit.MINUTES.between(startDate.get, endDate.get) 90 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusMinutes(i))) 91 | case 
"hour" | "hourly" => 92 | val rangeSize = ChronoUnit.HOURS.between(startDate.get, endDate.get) 93 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusHours(i))) 94 | case "day" | "daily" => 95 | val rangeSize = ChronoUnit.DAYS.between(startDate.get, endDate.get) 96 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusDays(i))) 97 | case "month" => 98 | val rangeSize = ChronoUnit.MONTHS.between(startDate.get, endDate.get) 99 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusMonths(i))) 100 | } 101 | } 102 | if (batchTimesOpt == None) { 103 | val defaultBatchTime = { 104 | ConfigContainer.get("period") match { 105 | case "minute" => 106 | val dt = LocalDateTime.now.minusMinutes(1) 107 | LocalDateTime.of(dt.getYear, dt.getMonth, dt.getDayOfMonth, 108 | dt.getHour, dt.getMinute, 0) 109 | case "hour" => 110 | val dt = LocalDateTime.now.minusHours(1) 111 | LocalDateTime.of(dt.getYear, dt.getMonth, dt.getDayOfMonth, dt.getHour, 0, 0) 112 | case "day" => 113 | val dt = LocalDateTime.now.minusDays(1) 114 | LocalDateTime.of(dt.getYear, dt.getMonth, dt.getDayOfMonth, 0, 0, 0) 115 | case "month" => 116 | val dt = LocalDateTime.now.minusMonths(1) 117 | LocalDateTime.of(dt.getYear, dt.getMonth, 1, 0, 0, 0) 118 | } 119 | } 120 | batchTimesOpt = Some(Seq(defaultBatchTime)) 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/ArrayValueCollector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.util.Logging 22 | import scala.collection.mutable.ArrayBuffer 23 | 24 | import org.apache.spark.sql.runner.container.CollectorContainer 25 | 26 | /** 27 | * @author kun.wan, 28 | * @date 2021-03-08. 
29 | */ 30 | case class ArrayValueCollector(name: String, columnName: String) 31 | extends DataCallBack with Logging { 32 | 33 | val array = ArrayBuffer[Any]() 34 | 35 | override def next(row: GenericRowWithSchema): Unit = { 36 | array += row.get(row.schema.fieldIndex(columnName)) 37 | } 38 | 39 | override def close(): Unit = { 40 | CollectorContainer :+ (name -> array.toArray) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/DataCallBack.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-02-20. 26 | */ 27 | trait DataCallBack { 28 | 29 | var skipEmpty = true 30 | 31 | def init(schema: StructType): Unit = {} 32 | 33 | def next(row: GenericRowWithSchema): Unit 34 | 35 | def close(): Unit 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/DataCallBackFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.util.Logging 21 | 22 | import scala.collection.mutable.ArrayBuffer 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2021-03-08. 
27 | */ 28 | object DataCallBackFactory extends Logging { 29 | 30 | val callBacks: ThreadLocal[ArrayBuffer[DataCallBack]] = 31 | new ThreadLocal[ArrayBuffer[DataCallBack]] { 32 | override def initialValue(): ArrayBuffer[DataCallBack] = ArrayBuffer[DataCallBack]() 33 | } 34 | 35 | def registerDataCallBack(dataCallBack: DataCallBack): Unit = { 36 | logInfo(s"add new data call back:\n$dataCallBack") 37 | callBacks.get() += dataCallBack 38 | } 39 | 40 | def clearDataCallBack(): Unit = callBacks.get().clear() 41 | 42 | def consumeResult(qr: QueryResult): Unit = { 43 | val iterator = qr.iterator 44 | while (iterator.hasNext) { 45 | iterator.next() 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/DataCheckCallBack.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.util.{DQUtil, Logging} 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-03-08. 26 | */ 27 | case class DataCheckCallBack(title: String, 28 | originColumn: String, 29 | checkResultColumn: String, 30 | expression: String) 31 | extends DataCallBack with Logging { 32 | 33 | override def next(row: GenericRowWithSchema): Unit = { 34 | val value: Any = row.get(row.schema.fieldIndex(originColumn)) 35 | val checkResult: Boolean = row.getAs(checkResultColumn) 36 | val messages = 37 | Seq(title, 38 | s"数据检查${if (checkResult) "正常" else "异常"}", 39 | s"检查条件: $expression", 40 | s"实际值 $value ${if (!checkResult) "不" else ""}满足条件!") 41 | 42 | logInfo(messages.mkString("\n")) 43 | } 44 | 45 | override def close(): Unit = {} 46 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/EmailSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import java.util.Properties 21 | 22 | import javax.activation.DataHandler 23 | import javax.mail.internet.{InternetAddress, MimeBodyPart, MimeMessage, MimeMultipart} 24 | import javax.mail.{Message, Session} 25 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 26 | import org.apache.spark.sql.util.ConfigUtil 27 | 28 | import scala.collection.mutable.ArrayBuffer 29 | 30 | case class EmailSink(name: String, config: Map[String, String]) extends Sink { 31 | 32 | // email邮件服务器参数 33 | val hostName = config.getOrElse( 34 | "email.hostname", 35 | throw new IllegalArgumentException("config email.hostname is needed.") 36 | ) 37 | val userName = config.getOrElse( 38 | "email.username", 39 | throw new IllegalArgumentException("config email.username is needed.") 40 | ) 41 | val password = config.getOrElse( 42 | "email.password", 43 | throw new IllegalArgumentException("config email.password is needed.") 44 | ) 45 | val from = config.getOrElse( 46 | "email.from", 47 | throw new IllegalArgumentException("config email.from is needed.") 48 | ) 49 | 50 | // email内容构建参数 51 | val names = ConfigUtil.trimConfigArray( 52 | config.getOrElse( 53 | s"$name.columns", 54 | throw new IllegalArgumentException(s"config $name.columns is needed.") 55 | ), 56 | "," 57 | ) 58 | val columnNames = ConfigUtil.trimConfigArray( 59 | config.getOrElse( 60 | s"$name.columnNames", 61 | throw new IllegalArgumentException(s"config $name.columnNames is needed.") 62 | ), 63 | "," 64 | ) 65 | val to = config.getOrElse( 66 | s"$name.email-to", 67 | throw new IllegalArgumentException(s"config $name.email-to is needed.") 68 | ) 69 | 70 | val cc = config.getOrElse(s"$name.email-cc", "") 71 | 72 | val emailPattern = EmailSink.generateTitle(names) 73 | val emailColumnName = EmailSink.generateTitle(columnNames) 74 | val emailTemplate = config.getOrElse( 75 | s"$name.email-template", 76 | s"""%s
""" 77 | ) 78 | val csvPattern = columnNames 79 | val subject = envName + "环境:" + config.getOrElse(s"$name.subject", "no subject") 80 | val attachedFileName = config.getOrElse("email-attach-filename", subject) 81 | 82 | val emailContent = new ArrayBuffer[String]() 83 | val csvContentBuffer = new ArrayBuffer[String]() 84 | emailContent.append(emailColumnName) 85 | csvContentBuffer.append(columnNames) 86 | 87 | var i = 0 88 | 89 | override def next(row: GenericRowWithSchema): Unit = { 90 | if (i < rowLimit) { 91 | emailContent.append(parsePattern(emailPattern, row)) 92 | i = i + 1 93 | } 94 | csvContentBuffer.append(parsePattern(names, row)) 95 | 96 | } 97 | 98 | override def close(): Unit = { 99 | val htmlContent = emailTemplate.format(emailContent.mkString("\n")) 100 | val csvContent = csvContentBuffer.mkString("\n") 101 | 102 | // 邮件发送 103 | val properties = new Properties() 104 | properties.put("mail.transport.protocol", "smtp") 105 | properties.put("mail.smtp.host", hostName) 106 | properties.put("mail.smtp.port", "465") 107 | properties.put( 108 | "mail.smtp.socketFactory.class", 109 | "javax.net.ssl.SSLSocketFactory" 110 | ) 111 | properties.put("mail.smtp.auth", "true") 112 | properties.put("mail.smtp.ssl.enable", "true") 113 | 114 | val session = Session.getInstance(properties) 115 | val message = new MimeMessage(session) 116 | message.setFrom(new InternetAddress(from, userName)) 117 | message.addRecipients(Message.RecipientType.TO, to) 118 | message.addRecipients(Message.RecipientType.CC, cc) 119 | message.setSubject(subject) 120 | val multipart = new MimeMultipart() 121 | val contentPart = new MimeBodyPart() 122 | contentPart.setContent(htmlContent, "text/html;charset=UTF-8") 123 | multipart.addBodyPart(contentPart) 124 | val mdp = new MimeBodyPart() 125 | val dh = new DataHandler( 126 | new String(Array[Byte](0xEF.toByte, 0xBB.toByte, 0xBF.toByte)) + csvContent, 127 | "text/plain;charset=UTF-8" 128 | ) 129 | mdp.setFileName(attachedFileName + ".csv") 130 | mdp.setDataHandler(dh) 131 | multipart.addBodyPart(mdp) 132 | message.setContent(multipart) 133 | val transport = session.getTransport 134 | transport.connect(from, password) 135 | transport.sendMessage(message, message.getAllRecipients) 136 | transport.close 137 | logInfo(s"Email sink finished") 138 | } 139 | 140 | override def toString: String = { 141 | s"EmailSink(name = $name, from = $from, to = $to, cc = $cc, " + 142 | s"names = $names, columnNames = $columnNames)" 143 | } 144 | 145 | } 146 | 147 | object EmailSink { 148 | def generateTitle(columnName: String): String = { 149 | val columnTitle = columnName.split(",") 150 | .map(col => s"${ConfigUtil.trimConfigValue(col)}") 151 | .mkString 152 | 153 | s"${columnTitle}" 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/QueryResult.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2019-12-05. 26 | */ 27 | case class QueryResult(schema: StructType, iterator: Iterator[GenericRowWithSchema]) 28 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/SingleValueCollector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.util.Logging 22 | import org.apache.spark.sql.runner.container.CollectorContainer 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2021-03-08. 27 | */ 28 | case class SingleValueCollector(name: String, columnName: String) 29 | extends DataCallBack with Logging { 30 | 31 | var value: Any = _ 32 | 33 | override def next(row: GenericRowWithSchema): Unit = { 34 | value = row.get(row.schema.fieldIndex(columnName)) 35 | } 36 | 37 | override def close(): Unit = { 38 | CollectorContainer :+ (name -> value) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/Sink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
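The EmailSink above is driven entirely by a flat key/value config: global SMTP settings plus keys scoped by the sink's name. A hedged sketch of the minimum configuration it expects — every host, address and credential is a placeholder, and close() (which actually sends the mail) is deliberately not called:

```scala
import org.apache.spark.sql.runner.callback.EmailSink

object EmailSinkConfigExample {
  def main(args: Array[String]): Unit = {
    val config = Map(
      // global SMTP settings (placeholders)
      "email.hostname" -> "smtp.example.com",
      "email.username" -> "reporter",
      "email.password" -> "******",
      "email.from"     -> "reporter@example.com",
      // keys scoped by the sink name, "daily_report" here
      "daily_report.columns"     -> "dt, order_cnt",
      "daily_report.columnNames" -> "Date, Orders",
      "daily_report.email-to"    -> "team@example.com",
      "daily_report.subject"     -> "daily trade report",
      // optional cap on how many rows end up in the mail body
      "rowLimit" -> "100")

    val sink = EmailSink("daily_report", config)
    println(sink)   // EmailSink(name = daily_report, from = reporter@example.com, ...)
  }
}
```

SingleValueCollector is the bridge between a query result and the thread-local CollectorContainer: the last value it sees for a column is published under a chosen name when the collector is closed. A small sketch with a hand-built row:

```scala
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.runner.callback.SingleValueCollector
import org.apache.spark.sql.runner.container.CollectorContainer
import org.apache.spark.sql.types.{LongType, StructField, StructType}

object SingleValueCollectorExample {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("c", LongType)))
    val row = new GenericRowWithSchema(Array[Any](1L), schema)

    // Collect the value of column "c" under the name "row_count".
    val collector = SingleValueCollector(name = "row_count", columnName = "c")
    collector.next(row)
    collector.close()   // publishes the value into CollectorContainer

    println(CollectorContainer.get("row_count"))   // 1
  }
}
```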
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.runner.metrics.ReporterTrait 22 | import org.apache.spark.sql.util.{Logging, SystemVariables} 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2019-12-12. 27 | */ 28 | trait Sink extends DataCallBack with ReporterTrait with Logging { 29 | 30 | val config: Map[String, String] 31 | 32 | val envName = config.getOrElse(SystemVariables.ENV, "UNKNOWN") 33 | 34 | var resultRows: Long = 0 35 | 36 | val defaultRowLimit: String = "1000" 37 | 38 | val rowLimit: Int = config.getOrElse("rowLimit", defaultRowLimit).toInt 39 | 40 | def parsePattern(pattern: String, row: GenericRowWithSchema): String = { 41 | val sb = new StringBuilder 42 | var startIdx = -1 43 | for ((c, idx) <- pattern.zipWithIndex) { 44 | if (c == '{' && startIdx < 0) { 45 | startIdx = idx 46 | } else if (c == '}' && startIdx >= 0) { 47 | val variableName = pattern.substring(startIdx + 1, idx) 48 | val fieldValue: AnyRef = row.getAs(variableName) 49 | sb.append(fieldValue) 50 | startIdx = -1 51 | } else if (startIdx < 0) { 52 | sb.append(c) 53 | } 54 | } 55 | 56 | sb.toString 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/BaseCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.util.{Logging, StringUtil} 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2021-02-23. 
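Sink.parsePattern is the small templating helper shared by all sinks: every `{column}` placeholder is replaced with that column's value from the row. The sketch below assumes, as the concrete callbacks in this package suggest, that DataCallBack only requires next and close; the ConsoleSink class and its columns are invented for illustration, and config is passed as a constructor parameter (as EmailSink does) so the trait's envName/rowLimit fields can initialize from it.

```scala
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.runner.callback.Sink
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object ParsePatternExample {

  // Minimal concrete sink used only to exercise parsePattern.
  case class ConsoleSink(config: Map[String, String]) extends Sink {
    override def next(row: GenericRowWithSchema): Unit =
      println(parsePattern("dt={dt}, orders={order_cnt}", row))
    override def close(): Unit = ()
  }

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("dt", StringType),
      StructField("order_cnt", LongType)))
    val row = new GenericRowWithSchema(Array[Any]("20210308", 128L), schema)

    // Every {column} placeholder is replaced by that column's value in the row.
    ConsoleSink(Map.empty).next(row)   // dt=20210308, orders=128
  }
}
```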
25 | */ 26 | abstract class BaseCommand(sourceChars: SourceChars) extends Logging { 27 | 28 | val escapeMapping: Map[Array[Char], Array[Char]] = Map( 29 | Array('\"') -> Array('\"'), 30 | Array(''') -> Array('''), 31 | Array('(') -> Array(')'), 32 | ) 33 | 34 | val chars = sourceChars.chars 35 | 36 | def readTo(char: Char): (String, Int, Int) = readTo(Array(char)) 37 | 38 | def readTo(target: String): (String, Int, Int) = readTo(target.toCharArray) 39 | 40 | private def readTo(target: Array[Char]): (String, Int, Int) = { 41 | val len = target.length 42 | var index = -1 43 | var i = sourceChars.start 44 | while (i < sourceChars.end && index < 0) { 45 | // deal with escape char array 46 | for ((startChars, endChars) <- escapeMapping if startChars.intersect(target).size == 0) { 47 | val slen = startChars.length 48 | if (i > slen && chars(i - slen) != '\\') { 49 | if (chars.slice(i - slen + 1, i + 1) sameElements startChars) { 50 | val elen = endChars.length 51 | i = i + elen 52 | while (i < sourceChars.end && (chars(i - elen) == '\\' || 53 | !(chars.slice(i - elen + 1, i + 1) sameElements endChars))) { 54 | i = i + 1 55 | } 56 | } 57 | } 58 | } 59 | 60 | if (chars.slice(i - len + 1, i + 1) sameElements target) { 61 | index = i + 1 - len 62 | } else { 63 | i = i + 1 64 | } 65 | } 66 | assert(index >= 0, s"Parse Job Error!\n${new String(chars.slice(sourceChars.start, sourceChars.end))}") 67 | val res = 68 | StringUtil.escapeStringValue(new String(chars.slice(sourceChars.start, index))) 69 | val nextStart = i + 1 70 | (res, index, nextStart) 71 | } 72 | 73 | def run(): Unit = { 74 | throw new Exception("Unsupport Command!") 75 | } 76 | 77 | def dryrun(): Unit = run() 78 | } 79 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/BlockCommentCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 
23 | */ 24 | case class BlockCommentCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.blockCommentPrefix.length 32 | 33 | val (comment, _, nextStart) = readTo("*/") 34 | sourceChars.start = nextStart 35 | 36 | override def toString: String = s"/**${comment}*/" 37 | 38 | override def run(): Unit = { 39 | logInfo(s"\n${this.toString}") 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/CommandFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2021-02-24. 
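Each command consumes its own span of the source and advances sourceChars.start so the factory can continue with whatever follows. A sketch using BlockCommentCommand's string constructor, assuming StringUtil.escapeStringValue leaves plain comment text untouched:

```scala
import org.apache.spark.sql.runner.command.BlockCommentCommand

object BlockCommentExample {
  def main(args: Array[String]): Unit = {
    val source = "/** job header: author, period, run_env */ SELECT 1;"

    // The auxiliary constructor wraps the string in SourceChars(chars, 0, length);
    // parsing happens in the constructor, which reads up to the closing "*/".
    val cmd = new BlockCommentCommand(source)

    cmd.run()                       // logs the comment text
    println(cmd)                    // /** job header: author, period, run_env */
    println(cmd.sourceChars.start)  // index just past "*/", where the SQL begins
  }
}
```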
25 | */ 26 | object CommandFactory { 27 | val sqlPrefix = "" 28 | val lineCommentPrefix = "--" 29 | val blockCommentPrefix = "/**" 30 | val setPrefix = "!set" 31 | 32 | val ifPrefix = "!if" 33 | val elsePrefix = "!else" 34 | val fiPrefix = "!fi" 35 | 36 | def skipEmptyChars(sourceChars: SourceChars): Unit = { 37 | while (sourceChars.start < sourceChars.chars.length && 38 | Character.isWhitespace(sourceChars.chars.charAt(sourceChars.start))) { 39 | sourceChars.start = sourceChars.start + 1 40 | } 41 | } 42 | 43 | /** 44 | * 使用探测法,找到下一条Command 45 | * @param sourceChars 46 | */ 47 | def nextCommand(sourceChars: SourceChars): BaseCommand = { 48 | skipEmptyChars(sourceChars) 49 | val commandPrefix: Option[String] = 50 | Seq( 51 | lineCommentPrefix, 52 | blockCommentPrefix, 53 | setPrefix, 54 | ifPrefix, 55 | elsePrefix, 56 | fiPrefix 57 | ) find { prefix => 58 | val len = prefix.length 59 | if (sourceChars.start + len >= sourceChars.end) { 60 | false 61 | } 62 | else { 63 | prefix.equalsIgnoreCase(new String(sourceChars.chars, sourceChars.start, len)) 64 | } 65 | } 66 | 67 | val cmd = 68 | commandPrefix match { 69 | case Some(prefix) if prefix == lineCommentPrefix => LineCommentCommand(sourceChars) 70 | case Some(prefix) if prefix == blockCommentPrefix => BlockCommentCommand(sourceChars) 71 | case Some(prefix) if prefix == setPrefix => SetCommand(sourceChars) 72 | case Some(prefix) if prefix == ifPrefix => IfCommand(sourceChars) 73 | case Some(prefix) if prefix == elsePrefix => ElseCommand(sourceChars) 74 | case Some(prefix) if prefix == fiPrefix => FiCommand(sourceChars) 75 | case None => SqlCommand(sourceChars) 76 | } 77 | skipEmptyChars(sourceChars) 78 | cmd 79 | } 80 | 81 | def parseCommands(source: String): Array[BaseCommand] = { 82 | val commands = ArrayBuffer[BaseCommand]() 83 | val sourceChars = SourceChars(source.toCharArray, 0, source.length) 84 | 85 | while (sourceChars.start < source.length) { 86 | val command = nextCommand(sourceChars) 87 | commands += command 88 | } 89 | commands.toArray 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/ElseCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 
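CommandFactory probes the next non-blank characters against the known prefixes and falls back to SqlCommand when nothing matches, so a whole job file can be split into commands in one pass. A minimal sketch (parsing only; none of the commands are run here):

```scala
import org.apache.spark.sql.runner.command.CommandFactory

object ParseCommandsExample {
  def main(args: Array[String]): Unit = {
    val script =
      """-- daily job
        |!set biz_date = 20210308;
        |SELECT 1;
        |""".stripMargin

    // Expected shape: LineCommentCommand, SetCommand, SqlCommand.
    val commands = CommandFactory.parseCommands(script)
    commands.foreach(cmd => println(s"${cmd.getClass.getSimpleName}: $cmd"))
  }
}
```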
23 | */ 24 | case class ElseCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.elsePrefix.length 32 | } 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/FiCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | case class FiCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.fiPrefix.length 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/IfCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 21 | import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Cast, Literal} 22 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 23 | import org.apache.spark.sql.types.DataType 24 | import org.apache.sql.runner.container.ConfigContainer 25 | import scala.collection.mutable 26 | import scala.collection.mutable.ArrayBuffer 27 | 28 | import org.apache.spark.sql.runner.config.VariableSubstitution 29 | import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer} 30 | 31 | /** 32 | * @author kun.wan, 33 | * @date 2021-02-24. 34 | */ 35 | case class IfCommand(sourceChars: SourceChars) 36 | extends BaseCommand(sourceChars) { 37 | 38 | def this(sourceString: String) { 39 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 40 | } 41 | 42 | sourceChars.start = sourceChars.start + CommandFactory.ifPrefix.length 43 | 44 | val (_, _, nextStart1) = readTo("(") 45 | sourceChars.start = nextStart1 46 | val (ifConditionString, _, nextStart2) = readTo(")") 47 | sourceChars.start = nextStart2 48 | 49 | val ifCommands = new ArrayBuffer[BaseCommand]() 50 | val elseCommands = new ArrayBuffer[BaseCommand]() 51 | 52 | var parseStage = "if" 53 | while (parseStage != "fi") { 54 | val cmd = CommandFactory.nextCommand(sourceChars) 55 | cmd match { 56 | case _: FiCommand => 57 | parseStage = "fi" 58 | 59 | case _: ElseCommand => 60 | parseStage = "else" 61 | 62 | case _ => 63 | parseStage match { 64 | case "if" => 65 | ifCommands += cmd 66 | case "else" => 67 | elseCommands += cmd 68 | } 69 | } 70 | } 71 | 72 | override def toString: String = { 73 | val elseString = 74 | if (elseCommands.size > 0) { 75 | s"""\n!else 76 | |${elseCommands.mkString("\n")} 77 | |""".stripMargin 78 | } else { 79 | "" 80 | } 81 | 82 | s"""!if ($ifConditionString) 83 | |${ifCommands.mkString("\n") + elseString} 84 | |!fi 85 | |""".stripMargin 86 | 87 | } 88 | 89 | override def run(): Unit = { 90 | doRun(isDryRun = false) 91 | } 92 | 93 | override def dryrun(): Unit = { 94 | doRun(isDryRun = true) 95 | } 96 | 97 | def doRun(isDryRun: Boolean): Unit = { 98 | VariableSubstitution.withSubstitution { substitution => 99 | val dataTypeMap = mutable.Map[String, DataType]() 100 | 101 | val originExpr = CatalystSqlParser.parseExpression(substitution.substitute(ifConditionString)) 102 | 103 | var lastMapSize = -1 104 | while (lastMapSize != dataTypeMap.size) { 105 | lastMapSize = dataTypeMap.size 106 | originExpr transform { 107 | case expr: BinaryExpression => 108 | (expr.left, expr.right) match { 109 | case (attr: UnresolvedAttribute, literal: Literal) => 110 | dataTypeMap += (attr.name -> literal.dataType) 111 | 112 | case (literal: Literal, attr: UnresolvedAttribute) => 113 | dataTypeMap += (attr.name -> literal.dataType) 114 | 115 | case (attr1: UnresolvedAttribute, attr2: UnresolvedAttribute) => 116 | if (dataTypeMap.contains(attr1.name)) { 117 | dataTypeMap += (attr2.name -> dataTypeMap(attr1.name)) 118 | } 119 | if (dataTypeMap.contains(attr2.name)) { 120 | dataTypeMap += (attr1.name -> dataTypeMap(attr2.name)) 121 | } 122 | 123 | case (_, _) => 124 | } 125 | expr 126 | 127 | case e => e 128 | } 129 | } 130 | 131 | val ifCondition = 132 | originExpr transform { 133 | case e: UnresolvedAttribute => 134 | val dataType = dataTypeMap(e.name) 135 | val literal = Literal(CollectorContainer.getOrElse(e.name, ConfigContainer.get(e.name))) 136 | if 
(dataType == literal.dataType) { 137 | literal 138 | } else { 139 | Cast(literal, dataType) 140 | } 141 | 142 | case e => e 143 | } 144 | 145 | val ret = ifCondition.eval().asInstanceOf[Boolean] 146 | if (ret) { 147 | ifCommands.foreach(cmd => if (isDryRun) cmd.run() else cmd.dryrun()) 148 | } else { 149 | elseCommands.foreach(cmd => if (isDryRun) cmd.run() else cmd.dryrun()) 150 | } 151 | } 152 | 153 | } 154 | } 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/LineCommentCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | case class LineCommentCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.lineCommentPrefix.length 32 | 33 | val (comment, _, nextStart) = readTo('\n') 34 | sourceChars.start = nextStart 35 | 36 | override def toString: String = s"${CommandFactory.lineCommentPrefix} ${comment}" 37 | 38 | override def run(): Unit = { 39 | logInfo(s"\n${this.toString}") 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/SetCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
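IfCommand does all of its parsing in the constructor: it reads the parenthesised condition, then keeps collecting commands into the if branch (and, after !else, the else branch) until it meets !fi. The sketch below only parses; evaluating run()/dryrun() additionally needs the variable-substitution plumbing and values previously published to CollectorContainer or ConfigContainer.

```scala
import org.apache.spark.sql.runner.command.IfCommand

object IfCommandExample {
  def main(args: Array[String]): Unit = {
    val cmd = new IfCommand(
      """!if (row_count = 1)
        |  select 'row count is 1';
        |!else
        |  select 'row count is not 1';
        |!fi
        |""".stripMargin)

    // toString reassembles the parsed condition and both branches.
    println(cmd)
  }
}
```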
16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.runner.config.VariableSubstitution 21 | import org.apache.spark.sql.runner.container.ConfigContainer 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-02-24. 26 | */ 27 | case class SetCommand(sourceChars: SourceChars) extends BaseCommand(sourceChars) { 28 | 29 | def this(sourceString: String) { 30 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 31 | } 32 | 33 | sourceChars.start = sourceChars.start + CommandFactory.setPrefix.length 34 | 35 | val (key, _, valueStart) = readTo('=') 36 | sourceChars.start = valueStart 37 | 38 | val (value, _, nextStart) = readTo(';') 39 | sourceChars.start = nextStart 40 | 41 | override def toString: String = s"${CommandFactory.setPrefix} $key = $value;" 42 | 43 | override def run(): Unit = { 44 | val substitutionValue = 45 | VariableSubstitution.withSubstitution { substitution => 46 | substitution.substitute(value) 47 | } 48 | 49 | ConfigContainer :+ (key -> substitutionValue) 50 | logInfo(s"\n${CommandFactory.setPrefix} $key = $substitutionValue;") 51 | } 52 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/SourceChars.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | case class SourceChars(chars: Array[Char], var start: Int, var end: Int) 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/SqlCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
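SetCommand splits `!set key = value;` at '=' and ';' during construction; only run() performs variable substitution on the value and publishes it to ConfigContainer. A small parse-only sketch (quote handling is delegated to StringUtil.escapeStringValue, which lives outside this section):

```scala
import org.apache.spark.sql.runner.command.SetCommand

object SetCommandExample {
  def main(args: Array[String]): Unit = {
    // Parsed but not run, so nothing is written to ConfigContainer here.
    val cmd = new SetCommand("""!set user = "kun.wan";""")
    println(cmd)   // !set <parsed key> = <parsed value>;
  }
}
```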
16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.hive.SparkSqlRunner 22 | import org.apache.spark.sql.util.{Logging, SystemVariables} 23 | import scala.collection.JavaConverters._ 24 | 25 | import org.apache.spark.sql.runner.callback.DataCallBackFactory 26 | import org.apache.spark.sql.runner.config.VariableSubstitution 27 | import org.apache.spark.sql.runner.container.ConfigContainer 28 | 29 | /** 30 | * @author kun.wan, 31 | * @date 2021-02-24. 32 | */ 33 | case class SqlCommand(sourceChars: SourceChars) 34 | extends BaseCommand(sourceChars) { 35 | 36 | def this(sourceString: String) { 37 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 38 | } 39 | 40 | sourceChars.start = sourceChars.start + CommandFactory.sqlPrefix.length 41 | 42 | val (sql, _, nextStart) = readTo(";") 43 | sourceChars.start = nextStart 44 | 45 | override def toString: String = s"$sql;" 46 | 47 | override def run(): Unit = { 48 | doRun(isDryRun = false) 49 | } 50 | 51 | override def dryrun(): Unit = { 52 | doRun(isDryRun = true) 53 | } 54 | 55 | def doRun(isDryRun: Boolean): Unit = { 56 | VariableSubstitution.withSubstitution { substitution => 57 | // 这里需要注意参数的还原 58 | val sqlText = substitution.substitute(sql) 59 | logInfo(s"sql content:\n$sqlText") 60 | if (!isDryRun) { 61 | DataCallBackFactory.consumeResult(SqlCommand.sparkSqlRunner.run(sqlText)) 62 | } 63 | } 64 | } 65 | } 66 | 67 | object SqlCommand extends Logging { 68 | 69 | implicit lazy val sparkSession: SparkSession = 70 | SparkSqlRunner.sparkSession( 71 | Some(ConfigContainer.getOrElse(SystemVariables.JOB_NAME, "Unknown Job Name"))) 72 | 73 | lazy val sparkSqlRunner = new SparkSqlRunner 74 | 75 | // val catalogEventListener = InsightCatalogEventListener() 76 | var sqlContext = sparkSession.sqlContext 77 | 78 | // SparkSession.active.sharedState.externalCatalog.addListener(catalogEventListener) 79 | 80 | /** Cleans up and shuts down the Spark SQL environments. */ 81 | def stop() { 82 | logDebug("Clear SparkSession and SparkContext") 83 | // TODO 84 | // catalogEventListener.stop() 85 | if (sqlContext != null) { 86 | sqlContext = null 87 | } 88 | if (sparkSession != null) { 89 | sparkSession.stop() 90 | } 91 | SparkSession.clearActiveSession 92 | 93 | val clazz = Class.forName("java.lang.ApplicationShutdownHooks") 94 | val field = clazz.getDeclaredField("hooks") 95 | field.setAccessible(true) 96 | val inheritableThreadLocalsField = classOf[Thread].getDeclaredField("inheritableThreadLocals") 97 | inheritableThreadLocalsField.setAccessible(true) 98 | 99 | val hooks = field.get(clazz).asInstanceOf[java.util.IdentityHashMap[Thread, Thread]].asScala 100 | hooks.keys.map(inheritableThreadLocalsField.set(_, null)) 101 | } 102 | 103 | def simpleTypeName(typeName: String): String = { 104 | val i = typeName.indexOf("(") 105 | if (i > 0) { 106 | typeName.substring(0, i) 107 | } else { 108 | typeName 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/config/ApolloClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.config 19 | 20 | import com.ctrip.framework.apollo.{Config, ConfigService} 21 | import org.apache.commons.lang3.StringUtils 22 | 23 | import org.apache.spark.sql.util.{Logging, SystemVariables} 24 | import scala.collection.JavaConverters._ 25 | 26 | import org.apache.spark.sql.runner.container.ConfigContainer 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2020-03-04. 31 | */ 32 | case class ApolloClient(namespace: String) extends Logging { 33 | 34 | lazy val config: Config = ConfigService.getConfig(namespace) 35 | 36 | def getProperty(key: String, defaultValue: String): String = { 37 | config.getProperty(key, defaultValue) 38 | } 39 | } 40 | 41 | object ApolloClient extends Logging { 42 | 43 | /** 44 | * 去Apollo 获取参数太慢了 45 | * 46 | * @return 47 | */ 48 | def pollVariablesFromApollo(): Unit = { 49 | if (StringUtils.isNotBlank(System.getenv(SystemVariables.APOLLO_META))) { 50 | val appId = 51 | ConfigContainer.getOrElse("apollo.app.id", 52 | ConfigContainer.getOrElse("appId", 53 | SystemVariables.DEFAULT_APOLLO_ID)) 54 | System.setProperty("app.id", appId) 55 | 56 | val systemClient = ApolloClient("1.above-board") 57 | 58 | systemClient.config.getPropertyNames 59 | .toArray.map { case key: String => 60 | val value = systemClient.getProperty(key, "") 61 | val encryptedValue = if (key.toLowerCase.contains("password")) "******" else value 62 | logInfo(s"pull variable from apollo, $key = $encryptedValue)") 63 | ConfigContainer :+ (key -> value) 64 | } 65 | 66 | if (ConfigContainer.contains("apollo.namespace")) { 67 | val appClient = ApolloClient(ConfigContainer.get("apollo.namespace")) 68 | appClient.config.getPropertyNames.asScala.map { case key: String => 69 | val value = appClient.getProperty(key, "") 70 | val encryptedValue = if (key.toLowerCase.contains("password")) "******" else value 71 | logInfo(s"pull variable from apollo, $key = $encryptedValue") 72 | ConfigContainer :+ (key -> value) 73 | } 74 | } 75 | } 76 | } 77 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/container/CollectorContainer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
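ApolloClient is consulted only when the APOLLO_META environment variable is set, so the bootstrap below is a safe no-op on a developer machine; with a reachable Apollo meta server it copies every key of the shared `1.above-board` namespace (and, optionally, a team namespace) into ConfigContainer, masking values whose key contains "password" in the logs. The app id and namespace below are placeholders.

```scala
import org.apache.spark.sql.runner.config.ApolloClient
import org.apache.spark.sql.runner.container.ConfigContainer

object ApolloBootstrapExample {
  def main(args: Array[String]): Unit = {
    ConfigContainer :+ ("apollo.app.id" -> "sql-runner")        // placeholder app id
    ConfigContainer :+ ("apollo.namespace" -> "my-team.jobs")   // optional, placeholder

    ApolloClient.pollVariablesFromApollo()

    // Remote keys (if any were pulled) are now visible through ConfigContainer.
    println(ConfigContainer.getOption("some.remote.key"))       // hypothetical key
  }
}
```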
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.container 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-03-08. 23 | */ 24 | object CollectorContainer extends ContainerTrait[String, Any] 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/container/ConfigContainer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.container 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2020-03-06. 23 | */ 24 | object ConfigContainer extends ContainerTrait[String, String] 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/container/ContainerTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.container 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-03-08. 23 | */ 24 | class ContainerTrait[A, B] { 25 | 26 | /** 27 | * 这里设计为 ThreadLocal 变量,用于支持多线程运行多job时,维护各自的配置信息. 
28 | * 其他线程如果要维护自己的配置信息,从valueMap拷贝出去进行自己维护 29 | */ 30 | val valueMap = 31 | new InheritableThreadLocal[Map[A, B]]() { 32 | override def initialValue(): Map[A, B] = Map[A, B]() 33 | } 34 | 35 | /** 36 | * 原有map和新的map合并,如果key冲突,保留新的map值 37 | * 38 | * @param map 39 | */ 40 | def ++(map: Map[A, B]): Unit = { 41 | valueMap.set(valueMap.get() ++ map) 42 | } 43 | 44 | /** 45 | * 向map中加入新值,如果key已经存在,使用新值覆盖 46 | * @param kv 47 | */ 48 | def :+(kv: (A, B)): Unit = { 49 | valueMap.set(valueMap.get() + kv) 50 | } 51 | 52 | def getOrElse(key: A, default: => B): B = valueMap.get().getOrElse(key, default) 53 | 54 | def get(key: A): B = valueMap.get()(key) 55 | 56 | def getOption(key: A): Option[B] = valueMap.get().get(key) 57 | 58 | def contains(key: A): Boolean = valueMap.get().contains(key) 59 | 60 | def -(key: A): Unit = { 61 | if (valueMap.get().contains(key)) { 62 | valueMap.set(valueMap.get() - key) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/metrics/GraphiteReporter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.metrics 19 | 20 | import java.io.PrintWriter 21 | import java.net.Socket 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-02-26. 26 | */ 27 | case class GraphiteReporter(host: String, port: Int) extends AutoCloseable with Serializable { 28 | 29 | @transient val socket: Socket = new Socket(host, port) 30 | @transient val out: PrintWriter = new PrintWriter(socket.getOutputStream, true) 31 | 32 | def reportMetrics(key: String, value: Number): Unit = { 33 | val timestamp = System.currentTimeMillis() / 1000 34 | out.printf(s"${key} ${value} ${timestamp}%n") 35 | } 36 | 37 | override def close(): Unit = { 38 | if (out != null) { 39 | out.close() 40 | } 41 | if (socket != null) { 42 | socket.close() 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/metrics/ReporterTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
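ConfigContainer and CollectorContainer are just two instances of ContainerTrait: immutable maps held in an InheritableThreadLocal, so concurrent jobs in one JVM keep separate configuration and collected values. A quick tour of the operators (keys and values below are arbitrary):

```scala
import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer}

object ContainerExample {
  def main(args: Array[String]): Unit = {
    ConfigContainer :+ ("env" -> "PRD")
    ConfigContainer ++ Map(
      "graphite.host" -> "graphite.example.com",   // placeholder host
      "graphite.port" -> "2003")

    println(ConfigContainer.get("env"))                   // PRD
    println(ConfigContainer.getOrElse("missing", "n/a"))  // n/a
    ConfigContainer - "env"
    println(ConfigContainer.contains("env"))              // false

    // CollectorContainer carries Any-typed values collected from query results.
    CollectorContainer :+ ("row_count" -> 42L)
    println(CollectorContainer.get("row_count"))          // 42
  }
}
```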
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.metrics 19 | 20 | import org.apache.spark.sql.runner.container.ConfigContainer 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-02-26. 25 | */ 26 | trait ReporterTrait { 27 | 28 | lazy val reporter: Option[GraphiteReporter] = { 29 | val enableMetrics = ConfigContainer.getOrElse("metrics.enable", "true").toBoolean 30 | if (enableMetrics && ConfigContainer.contains("graphite.host")) { 31 | val graphiteHost = ConfigContainer.get("graphite.host") 32 | val graphitePort = ConfigContainer.getOrElse("graphite.port", "2003").toInt 33 | Some(GraphiteReporter(graphiteHost, graphitePort)) 34 | } else { 35 | None 36 | } 37 | } 38 | 39 | def reportMetrics(key: String, value: Number): Unit = 40 | reporter.map(_.reportMetrics(key, value)) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/udf/DateFormatUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.udf 19 | 20 | import java.time.format.DateTimeFormatter 21 | import java.time.format.DateTimeFormatter._ 22 | 23 | import org.sparkproject.guava.cache.CacheLoader 24 | import org.sparkproject.guava.cache.CacheBuilder 25 | 26 | /** 27 | * @author kun.wan, 28 | * @date 2020-07-20. 
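ReporterTrait gives any component an optional Graphite reporter that is created lazily, and only when metrics are enabled and graphite.host is configured. The sketch keeps metrics disabled so it runs without a Graphite endpoint; the metric key is made up.

```scala
import org.apache.spark.sql.runner.container.ConfigContainer
import org.apache.spark.sql.runner.metrics.ReporterTrait

object MetricsExample {

  // Any component can mix in ReporterTrait to get an optional Graphite reporter.
  class JobStats extends ReporterTrait

  def main(args: Array[String]): Unit = {
    // With metrics disabled (or no graphite.host configured) reportMetrics is a no-op.
    ConfigContainer :+ ("metrics.enable" -> "false")

    val stats = new JobStats
    stats.reportMetrics("sql_runner.job.rows", Long.box(128L))

    // To actually emit metrics, set metrics.enable = true and graphite.host to a
    // reachable Graphite instance (graphite.port defaults to 2003); each call then
    // writes a "<key> <value> <timestamp>" plaintext line.
  }
}
```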
29 | */ 30 | object DateFormatUDF { 31 | 32 | lazy val cache = CacheBuilder.newBuilder() 33 | .maximumSize(100) 34 | .build(new CacheLoader[String, DateTimeFormatter] { 35 | override def load(key: String): DateTimeFormatter = ofPattern(key) 36 | }) 37 | 38 | implicit def toFormatter(pattern: String): DateTimeFormatter = cache.get(pattern) 39 | 40 | // function name : transform_date 41 | val transform_date_udf: (String, String, String) => String = { 42 | (dt: String, srcPattern: String, dstPattern: String) => 43 | toFormatter(dstPattern).format(srcPattern.parse(dt)) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/udf/UDFFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.udf 19 | 20 | import java.lang.annotation.Annotation 21 | 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.runner.container.ConfigContainer 24 | import org.apache.spark.sql.types.DataType 25 | import org.apache.spark.sql.util.Logging 26 | 27 | /** 28 | * @author kun.wan, 29 | * @date 2020-07-20. 
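transform_date is a plain Scala function before it is ever registered with Spark, so it can be exercised directly; the formatter for each pattern string is created once and cached. For example:

```scala
import org.apache.spark.sql.udf.DateFormatUDF

object TransformDateExample {
  def main(args: Array[String]): Unit = {
    val transform = DateFormatUDF.transform_date_udf
    // Re-renders a date string from a source pattern into a target pattern.
    println(transform("20210308", "yyyyMMdd", "yyyy-MM-dd"))   // 2021-03-08
  }
}
```

Once UDFFactory has registered it, the same function is available in SQL as `transform_date(dt, 'yyyyMMdd', 'yyyy-MM-dd')`.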
30 | */ 31 | object UDFFactory extends Logging { 32 | 33 | val EXTERNAL_UDFS = "spark.sql.externalUdfClasses" 34 | 35 | def registerExternalUDFs(spark: SparkSession): Unit = { 36 | spark.udf.register("transform_date", DateFormatUDF.transform_date_udf) 37 | 38 | ConfigContainer.getOption(EXTERNAL_UDFS).map { 39 | case udfClasses: String => 40 | spark.sessionState.resourceLoader.addJar("hdfs:///deploy/config/biz-udfs-1.0.jar") 41 | 42 | val annotationClazz = 43 | Class.forName("org.apachetech.udfs.annotations.UDFDescription", 44 | true, 45 | spark.sharedState.jarClassLoader) 46 | .asInstanceOf[Class[_ <: Annotation]] 47 | val nameMethod = annotationClazz.getMethod("name") 48 | val returnTypeMethod = annotationClazz.getMethod("returnType") 49 | 50 | udfClasses.split(",").map(_.trim).foreach(udfClass => { 51 | val clazz = Class.forName(udfClass, true, spark.sharedState.jarClassLoader) 52 | val annotation = clazz.getAnnotation(annotationClazz) 53 | val name: String = nameMethod.invoke(annotation).asInstanceOf[String] 54 | val returnType: String = returnTypeMethod.invoke(annotation).asInstanceOf[String] 55 | 56 | logInfo(s"register udf ${name} with class ${udfClass}") 57 | spark.udf.registerJava(name, udfClass, DataType.fromDDL(returnType)) 58 | }) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/ConfigUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.runner.container.ConfigContainer 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-02-17. 
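registerExternalUDFs always registers the built-in transform_date; external UDFs are added only when ConfigContainer carries spark.sql.externalUdfClasses, in which case the annotated classes are loaded from the UDF jar on HDFS. A local sketch that skips the external part (the local-mode session setup is an assumption of this example, not how the runner itself builds its session):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.udf.UDFFactory

object RegisterUdfExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("udf-demo")
      .getOrCreate()

    // No spark.sql.externalUdfClasses configured, so only transform_date is registered.
    UDFFactory.registerExternalUDFs(spark)

    spark.sql("SELECT transform_date('20210308', 'yyyyMMdd', 'yyyy-MM-dd') AS dt").show()
    // prints a single row: 2021-03-08

    spark.stop()
  }
}
```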
26 | */ 27 | object ConfigUtil { 28 | 29 | def ltrim(s: String): String = s.replaceAll("^\\s+", "") 30 | 31 | def rtrim(s: String): String = s.replaceAll("\\s+$", "") 32 | 33 | def trimConfigValue(configValue: String): String = rtrim(ltrim(configValue)) 34 | 35 | 36 | def trimConfigArray(configValue: String, separator: String): String = { 37 | configValue.split(separator) 38 | .map(trimConfigValue(_)) 39 | .mkString(separator) 40 | } 41 | 42 | def withConfigs[T](configs: (String, String)*)(func: => T): T = { 43 | val spark = SparkSession.active 44 | try { 45 | configs.foreach(config => { 46 | ConfigContainer :+ (config._1 -> config._2) 47 | spark.conf.set(config._1, config._2) 48 | }) 49 | 50 | func 51 | } finally { 52 | configs.foreach(config => { 53 | ConfigContainer - config._1 54 | spark.conf.unset(config._1) 55 | }) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/DQUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.spark.sql.runner.container.ConfigContainer 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-02-26. 25 | */ 26 | object DQUtil { 27 | 28 | val serverUrl = ConfigContainer.getOrElse("dataquality.alert", "") 29 | val title = s"${ConfigContainer.getOrElse(SystemVariables.ENV, SystemVariables.DEFAULT_ENV)}数据质量检查告警" 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/GenericAvroSchema.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
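withConfigs scopes a set of settings to a block: each pair is pushed into both ConfigContainer and the active SparkSession's conf, and removed again in the finally clause, which is handy for per-statement overrides. A sketch (again using a local session purely for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.runner.container.ConfigContainer
import org.apache.spark.sql.util.ConfigUtil

object WithConfigsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("cfg-demo").getOrCreate()

    ConfigUtil.withConfigs("spark.sql.shuffle.partitions" -> "8") {
      println(spark.conf.get("spark.sql.shuffle.partitions"))       // 8
      println(ConfigContainer.get("spark.sql.shuffle.partitions"))  // 8
    }
    println(ConfigContainer.contains("spark.sql.shuffle.partitions"))  // false

    // The trim helpers normalize comma-separated config values.
    println(ConfigUtil.trimConfigArray(" a , b ,c ", ","))   // a,b,c

    spark.stop()
  }
}
```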
16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import scala.beans.BeanProperty 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-02-26. 25 | */ 26 | case class GenericAvroSchema(@BeanProperty name: String, 27 | @BeanProperty namespace: String, 28 | @BeanProperty fields: Array[AvroField], 29 | @BeanProperty `type`: String = "record", 30 | @BeanProperty doc: String = "") 31 | 32 | case class AvroField(@BeanProperty name: String, 33 | @BeanProperty `type`: String, 34 | @BeanProperty doc: String = "") 35 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/JdbcConnector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import java.sql.{Connection, PreparedStatement, SQLException} 21 | 22 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 23 | import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} 24 | import org.apache.spark.sql.types._ 25 | 26 | /** 27 | * 1. 提供JDBC相关配置参数 28 | * 2. 提供JDBCOption实例 作为connect参数 29 | * 3. 提供JDBC相关操作util方法 30 | * 31 | * @author kun.wan, 32 | * @date 2019-12-11. 33 | */ 34 | class JdbcConnector(config: Map[String, String]) extends Logging { 35 | 36 | val tag: String = config.getOrElse( 37 | "tag", 38 | throw new IllegalArgumentException("config tag is needed.") 39 | ) 40 | 41 | /** 42 | * 1. Get ${tag.key} value from config map 43 | * 2. Return default value if or defaultValue is not empty 44 | * 3. 
throw parameter should be provided exception 45 | * 46 | * @param key 47 | * @param defaultValue 48 | * @return 49 | */ 50 | def getJdbcConfig(key: String, defaultValue: String = ""): String = { 51 | config.get(s"$tag.$key") match { 52 | case Some(v) => v 53 | case None if defaultValue != "" => defaultValue 54 | case None => throw new Exception(s"parameter $key should be provided!") 55 | } 56 | } 57 | 58 | val url = getJdbcConfig("url") 59 | val username = getJdbcConfig("username") 60 | val password = getJdbcConfig("password") 61 | val queryTimeout = getJdbcConfig("query.timeout", "180").toInt 62 | val tableName: String = config("tableName") 63 | 64 | val jdbcConnectOption: JDBCOptions = 65 | new JDBCOptions(Map( 66 | JDBCOptions.JDBC_URL -> url, 67 | "user" -> username, 68 | "password" -> password, 69 | JDBCOptions.JDBC_TABLE_NAME -> tableName, 70 | JDBCOptions.JDBC_QUERY_TIMEOUT -> queryTimeout.toString 71 | )) 72 | 73 | def getConnection(): Connection = JdbcUtils.createConnectionFactory(jdbcConnectOption)() 74 | 75 | def closeConnection(conn: Connection): Unit = { 76 | try { 77 | if (conn != null) { 78 | conn.close() 79 | } 80 | } catch { 81 | case ex: Exception => logError("close jdbc connection error!", ex) 82 | } 83 | } 84 | 85 | def withConnection[T](body: Connection => T): T = { 86 | val conn: Connection = getConnection() 87 | try { 88 | body(conn) 89 | } catch { 90 | case ex: Exception => 91 | logError("execute jdbc function error!", ex) 92 | throw ex 93 | } finally { 94 | closeConnection(conn) 95 | } 96 | } 97 | 98 | def getTableSchema(): StructType = { 99 | val tableSchemaOption = JdbcUtils.getSchemaOption(getConnection(), jdbcConnectOption) 100 | assert(tableSchemaOption.isDefined, s"Failed to get $tableName schema!") 101 | tableSchemaOption.get 102 | } 103 | 104 | /** 105 | * @param row 准备转换的Row数据 106 | * @param pstmt JDBC PreparedStatement 107 | * @param fields 需要转换的字段列表, pstmt在进行参数转换时的开始下标,默认为1 108 | */ 109 | def rowToPreparedStatement(row: GenericRowWithSchema, 110 | pstmt: PreparedStatement, 111 | fields: Seq[StructField]): Unit = { 112 | fields.zipWithIndex.map { 113 | case (field, fieldIndex) => 114 | field.dataType match { 115 | case _: BooleanType => 116 | pstmt.setBoolean(fieldIndex + 1, row.getAs(field.name)) 117 | case _: DoubleType => 118 | pstmt.setDouble(fieldIndex + 1, row.getAs(field.name)) 119 | case _: DecimalType => 120 | pstmt.setBigDecimal(fieldIndex + 1, row.getAs(field.name)) 121 | case _: FloatType => 122 | pstmt.setFloat(fieldIndex + 1, row.getAs(field.name)) 123 | case _: ByteType => 124 | pstmt.setByte(fieldIndex + 1, row.getAs(field.name)) 125 | case _: ShortType => 126 | pstmt.setShort(fieldIndex + 1, row.getAs(field.name)) 127 | case _: IntegerType => 128 | pstmt.setInt(fieldIndex + 1, row.getAs(field.name)) 129 | case _: LongType => 130 | pstmt.setLong(fieldIndex + 1, row.getAs(field.name)) 131 | case _: StringType => 132 | pstmt.setString(fieldIndex + 1, row.getAs(field.name)) 133 | case _: DateType => 134 | pstmt.setDate(fieldIndex + 1, row.getAs(field.name)) 135 | case _ => 136 | throw new IllegalArgumentException( 137 | s"Unsupported type ${field.dataType}" 138 | ) 139 | } 140 | } 141 | } 142 | 143 | var statementCounter: Long = 0 144 | 145 | def tryStatement[T](pstmt: PreparedStatement, row: Option[GenericRowWithSchema] = None) 146 | (body: PreparedStatement => Unit): Unit = { 147 | try { 148 | statementCounter.synchronized { 149 | if (pstmt != null) { 150 | body(pstmt) 151 | statementCounter = statementCounter + 1 152 | } 153 | if 
(statementCounter % 10000 == 0) { 154 | val updateCounts = pstmt.executeBatch 155 | logInfo(s"commit JDBC PreparedStatement,affected rows = ${updateCounts.length}, " + 156 | s"statement counter = ${statementCounter}") 157 | pstmt.clearParameters() 158 | } 159 | } 160 | } catch { 161 | case e: Exception => 162 | logError(s"debug message for pstmt : ${pstmt}, row : ${row}") 163 | throw e 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/JobIdUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import java.io.File 21 | import java.time.LocalDateTime 22 | import java.time.format.DateTimeFormatter 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2020-03-06. 27 | */ 28 | object JobIdUtil { 29 | 30 | def generatorJobId(jobFile: String): String = { 31 | val ts = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) 32 | val prefix = new File(jobFile).getName.stripSuffix(".xml") 33 | s"${prefix}-${ts}" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/Logging.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.slf4j.{Logger, LoggerFactory} 21 | 22 | /** 23 | * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows 24 | * logging messages at different levels using methods that only evaluate parameters lazily if the 25 | * log level is enabled. 
26 | * 27 | */ 28 | trait Logging { 29 | // Make the log field transient so that objects with Logging can 30 | // be serialized and used on another machine 31 | @transient private var log_ : Logger = null 32 | 33 | implicit def anyToString(any: Any): String = any.toString 34 | 35 | // Method to get the logger name for this object 36 | protected def logName = { 37 | // Ignore trailing $'s in the class names for Scala objects 38 | this.getClass.getName.stripSuffix("$") 39 | } 40 | 41 | // Method to get or create the logger for this object 42 | protected def log: Logger = { 43 | if (log_ == null) { 44 | log_ = LoggerFactory.getLogger(logName) 45 | } 46 | log_ 47 | } 48 | 49 | // Log methods that take only a String 50 | protected def logInfo(msg: => String) { 51 | if (log.isInfoEnabled) log.info(msg) 52 | } 53 | 54 | protected def logDebug(msg: => String) { 55 | if (log.isDebugEnabled) log.debug(msg) 56 | } 57 | 58 | protected def logTrace(msg: => String) { 59 | if (log.isTraceEnabled) log.trace(msg) 60 | } 61 | 62 | protected def logWarning(msg: => String) { 63 | if (log.isWarnEnabled) log.warn(msg) 64 | } 65 | 66 | protected def logError(msg: => String) { 67 | if (log.isErrorEnabled) log.error(msg) 68 | } 69 | 70 | // Log methods that take Throwables (Exceptions/Errors) too 71 | protected def logInfo(msg: => String, throwable: Throwable) { 72 | if (log.isInfoEnabled) log.info(msg, throwable) 73 | } 74 | 75 | protected def logDebug(msg: => String, throwable: Throwable) { 76 | if (log.isDebugEnabled) log.debug(msg, throwable) 77 | } 78 | 79 | protected def logTrace(msg: => String, throwable: Throwable) { 80 | if (log.isTraceEnabled) log.trace(msg, throwable) 81 | } 82 | 83 | protected def logWarning(msg: => String, throwable: Throwable) { 84 | if (log.isWarnEnabled) log.warn(msg, throwable) 85 | } 86 | 87 | protected def logError(msg: => String, throwable: Throwable) { 88 | if (log.isErrorEnabled) log.error(msg, throwable) 89 | } 90 | 91 | protected def isTraceEnabled(): Boolean = { 92 | log.isTraceEnabled 93 | } 94 | 95 | def runWithErrorLog[T](body: => T): T = { 96 | try { 97 | body 98 | } catch { 99 | case e: Exception => 100 | logError(s"find exception: $e") 101 | throw e 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/NextIterator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | /** Provides a basic/boilerplate Iterator implementation. 
*/ 21 | abstract class NextIterator[U] extends Iterator[U] { 22 | 23 | private var gotNext = false 24 | private var nextValue: U = _ 25 | private var closed = false 26 | protected var finished = false 27 | 28 | /** 29 | * Method for subclasses to implement to provide the next element. 30 | * 31 | * If no next element is available, the subclass should set `finished` 32 | * to `true` and may return any value (it will be ignored). 33 | * 34 | * This convention is required because `null` may be a valid value, 35 | * and using `Option` seems like it might create unnecessary Some/None 36 | * instances, given some iterators might be called in a tight loop. 37 | * 38 | * @return U, or set 'finished' when done 39 | */ 40 | def getNext(): U 41 | 42 | /** 43 | * Method for subclasses to implement when all elements have been successfully 44 | * iterated, and the iteration is done. 45 | * 46 | * Note: `NextIterator` cannot guarantee that `close` will be 47 | * called because it has no control over what happens when an exception 48 | * happens in the user code that is calling hasNext/next. 49 | * 50 | * Ideally you should have another try/catch, as in HadoopRDD, that 51 | * ensures any resources are closed should iteration fail. 52 | */ 53 | def close() 54 | 55 | /** 56 | * Calls the subclass-defined close method, but only once. 57 | * 58 | * Usually calling `close` multiple times should be fine, but historically 59 | * there have been issues with some InputFormats throwing exceptions. 60 | */ 61 | def closeIfNeeded() { 62 | if (!closed) { 63 | // Note: it's important that we set closed = true before calling close(), since setting it 64 | // afterwards would permit us to call close() multiple times if close() threw an exception. 65 | closed = true 66 | close() 67 | } 68 | } 69 | 70 | override def hasNext: Boolean = { 71 | if (!finished) { 72 | if (!gotNext) { 73 | nextValue = getNext() 74 | if (finished) { 75 | closeIfNeeded() 76 | } 77 | gotNext = true 78 | } 79 | } 80 | !finished 81 | } 82 | 83 | override def next(): U = { 84 | if (!hasNext) { 85 | throw new NoSuchElementException("End of stream") 86 | } 87 | gotNext = false 88 | nextValue 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/OptimizerUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.spark.sql.AnalysisException 21 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 22 | import org.apache.spark.sql.catalyst.expressions.Literal 23 | import org.apache.spark.sql.types.StringType 24 | 25 | /** 26 | * @author kun.wan, 27 | * @date 2021-03-08. 28 | */ 29 | object OptimizerUtil { 30 | 31 | def parseHintParameter(value: Any): String = { 32 | value match { 33 | case v: String => UnresolvedAttribute.parseAttributeName(v).mkString(".") 34 | case Literal(v, dt: StringType) => v.toString 35 | case v: UnresolvedAttribute => v.nameParts.mkString(".") 36 | case unsupported => throw new AnalysisException(s"Unable to parse : $unsupported") 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/ReflectUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-22. 23 | */ 24 | object ReflectUtils { 25 | 26 | /** 27 | * 通过反射执行private方法 28 | * @param clazz 29 | * @param name private 方法名 30 | * @param instance 方法执行的实例,如果是静态方法,直接传入null 31 | * @param parameterTypes 方法参数类型列表,无参数时传入空Seq() 32 | * @param parameters 方法参数实例列表,无参数时传入空Seq() 33 | */ 34 | def runMethod(clazz: Class[_], 35 | name: String, 36 | instance: Any, 37 | parameterTypes: Seq[Class[_]], 38 | parameters: Seq[Object]): Unit = { 39 | val method = clazz.getDeclaredMethod(name, parameterTypes: _*) 40 | method.setAccessible(true) 41 | method.invoke(instance, parameters: _*) 42 | } 43 | 44 | def setVariable(instance: Any, 45 | fieldName: String, 46 | value: Any): Unit = { 47 | val field = instance.getClass.getDeclaredField(fieldName) 48 | field.setAccessible(true) 49 | field.set(instance, value) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/StringUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.commons.lang3.StringUtils 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2021-03-17. 25 | */ 26 | object StringUtil { 27 | 28 | val escapeMapping: Map[Array[Char], Array[Char]] = Map( 29 | Array('\"') -> Array('\"'), 30 | Array(''') -> Array('''), 31 | Array('(') -> Array(')'), 32 | ) 33 | 34 | def escapeStringValue(text: String): String = { 35 | var res = text.trim 36 | for ((startChars, endChars) <- escapeMapping 37 | if res.startsWith(new String(startChars)) && res.endsWith(new String(endChars))) { 38 | res = StringUtils.removeStart(res, new String(startChars)) 39 | res = StringUtils.removeEnd(res, new String(endChars)).trim 40 | } 41 | res 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/SystemVariables.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | object SystemVariables { 25 | 26 | val BATCH_TIME = "batch_time" 27 | val JOB_NAME = "job_name" 28 | val INDEX_COLUMN_NAME = "index_column" 29 | val PROJECT_JAR_NAME = "sql-runner-3.0.jar" 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/SQLRunnerSuiteUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql 19 | 20 | import java.io.File 21 | 22 | import org.apache.commons.io.FileUtils 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2020-09-16. 27 | */ 28 | object SQLRunnerSuiteUtils { 29 | 30 | def cleanTestHiveData(): Unit = { 31 | val metastoreDB = new File("metastore_db") 32 | if (metastoreDB.exists) { 33 | FileUtils.forceDelete(metastoreDB) 34 | } 35 | val sparkWarehouse = new File("spark-warehouse") 36 | if (sparkWarehouse.exists) { 37 | FileUtils.forceDelete(sparkWarehouse) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/SparkSqlRunnerBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql 19 | 20 | import org.apache.spark.sql.udf.UDFFactory 21 | import org.apache.spark.internal.config.Tests.IS_TESTING 22 | import org.apache.spark.sql.SQLRunnerSuiteUtils._ 23 | import org.apache.spark.sql.hive.SparkSqlRunner 24 | import org.apache.spark.sql.hive.test.TestHiveSingleton 25 | import org.apache.spark.sql.runner.command.SqlCommand 26 | import org.apache.spark.sql.test.SQLTestUtils 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2020-04-15. 31 | */ 32 | class SparkSqlRunnerBase extends QueryTest with SQLTestUtils with TestHiveSingleton { 33 | 34 | implicit val sparkImp: SparkSession = spark 35 | val sc = spark.sparkContext 36 | var runner: SparkSqlRunner = _ 37 | 38 | override def beforeAll(): Unit = { 39 | 40 | super.beforeAll() 41 | System.setProperty(IS_TESTING.key, "true") 42 | cleanTestHiveData() 43 | 44 | SparkSession.active.sharedState.externalCatalog.addListener(SqlRunnerCatalogEventListener()) 45 | UDFFactory.registerExternalUDFs(spark) 46 | 47 | runner = new SparkSqlRunner 48 | } 49 | 50 | 51 | override def afterAll() { 52 | cleanTestHiveData() 53 | SqlCommand.stop() 54 | super.afterAll() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/optimizer/CollectValueRuleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.QueryTest 21 | import org.apache.spark.sql.hive.SparkSqlRunner 22 | import org.apache.spark.sql.test.SQLTestData.TestData 23 | import org.apache.spark.sql.test.SQLTestUtils 24 | import org.scalatest.matchers.should.Matchers._ 25 | 26 | import org.apache.spark.sql.runner.callback.DataCallBackFactory 27 | import org.apache.spark.sql.runner.command.SqlCommand 28 | import org.apache.spark.sql.runner.container.CollectorContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-07-28. 33 | */ 34 | class CollectValueRuleSuite extends QueryTest with SQLTestUtils { 35 | 36 | override val spark = { 37 | System.setProperty("spark.master", "local[1]") 38 | SparkSqlRunner.sparkSession(Some("CollectValueRuleSuite")) 39 | } 40 | 41 | import spark.implicits._ 42 | 43 | override def beforeAll() { 44 | val df = spark.sparkContext.parallelize( 45 | (1 to 100).map(i => TestData(i, i.toString))).toDF() 46 | df.createOrReplaceTempView("testData") 47 | } 48 | 49 | override def afterAll(): Unit = { 50 | spark.close() 51 | } 52 | 53 | 54 | def runPartitionScanLimitRule(testQuery: String): Unit = { 55 | PartitionScanLimitRule(spark).apply( 56 | spark.sql(testQuery).queryExecution.optimizedPlan 57 | ) 58 | } 59 | 60 | def runAndComsume(sql: String): Unit = { 61 | DataCallBackFactory.consumeResult(SqlCommand.sparkSqlRunner.run(sql)) 62 | } 63 | 64 | test("test collect Hint") { 65 | runAndComsume( 66 | s"""SELECT /*+ COLLECT_VALUE('single_value', 'count_column') */ 67 | | /*+ COLLECT_VALUE('max_key', 'keyColumn') */ 68 | | count(1) as count_column, 69 | | concat('prefix_', max(key)) as keyColumn 70 | |from testData 71 | |""".stripMargin) 72 | CollectorContainer.get("single_value") should be(100) 73 | CollectorContainer.get("max_key") should be("prefix_100") 74 | 75 | runAndComsume( 76 | s"""SELECT /*+ COLLECT_ARRAY('intArray', 'key') */ 77 | | /*+ COLLECT_ARRAY('stringArray', 'value') */ 78 | | key, value 79 | |from testData 80 | |""".stripMargin) 81 | CollectorContainer.get("intArray") should be((1 to 100)) 82 | CollectorContainer.get("stringArray") should be((1 to 100).map(_.toString)) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/optimizer/ExternalTableRuleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.SQLRunnerSuiteUtils.cleanTestHiveData 21 | import org.apache.spark.sql.QueryTest 22 | import org.apache.spark.sql.hive.SparkSqlRunner 23 | import org.apache.spark.sql.test.SQLTestUtils 24 | import org.apache.spark.sql.util.ConfigUtil 25 | import org.scalatest.matchers.should.Matchers._ 26 | 27 | import org.apache.spark.sql.runner.command.SqlCommand 28 | import org.apache.spark.sql.runner.container.ConfigContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-09-15. 33 | */ 34 | class ExternalTableRuleSuite extends QueryTest with SQLTestUtils { 35 | 36 | override val spark = { 37 | System.setProperty("spark.master", "local[1]") 38 | SparkSqlRunner.sparkSession(Some("ExternalTableRuleSuite")) 39 | } 40 | 41 | val testPath = getClass.getResource("/") 42 | 43 | val bootstrapServers = "10.23.177.40:9092" 44 | val schemaRegistryUrl = "http://10.23.177.40:8081" 45 | 46 | override def beforeAll(): Unit = { 47 | cleanTestHiveData() 48 | 49 | ConfigContainer ++ Map( 50 | "mysql.url" -> "jdbc:mysql://localhost:3306/test", 51 | "mysql.username" -> "root", 52 | "mysql.password" -> "password", 53 | ) 54 | 55 | spark.sql(s"CREATE TABLE target(id int, name string) LOCATION '$testPath/target'") 56 | 57 | /** 58 | * mysql> desc stu; 59 | * +-------+------------+------+-----+---------+-------+ 60 | * | Field | Type | Null | Key | Default | Extra | 61 | * +-------+------------+------+-----+---------+-------+ 62 | * | id | int(11) | NO | PRI | NULL | | 63 | * | name | text | YES | | NULL | | 64 | * | sex | varchar(2) | YES | | NULL | | 65 | * | env | char(20) | YES | | NULL | | 66 | * +-------+------------+------+-----+---------+-------+ 67 | */ 68 | } 69 | 70 | override def afterAll() { 71 | cleanTestHiveData() 72 | spark.stop() 73 | super.afterAll() 74 | } 75 | 76 | test("query jdbc table") { 77 | ConfigUtil.withConfigs("mysql.stu.numPartitions" -> "3", "mysql.stu.partitionColumn" -> "id") { 78 | 79 | val df = spark.sql(s"""SELECT id, name 80 | |FROM jdbc.mysql.stu 81 | |where id < 10 82 | |""".stripMargin) 83 | df.rdd.partitions.length should equal(3) 84 | df.explain() 85 | df.show() 86 | } 87 | } 88 | 89 | test("query jdbc view") { 90 | ConfigUtil.withConfigs( 91 | "mysql.stu.query" -> "(select * from stu where name !='wankun') as q", 92 | "mysql.stu.numPartitions" -> "3", 93 | "mysql.stu.partitionColumn" -> "id") { 94 | 95 | val df = spark.sql(s"""SELECT id, name 96 | |FROM jdbc.mysql.stu 97 | |""".stripMargin) 98 | df.rdd.partitions.length should equal(3) 99 | df.show() 100 | } 101 | } 102 | 103 | test("write data frame to mysql table") { 104 | ConfigUtil.withConfigs( 105 | "mysql.stu.queryTimeout" -> 100.toString, 106 | "mysql.stu.uniqueKeys" -> "id") { 107 | new SqlCommand(s"""WITH t as ( 108 | | SELECT 100 as id, "user_100" as name 109 | | UNION ALL 110 | | SELECT 101 as id, "user_101" as name 111 | |) 112 | |INSERT INTO jdbc.mysql.stu 113 | |SELECT * 114 | |FROM t; 115 | |""".stripMargin).run() 116 | } 117 | } 118 | 119 | test("write json data frame to kafka 
table") { 120 | ConfigUtil.withConfigs( 121 | "kafka.bootstrap.servers" -> bootstrapServers, 122 | "kafka.stu.recordType" -> "json", 123 | "kafka.stu.kafkaTopic" -> "test_wankun") { 124 | new SqlCommand(s"""WITH t as ( 125 | | SELECT 100 as id, "user_100" as name 126 | | UNION ALL 127 | | SELECT 101 as id, "user_101" as name 128 | |) 129 | |INSERT INTO kafka.stu 130 | |SELECT * 131 | |FROM t; 132 | |""".stripMargin).run() 133 | } 134 | } 135 | 136 | test("write avro data frame to kafka using KAFKA_SINK") { 137 | ConfigUtil.withConfigs( 138 | "kafka.bootstrap.servers" -> bootstrapServers, 139 | "kafka.schema.registry.url" -> schemaRegistryUrl, 140 | "kafka.stu.recordType" -> "avro", 141 | "kafka.stu.kafkaTopic" -> "test_wankun2", 142 | // 不根据计算结果DDL自动生成Avro Schema,手动测试时,根据需要调整该参数 143 | "kafka.stu.avro.forceCreate" -> "false", 144 | "kafka.stu.avro.name" -> "student", 145 | "kafka.stu.avro.namespace" -> "com.wankun") { 146 | new SqlCommand(s"""INSERT INTO kafka.stu 147 | |SELECT 1 as id1, 'wankun' as name1, 148 | | '男' as sex1, 'PRD' env1, 18 age1; 149 | |""".stripMargin).run() 150 | } 151 | } 152 | 153 | /* 154 | test("send message with EMAIL_SINK") { 155 | ConfigUtil.withConfigs( 156 | // server config 157 | "email.hostname" -> "smtp.exmail.qq.com", 158 | "email.username" -> "test@leyantech.com", 159 | "email.password" -> "", 160 | "email.from" -> "test@leyantech.com", 161 | 162 | // job config 163 | "email.columns" -> "id, name", 164 | "email.columnNames" -> "ID,名称", 165 | "email.subject" -> "测试邮件", 166 | "email.email-to" -> "wankun@apache.org", 167 | "email.email-cc" -> "wankun@apache.org" 168 | ) { 169 | new SqlCommand( 170 | s"""SELECT /*+ EMAIL_SINK(email) */ 171 | | 1 as id, 'wankun' as name; 172 | |""".stripMargin).run() 173 | } 174 | } 175 | 176 | test("send message with DINGDING_SINK") { 177 | ConfigUtil.withConfigs( 178 | "dataquality.alert"-> "https://oapi.dingtalk.com/robot/send?access_token=test_token", 179 | "dataquality.alert.title" -> "测试钉钉告警", 180 | "dataquality.alert.pattern" -> "ID是{id},姓名:{name}" 181 | ) { 182 | new SqlCommand( 183 | s"""SELECT /*+ DINGDING_SINK(dataquality.alert) */ 184 | | 1 as id, 'wankun' as name; 185 | |""".stripMargin).run() 186 | } 187 | } 188 | */ 189 | 190 | } 191 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/optimizer/PartitionScanLimitRuleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.hive.test.TestHiveSingleton 21 | import org.apache.spark.sql.test.SQLTestUtils 22 | import org.apache.spark.sql.{AnalysisException, QueryTest} 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2020-07-28. 27 | */ 28 | class PartitionScanLimitRuleSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { 29 | 30 | override def beforeAll(): Unit = { 31 | super.beforeAll() 32 | 33 | Seq("test1", "test2").map { tableName => 34 | sql( 35 | s""" 36 | |CREATE TABLE $tableName(i int) 37 | |PARTITIONED BY (p STRING) 38 | |STORED AS textfile""".stripMargin) 39 | sql(s"INSERT OVERWRITE TABLE $tableName PARTITION (p='1') select * from range(10)") 40 | } 41 | } 42 | 43 | override def afterAll(): Unit = { 44 | Seq("test1", "test2").map { tableName => 45 | sql(s"DROP TABLE IF EXISTS $tableName") 46 | } 47 | super.afterAll() 48 | } 49 | 50 | def runPartitionScanLimitRule(testQuery: String): Unit = { 51 | PartitionScanLimitRule(spark).apply( 52 | spark.sql(testQuery).queryExecution.optimizedPlan 53 | ) 54 | } 55 | 56 | test("no filters on partition table scan") { 57 | intercept[AnalysisException] { 58 | runPartitionScanLimitRule("SELECT i FROM test1") 59 | } 60 | 61 | runPartitionScanLimitRule("SELECT i FROM test1 where p='1'") 62 | runPartitionScanLimitRule( 63 | s""" 64 | |WITH t as ( 65 | | SELECT count(1) as c 66 | | FROM test1 67 | | WHERE p='1' 68 | |) 69 | |SELECT * FROM t 70 | |""".stripMargin) 71 | } 72 | 73 | test("no filters on partition table join") { 74 | intercept[AnalysisException] { 75 | runPartitionScanLimitRule( 76 | s""" 77 | |SELECT * 78 | |FROM (SELECT i FROM test1 where p='1') t1 79 | |JOIN test2 t2 80 | |ON t1.i > t2.i 81 | |""".stripMargin) 82 | } 83 | 84 | runPartitionScanLimitRule( 85 | s""" 86 | |SELECT * 87 | |FROM (SELECT i FROM test1 where p='1') t1 88 | |JOIN test2 t2 89 | |ON t1.i > t2.i 90 | |AND t2.p = '1' 91 | |""".stripMargin) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/runner/ArgParserSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import org.apache.spark.sql.util.Logging 21 | import JobRunner.dateRangeStep 22 | import org.scalatest.funsuite.AnyFunSuite 23 | import org.scalatest.matchers.should.Matchers._ 24 | 25 | import java.time.LocalDateTime 26 | import java.time.temporal.ChronoUnit 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2021-02-04. 
31 | */ 32 | class ArgParserSuite extends AnyFunSuite with Logging { 33 | 34 | test("test time range option") { 35 | val startDate = Some(LocalDateTime.parse("2021-01-01T00:00:00")) 36 | val endDate = Some(LocalDateTime.parse("2021-01-06T00:00:00")) 37 | 38 | val rangeSize = ChronoUnit.DAYS.between(startDate.get, endDate.get) 39 | Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusDays(i)) should 40 | be(Seq(LocalDateTime.parse("2021-01-01T00:00:00"), 41 | LocalDateTime.parse("2021-01-02T00:00:00"), 42 | LocalDateTime.parse("2021-01-03T00:00:00"), 43 | LocalDateTime.parse("2021-01-04T00:00:00"), 44 | LocalDateTime.parse("2021-01-05T00:00:00"), 45 | LocalDateTime.parse("2021-01-06T00:00:00"))) 46 | 47 | 48 | dateRangeStep = 2 49 | Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusDays(i)) should 50 | be(Seq(LocalDateTime.parse("2021-01-01T00:00:00"), 51 | LocalDateTime.parse("2021-01-03T00:00:00"), 52 | LocalDateTime.parse("2021-01-05T00:00:00"))) 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/runner/command/CommandSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-02-24. 
26 | */ 27 | class CommandSuite extends AnyFunSuite { 28 | 29 | val textHeader = 30 | s"""/************************************************ 31 | | 32 | | author: kun.wan 33 | | period: day 34 | | run_env: PRD & PRE 35 | | describe: policy_store_config 店铺数据量检查 36 | | app.id: 303 37 | | 38 | |************************************************/ 39 | |""".stripMargin 40 | 41 | test("test parse job text") { 42 | val text = 43 | s"""$textHeader 44 | |-- 测试一下单行注释 45 | | 46 | |!set mykey=myvalue; 47 | |!set longKey = \"( 48 | |select * 49 | |from tab 50 | |WHERE dates = '{date | yyyy - MM - dd}' 51 | |) as q\"; 52 | | 53 | |SELECT id, name 54 | |FROM test_db.test_name 55 | |WHERE id in ('001', '002'); 56 | | 57 | |-- 测试SQL中包含引号 58 | |SELECT 'a;b' as a, "abc;hhh" as b,'a\\'b' as c; 59 | |""".stripMargin 60 | 61 | val commands = CommandFactory.parseCommands(text) 62 | 63 | commands.length should be(7) 64 | } 65 | 66 | test("test parse if command") { 67 | Seq("kun.wan", "King").map { username => 68 | val text = 69 | s"""$textHeader 70 | |!set user = $username; 71 | |!if (user = 'kun.wan') 72 | | select 'if command'; 73 | |!else 74 | | select 'else command'; 75 | |!fi 76 | |""".stripMargin 77 | 78 | val commands = CommandFactory.parseCommands(text) 79 | 80 | commands.length should be(3) 81 | 82 | commands.foreach(_.run()) 83 | } 84 | 85 | val text = 86 | s"""$textHeader 87 | | 88 | |SELECT /*+ COLLECT_VALUE('row_count', 'c') */ count(1) as c; 89 | |SELECT /*+ COLLECT_VALUE('row_count2', 'd') */ count(1) as d; 90 | | 91 | |!if (row_count = row_count2 and row_count = 1) 92 | | select 'row count is 1'; 93 | |!else 94 | | select 'row count is not 1'; 95 | |!fi 96 | |""".stripMargin 97 | 98 | val commands = CommandFactory.parseCommands(text) 99 | 100 | commands.length should be(4) 101 | 102 | commands.foreach(_.run()) 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/runner/config/VariableSubstitutionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.config 19 | 20 | import org.apache.spark.sql.util.SystemVariables 21 | import org.apache.sql.runner.container.ConfigContainer 22 | import org.scalatest.funsuite.AnyFunSuite 23 | import org.scalatest.matchers.should.Matchers._ 24 | import java.time.LocalDateTime 25 | 26 | import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer} 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2019-12-10. 
31 | */ 32 | class VariableSubstitutionSuite extends AnyFunSuite { 33 | 34 | test("test time variable") { 35 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> LocalDateTime.parse("2019-08-07T13:25:41")) 36 | val substitution = new VariableSubstitution() 37 | 38 | substitution.dateParameter("${date}") should be("20190807") 39 | substitution.dateParameter("${date + 2d}") should be("20190809") 40 | substitution.dateParameter("${date + 2d |yyyyMMddHH}") should be("2019080913") 41 | substitution.dateParameter("${date + 2d |yyyyMM00}") should be("20190800") 42 | substitution.dateParameter("${date + 2d |yyyy-MM-dd}") should be("2019-08-09") 43 | substitution.dateParameter("${date + 2d |yyyy_MM_dd}") should be("2019_08_09") 44 | substitution.dateParameter("${date-2m|yyyy-MM-dd HH:mm:ss}") should be("2019-08-07 13:23:41") 45 | 46 | substitution.dateParameter("${date+2d}") should be("20190809") 47 | substitution.dateParameter("${date+4y}") should be("20230807") 48 | 49 | substitution.dateParameter("${date+2D}") should be("20190809") 50 | substitution.dateParameter("${date+3M}") should be("20191107") 51 | substitution.dateParameter("${date+4Y}") should be("20230807") 52 | 53 | substitution.dateParameter("${date-2d}") should be("20190805") 54 | substitution.dateParameter("${date-4y}") should be("20150807") 55 | 56 | substitution.dateParameter("${date-2D}") should be("20190805") 57 | substitution.dateParameter("${date-3M}") should be("20190507") 58 | 59 | substitution.dt should be("20190807") 60 | substitution.yesterday should be("20190806") 61 | substitution.tomorrow should be("20190808") 62 | substitution.hour should be("2019080713") 63 | substitution.lastHour should be("2019080712") 64 | substitution.nextHour should be("2019080714") 65 | } 66 | 67 | test("test variable substitution in sql") { 68 | ConfigContainer :+ ("ab_target" -> "after_trade") 69 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> LocalDateTime.parse("2019-08-07T13:25:41")) 70 | val substitution = new VariableSubstitution() 71 | 72 | substitution.substitute( 73 | """ 74 | |SELECT count(1) 75 | |FROM tab 76 | |WHERE start_date = '${yesterday}' 77 | |AND end_date = '${dt}' 78 | |AND start_hour = '${date-23H|hh}' 79 | |AND end_hour = '${date - 24h|hh}' 80 | |AND month = '${date - 24h|MM}' 81 | |AND ab_target = '${ab_target}' 82 | |""".stripMargin) should equal( 83 | s""" 84 | |SELECT count(1) 85 | |FROM tab 86 | |WHERE start_date = '20190806' 87 | |AND end_date = '20190807' 88 | |AND start_hour = '02' 89 | |AND end_hour = '01' 90 | |AND month = '08' 91 | |AND ab_target = 'after_trade' 92 | |""".stripMargin) 93 | } 94 | 95 | test("test nested variable substitution in sql") { 96 | ConfigContainer :+ ("report_days" -> "3") 97 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> LocalDateTime.parse("2019-08-07T13:25:41")) 98 | val substitution = new VariableSubstitution() 99 | substitution.substitute("SELECT * FROM tab WHERE dt = ${date-${report_days}d|yyyyMMdd}") should 100 | equal("SELECT * FROM tab WHERE dt = 20190804") 101 | } 102 | 103 | test("test parameters with default value") { 104 | val substitution = new VariableSubstitution() 105 | substitution.substitute("!set key1 = ${key1, 'DEFAULT_VALUE1'};") should 106 | equal("!set key1 = DEFAULT_VALUE1;") 107 | 108 | substitution.substitute("!set key1 = ${key1, \"DEFAULT_VALUE1\"};") should 109 | equal("!set key1 = DEFAULT_VALUE1;") 110 | 111 | ConfigContainer :+ ("key1" -> "value1") 112 | 113 | substitution.substitute("!set key1 = ${key1, 'DEFAULT_VALUE1'};") should 114 | 
equal("!set key1 = value1;") 115 | 116 | substitution.substitute("!set key1 = ${key1, \"DEFAULT_VALUE1\"};") should 117 | equal("!set key1 = value1;") 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/udf/DateFormatUDFSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.udf 19 | 20 | import org.apache.spark.sql.{Row, SparkSqlRunnerBase} 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-07-20. 25 | */ 26 | class DateFormatUDFSuite extends SparkSqlRunnerBase { 27 | 28 | test("test date_format function") { 29 | val df = spark.sql("select transform_date('20200710','yyyyMMdd','yyyy-MM-dd')") 30 | checkAnswer(df, Seq(Row("2020-07-10"))) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/util/ConfigUtilSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-02-17. 
26 | */ 27 | class ConfigUtilSuite extends AnyFunSuite { 28 | 29 | 30 | test("trim config array") { 31 | 32 | val columnName = "\n 日期,店铺id,店铺名,买家付款\n " 33 | val dbColumnName = "\n {dt},{store_id},{store_name},{buyer_payment}," + 34 | "{buyer_prepaid}," + 35 | "{inquiry_tailing},{no_order_try},\n {size_query_succeeded},{applicable_season}," + 36 | "{enable_filter_applicable_season},{chat_expires_at},{r2_expires_at},{audit_expires_at}\n " + 37 | " "; 38 | ConfigUtil.trimConfigValue(columnName) should be("日期,店铺id,店铺名,买家付款") 39 | 40 | ConfigUtil.trimConfigArray(dbColumnName, ",") should be( 41 | "{dt},{store_id},{store_name},{buyer_payment},{buyer_prepaid},{inquiry_tailing}," + 42 | "{no_order_try},{size_query_succeeded},{applicable_season}," + 43 | "{enable_filter_applicable_season},{chat_expires_at},{r2_expires_at},{audit_expires_at}") 44 | 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/util/JobIdUtilSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-03-06. 26 | */ 27 | class JobIdUtilSuite extends AnyFunSuite { 28 | 29 | test("test generatorJobId") { 30 | val jobId = JobIdUtil.generatorJobId("conf/marketing/pdd/dwd_payment_reminder_detail.xml") 31 | jobId should fullyMatch regex ("""dwd_payment_reminder_detail-\d{8}_\d{6}""") 32 | } 33 | 34 | } 35 | --------------------------------------------------------------------------------
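For reference, a minimal usage sketch of `JdbcConnector`'s `tag`-prefixed configuration convention (`$tag.url`, `$tag.username`, `$tag.password`, optional `$tag.query.timeout`, plus the separate `tableName` key). The connection values mirror the `mysql.stu` settings registered in `ExternalTableRuleSuite`; the `JdbcConnectorSketch` wrapper object and the count query are illustrative assumptions, not code from the repository:

```scala
import java.sql.ResultSet

import org.apache.spark.sql.util.JdbcConnector

// Hypothetical driver object, for illustration only.
object JdbcConnectorSketch {
  def main(args: Array[String]): Unit = {
    // getJdbcConfig resolves "url", "username", "password" (and "query.timeout",
    // default 180s) under the "$tag." prefix; "tableName" is read directly.
    val connector = new JdbcConnector(Map(
      "tag"            -> "mysql",
      "tableName"      -> "stu",
      "mysql.url"      -> "jdbc:mysql://localhost:3306/test",
      "mysql.username" -> "root",
      "mysql.password" -> "password"
    ))

    // withConnection opens a connection, runs the body, logs and rethrows on
    // failure, and always closes the connection in the finally block.
    val rowCount: Int = connector.withConnection { conn =>
      val rs: ResultSet = conn
        .prepareStatement(s"SELECT count(1) FROM ${connector.tableName}")
        .executeQuery()
      rs.next()
      rs.getInt(1)
    }

    println(s"rows in ${connector.tableName}: $rowCount")
    println(connector.getTableSchema()) // StructType via JdbcUtils.getSchemaOption
  }
}
```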