├── .gitignore ├── .gitlab-ci.yml ├── README.md ├── assembly.xml ├── bin ├── profile.sh └── sql_runner.sh ├── docs ├── DataQuality_PartitionRule.md ├── Data_Check.md ├── External_Sources.md ├── Index_Column.md ├── Trouble_Shooting.md ├── UDF.md └── images │ ├── architecture.png │ ├── dq2_bollinger_model.png │ ├── dq2_ewma_model.png │ ├── dq2_row_number.png │ ├── dq_bollinger_model.png │ ├── dq_ewma_model.png │ └── dq_row_number.png ├── pom.xml ├── scalastyle-config.xml └── src ├── main ├── java │ └── one │ │ └── profiler │ │ ├── AsyncProfiler.java │ │ ├── AsyncProfilerMXBean.java │ │ ├── Counter.java │ │ ├── Events.java │ │ └── ProfileAgent.java ├── resources │ ├── log4j.properties │ └── metrics.properties_template └── scala │ └── org │ └── apache │ └── spark │ └── sql │ ├── SqlRunnerCatalogEventListener.scala │ ├── execution │ └── datasources │ │ ├── jdbc │ │ ├── JDBCCatalog.scala │ │ ├── JDBCDataWriter.scala │ │ ├── JDBCScanBuilder.scala │ │ ├── JDBCTable.scala │ │ ├── JDBCWriteBuilder.scala │ │ ├── MyJDBCOptions.scala │ │ └── MyJDBCUtils.scala │ │ └── kafka │ │ ├── KafkaCatalog.scala │ │ ├── KafkaDataWriter.scala │ │ ├── KafkaOptions.scala │ │ └── KafkaTable.scala │ ├── hive │ ├── SparkSqlRunner.scala │ └── SqlRunnerMetrics.scala │ ├── optimizer │ ├── CollectValueRule.scala │ ├── DataQualityRule.scala │ ├── ExternalSinkRule.scala │ ├── ExternalTableRule.scala │ ├── InsightExtensions.scala │ ├── PartitionScanLimitRule.scala │ ├── RepartitionRule.scala │ └── SqlRunnerSessionStateBuilder.scala │ ├── plugin │ ├── AsyncProfilePlugin.scala │ ├── ProfilePlugin.scala │ └── YourkitPlugin.scala │ ├── runner │ ├── Alert.scala │ ├── ArgParser.scala │ ├── JobRunner.scala │ ├── callback │ │ ├── ArrayValueCollector.scala │ │ ├── DataCallBack.scala │ │ ├── DataCallBackFactory.scala │ │ ├── DataCheckCallBack.scala │ │ ├── EmailSink.scala │ │ ├── QueryResult.scala │ │ ├── SingleValueCollector.scala │ │ └── Sink.scala │ ├── command │ │ ├── BaseCommand.scala │ │ ├── BlockCommentCommand.scala │ │ ├── CommandFactory.scala │ │ ├── ElseCommand.scala │ │ ├── FiCommand.scala │ │ ├── IfCommand.scala │ │ ├── LineCommentCommand.scala │ │ ├── SetCommand.scala │ │ ├── SourceChars.scala │ │ └── SqlCommand.scala │ ├── config │ │ ├── ApolloClient.scala │ │ └── VariableSubstitution.scala │ ├── container │ │ ├── CollectorContainer.scala │ │ ├── ConfigContainer.scala │ │ └── ContainerTrait.scala │ └── metrics │ │ ├── GraphiteReporter.scala │ │ └── ReporterTrait.scala │ ├── udf │ ├── DateFormatUDF.scala │ └── UDFFactory.scala │ └── util │ ├── ConfigUtil.scala │ ├── DQUtil.scala │ ├── GenericAvroSchema.scala │ ├── JdbcConnector.scala │ ├── JobIdUtil.scala │ ├── Logging.scala │ ├── NextIterator.scala │ ├── OptimizerUtil.scala │ ├── ReflectUtils.scala │ ├── StringUtil.scala │ └── SystemVariables.scala └── test └── scala └── org └── apache └── spark └── sql ├── InsightCatalogEventListenerSuite.scala ├── SQLRunnerSuiteUtils.scala ├── SparkSqlRunnerBase.scala ├── optimizer ├── CollectValueRuleSuite.scala ├── ExternalTableRuleSuite.scala └── PartitionScanLimitRuleSuite.scala ├── runner ├── ArgParserSuite.scala ├── command │ └── CommandSuite.scala └── config │ └── VariableSubstitutionSuite.scala ├── udf └── DateFormatUDFSuite.scala └── util ├── ConfigUtilSuite.scala └── JobIdUtilSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.jar 4 | *.iml 5 | *.war 6 | *.DS_Store 7 | *.project 8 | *.classpath 9 | *.settings 10 | *.factorypath 11 | 
*dependency-reduced-pom.xml 12 | .idea 13 | target 14 | /local-test/ 15 | config-cache/ 16 | *.ipr 17 | *.iws 18 | !bin 19 | lib 20 | conf 21 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - deploy 3 | 4 | .build-deploy: 5 | script: 6 | - 'mvn clean package' 7 | - 'tar zxvf ./target/sql-runner-2.1-bin.tar.gz -C ./target' 8 | - 'rm -rf /data/ws/sql-runner-2.1' 9 | - 'mv ./target/sql-runner-2.1 /data/ws/sql-runner-2.1' 10 | stage: deploy 11 | when: manual 12 | 13 | uat-build-deploy: 14 | extends: .build-deploy 15 | tags: 16 | - uat-nuc1 17 | 18 | stg-build-deploy: 19 | extends: .build-deploy 20 | tags: 21 | - stg-gw1 22 | 23 | prd-build-deploy: 24 | extends: .build-deploy 25 | tags: 26 | - prd-gw1 27 | 28 | pre-build-deploy: 29 | extends: .build-deploy 30 | tags: 31 | - pre-gw1 32 | 33 | nta-build-deploy: 34 | extends: .build-deploy 35 | tags: 36 | - nta-gw1 37 | 38 | prg-build-deploy: 39 | extends: .build-deploy 40 | tags: 41 | - prg-gw1 42 | 43 | sth-build-deploy: 44 | extends: .build-deploy 45 | tags: 46 | - sth-gw1 47 | 48 | jdc-build-deploy: 49 | extends: .build-deploy 50 | tags: 51 | - jdc-gw1 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | sql_runner 是一个以Spark SQL为内核,以SQL为主体,扩展支持了数据质量告警,支持多种外部数据源,支持数据处理流程控制的数据处理引擎。 2 | 3 | ![架构图](docs/images/architecture.png) 4 | 5 | 用户通过sql_runner命令就可以执行一个包含各种扩展SQL的sql任务。 6 | 运行命令:`sql_runner [job_file.sql]` 7 | 8 | # Quick Start 9 | 10 | 编写SQL文件: 11 | 12 | ```sql 13 | /************************************************ 14 | 15 | author: kun.wan 16 | period: day 17 | run_env: PRD 18 | describe: 基础数据处理脚本 19 | 20 | ************************************************/ 21 | 22 | INSERT OVERWRITE TABLE trade.dws_trade partition(dt='${date|yyyyMMdd}') 23 | SELECT * 24 | FROM trade.dwd_trade t 25 | WHERE t.dt = '${date|yyyyMMdd}'; 26 | ``` 27 | 28 | 通过`sql_runner [job_file]` 命令就可以实现将表`trade.dwd_trade`的数据清洗到表`trade.dws_trade`。 29 | 30 | 运行说明: 31 | * 程序的第一部分为任务注释,注释中必须要包含 `author`, `period`, `run_env`, `describe` 这几个字段,主要是基于大型项目中的任务管理考虑,在之后的一些Demo中会将这部分头注释做省略。 32 | * 第二部分是我们需要运行的SQL命令,后面对系统当前支持的命令再详细介绍。 33 | * 在SQL中有看到`${date|yyyyMMdd}` 这样的特殊参数,这个会参考系统的参数管理章节。 34 | 35 | # 系统命令 36 | 37 | 当前系统支持如下命令 38 | * 单行注释命令 39 | * 多行注释命令 40 | * SET参数命令 41 | * IF命令 42 | * SQL命令 43 | 44 | ## 单行注释命令 45 | 46 | 以`--` 作为单行注释开始,系统执行的时候会忽略单行注释 47 | 48 | ## 多行注释命令 49 | 50 | 以`/**` 作为多行注释开始,以 `*/`作为多行注释结束,系统执行的时候会忽略多行注释 51 | 52 | ## SET参数命令 53 | 54 | 以`!set` 作为SET命令开始,以`;` 作为命令结束符, 命令格式: `!set [key]=[value];`, 系统执行的时候解析该参数为系统参数 55 | 56 | ## IF命令 57 | 58 | 以`!if` 作为IF命令开始,以`!fi`作为命令结束符,命令支持`!else`语句分支,命令格式: 59 | ``` 60 | !if ([条件判断语句]) 61 | [命令1] 62 | [命令2] 63 | [命令3] 64 | !else 65 | [命令4] 66 | [命令5] 67 | !fi 68 | ``` 69 | 命令正在执行的时候会对上述条件判断语句进行判断,如果条件为真,执行IF下面的命令,如果条件为假,执行ELSE下面的命令。 70 | 71 | 使用示例1: 对运行环境参数进行判断,来选择IF分支命令的选择执行 72 | 73 | ```sql 74 | !set user = "kun.wan"; 75 | !if (user = 'kun.wan') 76 | select 'if command'; 77 | !else 78 | select 'else command'; 79 | !fi 80 | ``` 81 | 82 | 使用示例2: 根据之前的SQL执行结果进行判断,来选择IF分支命令的选择执行 83 | 84 | ```sql 85 | SELECT /*+ COLLECT_VALUE('row_count', 'c') */ count(1) as c; 86 | SELECT /*+ COLLECT_VALUE('row_count2', 'd') */ count(1) as d; 87 | 88 | !if (row_count = row_count2 and row_count = 1) 89 | select 'row count is 1'; 90 | !else 91 | select 'row count is not 1'; 92 | 
!fi 93 | ``` 94 | 95 | ## SQL命令 96 | 97 | 除去以上命令,其他的代码会被解析为SQL命令,以`;` 作为命令结束符;每个SQL会由SQL引擎解析执行 98 | 99 | 100 | # 参数管理 101 | 102 | 系统执行过程中会有很多运行以来参数,包括时间参数, 系统参数和Set命令参数。 103 | 系统通过set命令,apollo配置等方式进行参数定义,在程序中使用`${variable}`的格式引用参数。 104 | 通过 `${variable, 'DEFAULT_VALUE'}`格式引用参数时,如果没有找到`variable`参数,则返回`DEFAULT_VALUE` 105 | 106 | ## 时间参数 107 | 108 | 时间参数是一个特殊类型的参数,表示任务运行的批次时间,如没有其他参数影响,系统时间由如下决定: 109 | 110 | * 如果是`period=month`, 系统时间表示当前时间的上一个月的1日0点0分0秒 111 | * 如果是`period=day`, 系统时间表示当前时间的前一天的0点0分0秒 112 | * 如果是`period=hour`, 系统时间表示当前时间的上一个小时的的0分0秒 113 | 114 | 说明: 115 | * 时间参数以date 开头,date表示当前job的运行批次时间。 116 | * 时间可以通过`+`和`-`来进行时间的加减运行 117 | * 在做时间运算的时候以数字和时间单位表示加减的时间窗口,时间单位中,Y表示年,M表示月, D表示天,H表示小时,m表示分钟,S表示秒。 118 | * 输出的时间格式默认为 `yyyyMMdd`,可以通过 `|` 后连接自定义的时间格式来自定义输出时间格式。时间格式为Java 默认的时间解析格式。 119 | 120 | 示例: `${date-3d|yyyyMMdd}` 121 | 122 | ## 系统参数 123 | 124 | 为了方便程序运行,程序启动的时候已经设置了一些系统参数,用于辅助程序运行。 125 | 126 | ## 系统环境参数 127 | 128 | 系统启动的时候会读取`env.xml` 中的配置作为系统参数,另外一些数据库中的配置系统会从Apollo中进行获取。 129 | 130 | 另外系统还支持在命令行中修改一些系统参数: 131 | * --dates : 手工指定系统的运行批次时间,后面可以添加多个日期参数,通过逗号分隔。每个日期参数格式:`--dates 2021-01-01T00:00:00,2021-01-03T00:00:00` 132 | * --dateRange : 手工指定系统的运行批次时间,后续跟批量运行的开始日期和结束日志,参数格式:`--dateRange 2021-01-01T00:00:00 2021-01-03T00:00:00`, 默认会每一个时间单位(天级任务就是一天,小时任务就是一小时)运行一次,可以通过 `--dateRangeStep` 参数修改多少个时间单位运行一次。 133 | * --test : 单次执行该任务,此时任务会把执行日志屏幕输出;如果程序运行错误,不会进行告警。在开发模式和补跑数据时可以添加该参数运行。 134 | * --dryrun : 空跑模式, 此模式会空跑指定任务中的SQL,并在屏幕上输出日志,可用于检查编写的程序正确性。 135 | 136 | ## Set命令参数 137 | 138 | 这个是在任务运行过程中修改系统参数 139 | 140 | # 参数的使用 141 | 142 | 对于上说的各种参数,可以通过`${variable}`格式,在SQL中引用,系统在执行的时候会自动进行参数替换。 143 | 此外,参数还支持嵌套参数引用,即 `${variable1 ${variable2 ${variable3}} }` 144 | 145 | # 高级功能说明 146 | 147 | * [外部数据源的读写](docs/External_Sources.md) 148 | * [数据质量告警](./docs/Data_Check.md) 149 | * [Hive表数据写入时排序索引](./docs/Index_Column.md) 150 | * [Trouble Shooting](./docs/Trouble_Shooting.md) 151 | * [UDF函数](./docs/UDF.md) 152 | 153 | # 日志管理 154 | 155 | * 如果程序在开发环境运行,需要传入参数 `--test`,程序日志直接在命令行输出;如果程序运行出错,不进行告警; 156 | * 如果程序在生产环境运行,程序运行日志输出在目录下 `/tmp/{USER}/${yyyyMMdd}`,程序运行完毕后会将日志归档到HDFS目录 `/metadata/logs/insight/${yyyyMMdd}`;如果程序运行出错,自动进行钉钉告警; 157 | 158 | # Contributing 159 | 160 | 开启本地调试模式 161 | 162 | * 注释掉pom中的 `hive-cli` 和 `hive-exec`两个依赖的provide依赖 163 | * 在resource目录下补充hdfs,yarn,hive的访问配置文件 164 | * 启动 `org.apache.sql.runner.JobRunner` 程序 165 | -------------------------------------------------------------------------------- /assembly.xml: -------------------------------------------------------------------------------- 1 | 4 | bin 5 | 6 | tar.gz 7 | 8 | 9 | 10 | ${project.basedir} 11 | / 12 | 13 | README* 14 | LICENSE* 15 | 16 | 17 | 18 | ${project.basedir}/bin 19 | /bin 20 | 21 | /** 22 | 23 | unix 24 | 0777 25 | 0755 26 | 27 | 28 | ${project.basedir}/src/main/resources 29 | /conf 30 | 31 | /** 32 | 33 | 34 | 35 | 36 | 37 | 38 | true 39 | /lib 40 | false 41 | runtime 42 | 43 | mysql:mysql-connector-java 44 | io.confluent:kafka-avro-serializer 45 | org.apache.kafka:kafka-clients 46 | io.confluent:kafka-schema-registry-client 47 | io.confluent:common-config 48 | io.confluent:common-utils 49 | io.dropwizard.metrics:* 50 | org.glassfish.jersey.core:* 51 | org.glassfish.jersey.containers:* 52 | org.glassfish.jersey.inject:* 53 | com.fasterxml.jackson.core:* 54 | org.apache.thrift:* 55 | org.apache.parquet:* 56 | org.apache.orc:* 57 | org.apache.avro:* 58 | org.apache.hadoop:* 59 | org.apache.hive:* 60 | org.apache.spark:* 61 | 62 | 63 | 64 | 65 | 
-------------------------------------------------------------------------------- /bin/profile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )" 6 | export JAVA_HOME=/usr/java/latest 7 | 8 | if [ $# == 0 ]; then 9 | echo "miss parameter for profile shell!" 10 | exit 1 11 | fi 12 | 13 | cmd=$1 14 | shift 15 | if [ "${cmd}" == "executor" ];then 16 | exec "${JAVA_HOME}"/bin/java -cp ./sql-runner-2.1.jar one.profiler.ProfileAgent 17 | exit 0 18 | elif [ "${cmd}" == "upload" ]; then 19 | sleep 1s 20 | if [ $# == 2 -a -f $1 ]; then 21 | profile_file=$1 22 | hdfs_file=$2 23 | 24 | lastTime=$(stat -c %Y "$profile_file") 25 | now=$(date +%s) 26 | stop=$(( now + 120 )) 27 | while [ ${now} -lt ${stop} ]; do 28 | sleep 1s 29 | if [ ${lastTime} -eq $(stat -c %Y "$profile_file") ];then 30 | HADOOP_USER_NAME=schedule hdfs dfs -put "${profile_file}" "${hdfs_file}" 31 | rm "${profile_file}" 32 | exit 0 33 | fi 34 | done 35 | rm "${profile_file}" 36 | fi 37 | fi 38 | -------------------------------------------------------------------------------- /bin/sql_runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )" 4 | export JAVA_HOME=/usr/java/latest 5 | 6 | if [ ! $# -ge 1 ]; then 7 | echo "job config file must be provided!" 8 | exit 1 9 | fi 10 | 11 | jobFile=$1 12 | if [ ! -f ${jobFile} ];then 13 | jobFile="${BASEDIR}/${jobFile}" 14 | if [ ! -f ${jobFile} ];then 15 | echo "没有找到job文件: "${jobFile} 16 | exit 17 | fi 18 | fi 19 | jobFile=$(readlink -f ${jobFile}) 20 | shift 21 | 22 | cd ${BASEDIR} 23 | 24 | export SPARK_HOME=/opt/spark 25 | export CLASSPATH=$(echo ${SPARK_HOME}/jars/*.jar | tr ' ' ':'):${CLASSPATH} 26 | export CLASSPATH=$(echo ${BASEDIR}/lib/*.jar | tr ' ' ':'):${CLASSPATH} 27 | 28 | export HADOOP_CONF_DIR=/etc/hadoop/conf 29 | export SPARK_CONF_DIR=${SPARK_HOME}/conf 30 | export CLASSPATH=${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${CLASSPATH} 31 | 32 | export JAVA_OPTS="-Xmx2048m -Xms256m -server -XX:+UseG1GC" 33 | jobFileName=$(basename ${jobFile}) 34 | 35 | export CLASSPATH=${BASEDIR}/conf:${CLASSPATH} 36 | stdoutFile="/tmp/$USER/$(date +%Y%m%d)/${jobFileName%%.*}_$(date +%Y%m%d_%H%M%S).stdout" 37 | mkdir -p "/tmp/$USER/$(date +%Y%m%d)" 38 | if [[ "$*" =~ "--test" ]];then 39 | export JAVA_OPTS="${JAVA_OPTS} -Dinsight.root.logger=INFO,CA,FA -Dinsight.file.stdout=${stdoutFile}" 40 | elif [[ "$*" =~ "--dryrun" ]]; then 41 | export JAVA_OPTS="${JAVA_OPTS} -Dinsight.root.logger=INFO,CA,FA -Dinsight.file.stdout=${stdoutFile}" 42 | else 43 | export JAVA_OPTS="${JAVA_OPTS} -Dinsight.root.logger=INFO,FA -Dinsight.file.stdout=${stdoutFile}" 44 | fi 45 | 46 | # echo "Using CLASSPATH:"$CLASSPATH 47 | export HADOOP_USER_NAME=schedule 48 | if [[ "$*" =~ "--test" ]];then 49 | "${JAVA_HOME}"/bin/java ${JAVA_OPTS} org.apache.sql.runner.JobRunner ${jobFile} $@ 50 | elif [[ "$*" =~ "--dryrun" ]]; then 51 | "${JAVA_HOME}"/bin/java ${JAVA_OPTS} org.apache.sql.runner.JobRunner ${jobFile} $@ 52 | else 53 | "${JAVA_HOME}"/bin/java ${JAVA_OPTS} org.apache.sql.runner.JobRunner ${jobFile} $@ 2>>"${stdoutFile}" 54 | fi 55 | if [ $? 
-ne 0 ];then 56 | "${JAVA_HOME}"/bin/java org.apache.sql.runner.Alert ${jobFile} $@ 57 | fi 58 | -------------------------------------------------------------------------------- /docs/DataQuality_PartitionRule.md: -------------------------------------------------------------------------------- 1 | # Spark分区数据质量检查 2 | 3 | 本系统扩展了Spark的ExternalCatalogEventListener类,增加了对分区表在进行分区变化时的监听。 4 | 对监听到发生变化的分区进行进行表分析。 5 | 获取当前表中的其他历史分区的记录条数,建立记录数预估模型,如果当前分区的记录数不在预估模型内,则进行告警。 6 | 7 | 目前数据预估模型有两类:布林模型(Bollinger)和指数加权移动平均模型(EWMA) 8 | 9 | ## 布林模型(Bollinger) 10 | 11 | MA=最近10天记录数的绝对平均值 12 | MD=最近10天记录数的标准差 13 | (MA, UP, DN) = (MA, MA + 2 * MD, MA - 2 * MD) 14 | 15 | ## 指数加权移动平均模型(EWMA) 16 | 17 | MA=最近10天记录数的指数加权移动平均值 18 | MD=最近10天记录数和当日MA的标准差 19 | (MA, UP, DN) = (MA, MA + 2 * MD, MA - 2 * MD) 20 | 21 | ## 模型对比 22 | 23 | ### 数据对比模型1 24 | 25 | 表的记录数 26 | ![表的记录数](images/dq_row_number.png) 27 | 28 | Bollinger模型效果 29 | ![Bollinger模型效果](images/dq_bollinger_model.png) 30 | 31 | EWMA模型效果 32 | ![EWMA模型效果](images/dq_ewma_model.png) 33 | 34 | ### 数据对比模型2 35 | 36 | 表的记录数 37 | ![表的记录数](images/dq2_row_number.png) 38 | 39 | Bollinger模型效果 40 | ![Bollinger模型效果](images/dq2_bollinger_model.png) 41 | 42 | EWMA模型效果 43 | ![EWMA模型效果](images/dq2_ewma_model.png) -------------------------------------------------------------------------------- /docs/Data_Check.md: -------------------------------------------------------------------------------- 1 | # 数据质量检查 2 | 3 | ## DATA_CHECK([ALERT_MESSAGE], [CHECK_EXPRESSION]) 4 | 5 | 功能说明: 6 | 系统扩展了一个名为 DATA_CHECK的 SQL Hint,用户可以自定义数据检查表达式,如果该Boolean表达式计算结果返回false,会进行钉钉告警。 7 | 8 | 参数说明: 9 | 10 | * ALERT_MESSAGE: 数据检查失败时,告警信息 11 | * CHECK_EXPRESSION: 数据检查Boolean表达式。表达式中可以使用当前SQL中可以访问的任意列数据。 12 | 13 | 使用示例: 14 | 15 | 以下SQL会对比trade表中最近两天的店铺数量,如果差值大于100,则进行钉钉告警。 16 | 17 | ```sql 18 | !set diff_num = 100; 19 | 20 | WITH raw AS ( 21 | SELECT 22 | count(DISTINCT store_id) AS stores 23 | FROM trade 24 | WHERE dt in ('${date-1d|yyyyMMdd}', '${date|yyyyMMdd}') 25 | GROUP BY dt) 26 | SELECT /*+DATA_CHECK('交易表今日与昨日店铺数量差值大于${diff_num}', 'diff < ${diff_num}') */ 27 | max(stores) - min(stores) AS diff 28 | FROM raw; 29 | ``` 30 | -------------------------------------------------------------------------------- /docs/External_Sources.md: -------------------------------------------------------------------------------- 1 | # 读写外部表 2 | 3 | 目前系统支持对JDBC数据源的读写,对Kafka的自动数据写入。在写SQL之前需要配置好相关系统参数和外部表参数。系统参数一般只JDBC连接信息,Kafka Broker地址等共用信息,不需要每个任务都进行配置。外部表参数指Mysql中的表名,表的主键等需要具体配置的参数。 4 | 5 | ## 读写JDBC表 6 | 7 | 功能说明: 8 | 系统支持将JDBC数据源中的一个表或者一个JDBC查询作为一个Spark中表进行读写。设置好JDBC表的相关参数后,通过 `jdbc.[NAMESPACE].[TABLE_NAME]` 名称就可以进行读写了。 9 | 10 | 使用参数说明: 11 | NAMESPACE: JDBC 连接数据库的标识 12 | TABLE_NAME: Spark SQL中注册的表名 13 | 14 | 系统参数说明(一般由系统统一配置,多个任务共享该参数): 15 | * [NAMESPACE].url : JDBC连接的地址 16 | * [NAMESPACE].username : JDBC连接的用户名 17 | * [NAMESPACE].password : JDBC连接的密码 18 | 19 | 读取外部表数据需要配置的参数: 20 | * [NAMESPACE].[TABLE_NAME].numPartitions : 外部表查询的并发数 21 | * [NAMESPACE].[TABLE_NAME].partitionColumn : 外部表并发查询的数据分区字段 22 | * [NAMESPACE].[TABLE_NAME].query : 可选参数,允许将一个JDBC 查询的视图,作为Spark的外部表进行查询 23 | 24 | 计算结果写入外部表需要配置的参数: 25 | * [NAMESPACE].[TABLE_NAME].uniqueKeys : 外部表的数据更新主键,因为数据写入JDBC表使用的是upsert方式,所以必须提供upsert操作的数据主键 26 | 27 | 使用示例: 28 | 29 | Spark读取 bi 数据库中 stores表数据示例: 30 | 31 | ```sql 32 | !set bi.stores.numPartitions = 2; 33 | !set bi.stores.partitionColumn = id; 34 | // query 为可选参数, 如果没有该参数,将直接查询 bi数据库中的stores表,如果有该参数,会查询query的查询结果视图 35 | !set bi.stores.query = """(select * from stores where store_id >10) as subq"""; 36 | 37 | SELECT 
store_id, store_name 38 | FROM jdbc.bi.stores 39 | WHERE store_id < 50; 40 | ``` 41 | 42 | Spark 写入 bi 数据库中 stores表数据示例: 43 | ```sql 44 | !set bi.stores.uniqueKeys = id; 45 | 46 | INSERT INTO jdbc.bi.stores 47 | SELECT 100 as store_id, "store_100" as store_name; 48 | ``` 49 | 50 | ## 数据写入Kafka 51 | 52 | 功能说明: 53 | 将SQL的计算结果插入到Kafka中。目前支持将结果自动转换为 avro 和 json 两种数据格式。 54 | 设置好Kafka的相关参数后,通过 `kafka.[TABLE_NAME]` 名称就可以向Kafka写数据了。 55 | 56 | 使用参数说明: 57 | TABLE_NAME: Spark SQL中注册的表名 58 | 59 | 系统参数说明(一般由系统统一配置,多个任务共享该参数): 60 | * kafka.bootstrap.servers : Kafka集群的Broker地址 61 | * kafka.schema.registry.url : 可选参数,如果Kafka 集群是Confluent版本的Kafka,可以管理Avro格式kafka数据,avro的schema由schema registry进行集中管理。可以配置上对应的schema registry地址。 62 | 63 | 写入json格式数据需要配置的参数: 64 | * kafka.[TABLE_NAME].kafkaTopic: kafka的topic名称 65 | * kafka.[TABLE_NAME].recordType: 填写 json 66 | * kafka.[TABLE_NAME].maxRatePerPartition : 可选参数,每个spark executor写入kafka的每秒消息数。数据结果数据集比较大,一定要加上速度限制,否则会把kafka写爆掉。 67 | 68 | 写入avro格式数据需要配置的参数: 69 | * kafka.[TABLE_NAME].kafkaTopic: kafka的topic名称 70 | * kafka.[TABLE_NAME].recordType: 填写 avro 71 | * kafka.[TABLE_NAME].avro.forceCreate : 默认为false, 如果为true,会强制使用计算结果dataframe schema作为kafka avro schema,如果schema registry上已经存在schema则会报错。如果为false,会先从Schema Registry上获取topic的Schema(此时其他avro参数无需配置),如果获取失败,再使用计算结果dataframe schema作为kafka avro schema。 72 | * kafka.[TABLE_NAME].avro.name : 可选参数,如果 `forceCreate` = true, 则必须提供创建avro数据schema需要的 name。 73 | * kafka.[TABLE_NAME].avro.namespace : 可选参数,如果 `forceCreate` = true, 则必须提供创建avro数据schema需要的 namespace。 74 | * kafka.[TABLE_NAME].maxRatePerPartition : 可选参数,每个spark executor写入kafka的每秒消息数。数据结果数据集比较大,一定要加上速度限制,否则会把kafka写爆掉。 75 | 76 | 使用示例: 77 | 78 | 向kafka写入json数据示例: 79 | 80 | ```sql 81 | !set kafka.test_topic.recordType = json; 82 | !set kafka.test_topic.kafkaTopic = test_topic; 83 | 84 | INSERT INTO kafka.test_topic 85 | SELECT 100 as id, "user_100" as name; 86 | ``` 87 | 88 | 向kafka写入avro数据示例: 89 | 90 | ```sql 91 | !set kafka.test_topic2.recordType = avro; 92 | !set kafka.test_topic2.kafkaTopic = test_topic2; 93 | -- 根据计算结果DDL自动生成Avro Schema 94 | !set kafka.test_topic2.avro.forceCreate = true; 95 | !set kafka.test_topic2.avro.name = student; 96 | !set kafka.test_topic2.avro.namespace = com.wankun; 97 | 98 | INSERT INTO kafka.test_topic2 99 | SELECT 1 as id1, 'wankun' as name1, '男' as sex1, 'PRD' env1; 100 | ``` 101 | 102 | 注意: 103 | Avro 1.8.* 版本对于Enum类型支持有问题。比较trick的解决办法是直接将Spark安装环境下的avro包删除掉. 
104 | 因为hive-exec-1.1.0-cdh5.13.3.jar包 assemb 了 avro 的 1.7.6-cdh5.13.3的包,所以运行完全没问题。 105 | 106 | ## DINGDING_SINK(DING_BOT) 107 | 108 | 功能说明: 109 | 将SQL的程序结果通过钉钉机器人发送到钉钉群。 110 | 111 | 参数说明: 112 | 113 | * DING_BOT: 钉钉机器人名称 114 | 115 | 辅助参数说明: 116 | !data_alert.title=chatlog白名单店铺没有拉取到chatlog数据; 117 | !data_alert.pattern={store_id}: {store_name}; 118 | 119 | * ${DING_BOT} : 钉钉机器人Token 120 | * ${DING_BOT}.title : 钉钉群信息Title 121 | * ${DING_BOT}.pattern : 钉钉信息格式 122 | 123 | 使用示例: 124 | 125 | ```sql 126 | !data_alert.title=trade信息告警; 127 | !data_alert.pattern={store_id}: {store_name}; 128 | 129 | select /*+ DINGDING_SINK(data_alert) */ 130 | distinct a.store_id, a.store_name 131 | FROM trade a 132 | WHERE a.dt='${date|yyyyMMdd}'; 133 | ``` 134 | 135 | ## EMAIL_SINK(EMAIL_BOT) 136 | 137 | 功能说明: 138 | 将SQL的程序结果发送邮件。 139 | 140 | 参数说明: 141 | 142 | * EMAIL_BOT: Email发送机器人名称 143 | 144 | 辅助参数说明: 145 | 146 | * ${EMAIL_BOT} : Email 机器人标识 147 | * ${EMAIL_BOT}.columns : 需要取结果数据中的哪些字段 148 | * ${EMAIL_BOT}.columnNames : 结果数据中字段对应的中文名 149 | * ${EMAIL_BOT}.subject : 邮件标题 150 | * ${EMAIL_BOT}.email-to : 邮件接收人地址,多个地址使用逗号分割 151 | * ${EMAIL_BOT}.email-cc : 邮件抄送人地址,多个地址使用逗号分割 152 | 153 | 使用示例: 154 | 155 | ```sql 156 | !set email.columns={store_id}, {store_name}; 157 | !set email.columnNames=ID,名称; 158 | !set email.subject = 测试邮件; 159 | !set email.email-to = test-to@abc.com; 160 | !set email.email-cc = test-cc@abc.com; 161 | 162 | select /*+ EMAIL_SINK(email) */ 163 | distinct a.store_id, a.store_name 164 | FROM trade a 165 | WHERE a.dt='${date|yyyyMMdd}'; 166 | ``` -------------------------------------------------------------------------------- /docs/Index_Column.md: -------------------------------------------------------------------------------- 1 | # Hive表数据写入时排序索引 2 | 3 | 如果程序是将计算结果插入到hive表,运行前会先进行判断hive表是否有 `index_column` 属性,如果有,结果数据会根据这个列进行数据排序。 4 | 后续对该表的数据查询,如果带有`index_column`列的查询条件,数据会进行非常多的文件级过滤。 5 | 6 | hive表设置索引列方法: 7 | 8 | `alter table t set tblproperties('index_column'='col');` -------------------------------------------------------------------------------- /docs/Trouble_Shooting.md: -------------------------------------------------------------------------------- 1 | # Trouble Shooting 2 | 3 | ## 常见问题整理 4 | 5 | * Executor OOM 6 | 7 | ``` 8 | 19:50:23.535 Executor task launch worker for task 3 ERROR org.apache.spark.executor.Executor: Exception in task 0.3 in stage 0.0 (TID 3) 9 | java.lang.OutOfMemoryError: Java heap space 10 | ``` 11 | 默认SparkExecutor 12 | 目前给Spark的Driver和Executor都配置了2G内存。如果出现上述OOM错误,可以尝试增加Executor内存 13 | ```xml 14 | 15 | 16 | spark.executor.memory 17 | 4g 18 | 19 | 20 | ``` 21 | 22 | * 读取parquet文件出现很多读取空文件的Task 23 | 24 | 这个是Spark FileSourceStrategy在将LogicalPlan 转换为 FileSourceScanExec(DataSourceScanExec的子类)时的BUG。 25 | 虽然spark在计算每个split时会按照比较理想的参数去计算split,但是物理执行时对应的FileFormat(ParquetFileFormat)读取文件时可能会读到空数据。 26 | ``` 27 | def maxSplitBytes( 28 | sparkSession: SparkSession, 29 | selectedPartitions: Seq[PartitionDirectory]): Long = { 30 | val defaultMaxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes 31 | val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes 32 | val defaultParallelism = sparkSession.sparkContext.defaultParallelism 33 | val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum 34 | val bytesPerCore = totalBytes / defaultParallelism 35 | 36 | Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore)) 37 | } 38 | ``` 39 | 40 | 可以通过调大openCostInBytes参数,也可以关闭向FileSourceScanExec算子的转化来处理。 41 | 
```xml 42 | 43 | spark.sql.hive.convertMetastoreParquet 44 | false 45 | 46 | ``` 47 | 48 | * 复杂Streaming任务Driver端OOM 49 | 50 | Spark UI默认会保留最近1000个executions的执行内容,供用户查看。但是如果每个execution的解析计划比较大,就比较容易造成driver端OOM。 51 | 例如,我遇到的一个Streaming任务,每个任务的Spark Plan内存占用约4M,很危险。通过减小`spark.sql.ui.retainedExecutions` 参数后,系统恢复稳定。 52 | 53 | ## Spark 任务profile 54 | 55 | 系统内置了async-profile工具用于对Spark程序运行过程进行Profile分析。 56 | 因为async-profile工具是基于 perf_events 进行程序采样分析的,所以要求集群机器上开启对应的系统参数。 57 | ``` 58 | sysctl -w kernel.perf_event_paranoid=1 59 | sysctl -w kernel.kptr_restrict=0 60 | ``` 61 | 62 | 在开启executor profile过程中,程序会占用额外的内存资源,有可能会被NodeManager以内存用超而Kill掉。 63 | 为了能够将profile结果上传到HDFS,需要修改NodeManager参数`yarn.nodemanager.sleep-delay-before-sigkill.ms=60000` 64 | 65 | Spark任务执行过程profile目前提供如下三种方式: 66 | 67 | ### 直接查看堆栈法 68 | 69 | 对于简单的任务,可以直接进行spark job管理页面,查看运行慢的executor对应Thread Dump,分析具体那个Thread运行慢导致 70 | 71 | ### 自动Profile Executor法 72 | 73 | 对于运行时间较短,但是运行比较慢的任务,可以通过 `--profile` 参数开启对executor进程的Profile。 74 | 75 | profile有如下三种profile结果,默认生成JFR文件: 76 | * 通过`--config spark.profile.type=jfr`来指定生成JFR文件 77 | * 通过`--config spark.profile.type=svg`来指定生成SVG火焰图 78 | * 通过`--config spark.profile.type=yourkit`来指定生成Yourkit snapshot文件 79 | 80 | executor运行完毕后会将生成的JFR文件上传到HDFS的`/metadata/logs/profile/${applicationId}/${attemptId}/` 路径。 81 | 82 | ### 手动Profile Executor法 83 | 84 | 对于运行时间较长,不需要权量进行Profile的Executor可以可以通过 `--profile --config spark.profile.manualprofile=true` 参数手动开启profile。 85 | 此时可以进入executor执行的节点的对应进程启动目录,执行 `profile.sh executor`,再依次输入`start` 和`stop` 命令,profile结束后会在当前机器生成火焰图文件 `/tmp/executor_${PID}.svg` 。 86 | 87 | PS: 程序内部使用MXBean 进行Profile管理,并提供了Shell工具进行外部管理。如果有JS好的同学可以直接修改Spark Executor页面,增加开始和结束并直接查看火焰图是最方便的了。 -------------------------------------------------------------------------------- /docs/UDF.md: -------------------------------------------------------------------------------- 1 | # UDF函数 2 | 3 | ## UDF函数分类: 4 | 5 | * 不包含业务数据处理逻辑的UDF: 建议直接写入本项目内,这样方便UDF的代码复用 6 | * 包含业务处理逻辑的UDF: 这类代码一般不可复用,但是代码仍然要做到统一管理,所以建议和公司内一样,使用独立的repo来管理代码。 7 | 接下来说的扩展UDF函数开发流程,指的就是这类UDF函数。 8 | 9 | ## 扩展UDF函数开发流程 10 | 11 | ### 编写UDF开发函数 12 | 13 | UDF函数开发规范 14 | 15 | * 代码类以`com.wankun.[业务模块].[主业务逻辑]UDF`命名,类名上需要添加`UDFDescription`注解。 16 | * 注解中的`name`属性表示注册的函数名 17 | * 注解中的`returnType`属性表示函数的函数数据类型 18 | * 注解中的`description`属性表示函数的说明 19 | * 原则上所有业务逻辑必须要有明确`description`说明,主函数需要有UT测试类 20 | 21 | ### 部署UDF函数 22 | 23 | 编译项目,并上传结果jar到hdfs目录:`/deploy/config/biz-udfs-1.0.jar` (PS: 一般CI可以做到自动化) 24 | 25 | ``` 26 | mvn clean package 27 | hdfs dfs -put -f ./target/biz-udfs-1.0.jar /deploy/config/biz-udfs-1.0.jar 28 | ``` 29 | 30 | # 开发例子说明 31 | 32 | 下文以开发推荐归因数据的order转换的UDF函数为例,说明开发和使用步骤。 33 | 34 | ## 开发UDF 函数,并实现UDF4的call方法,实现传入4个参数,输出一个参数的UDF函数 35 | 36 | ```java 37 | package com.wankun.udfs.recommend; 38 | 39 | @UDFDescription( 40 | name = "attribution_orders", 41 | returnType = "array>", 43 | description = "在对推荐归因计算时,转换原始trade中的orders数据") 44 | public class AttributionOrdersUDF 45 | implements UDF4, String, WrappedArray, WrappedArray> { 46 | 47 | @Override 48 | public WrappedArray call(String abTarget, 49 | WrappedArray priorSpuIds, 50 | String dispatchSpuId, 51 | WrappedArray originOrders) throws Exception { 52 | 53 | } 54 | } 55 | ``` 56 | 57 | ## 使用函数 58 | 59 | ```sql 60 | 61 | !set spark.sql.externalUdfClasses = com.wankun.udfs.recommend.AttributionOrdersUDF; 62 | 63 | SELECT attribution_orders(ab_target, prior_spu_id, spu_id, orders) as orders 64 | FROM trade; 65 | ``` 
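
补充一个不含业务逻辑、仅用于演示上述开发规范的最小示例(包名、类名与函数名均为假设的演示用名称,并非本项目已有代码;`UDFDescription` 注解的 `name`、`returnType`、`description` 属性即上文约定的三个属性):

```java
// 演示用示例;UDFDescription 为本项目自带注解,其 import 路径与具体项目有关,此处省略
package com.wankun.udfs.demo;

import org.apache.spark.sql.api.java.UDF1;

@UDFDescription(
    name = "normalize_store_name",                 // 注册到 Spark SQL 的函数名
    returnType = "string",                         // 函数返回值对应的 Spark SQL 数据类型
    description = "去除店铺名称首尾空格并转为小写")     // 函数功能说明
public class NormalizeStoreNameUDF implements UDF1<String, String> {

  @Override
  public String call(String storeName) throws Exception {
    // 对 null 输入直接返回 null,避免函数在缺失数据上抛出异常
    return storeName == null ? null : storeName.trim().toLowerCase();
  }
}
```

该类打包部署后,同样通过 `spark.sql.externalUdfClasses` 参数注册,即可在 SQL 中以 `normalize_store_name(store_name)` 方式调用。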
-------------------------------------------------------------------------------- /docs/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/architecture.png -------------------------------------------------------------------------------- /docs/images/dq2_bollinger_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq2_bollinger_model.png -------------------------------------------------------------------------------- /docs/images/dq2_ewma_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq2_ewma_model.png -------------------------------------------------------------------------------- /docs/images/dq2_row_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq2_row_number.png -------------------------------------------------------------------------------- /docs/images/dq_bollinger_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq_bollinger_model.png -------------------------------------------------------------------------------- /docs/images/dq_ewma_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq_ewma_model.png -------------------------------------------------------------------------------- /docs/images/dq_row_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wankunde/sql-runner/57998a059d610b358b96988ab9ca43b96bf31d06/docs/images/dq_row_number.png -------------------------------------------------------------------------------- /src/main/java/one/profiler/AsyncProfilerMXBean.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Andrei Pangin 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package one.profiler; 18 | 19 | /** 20 | * AsyncProfiler interface for JMX server. 21 | * How to register AsyncProfiler MBean: 22 | * 23 | *
<pre>{@code
24 |  *     ManagementFactory.getPlatformMBeanServer().registerMBean(
25 |  *             AsyncProfiler.getInstance(),
26 |  *             new ObjectName("one.profiler:type=AsyncProfiler")
27 |  *     );
28 |  * }</pre>
29 | */ 30 | public interface AsyncProfilerMXBean { 31 | void start(String event, long interval) throws IllegalStateException; 32 | void resume(String event, long interval) throws IllegalStateException; 33 | void stop() throws IllegalStateException; 34 | 35 | long getSamples(); 36 | String getVersion(); 37 | 38 | String execute(String command) throws IllegalArgumentException, java.io.IOException; 39 | 40 | String dumpCollapsed(Counter counter); 41 | String dumpTraces(int maxTraces); 42 | String dumpFlat(int maxMethods); 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/one/profiler/Counter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Andrei Pangin 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package one.profiler; 18 | 19 | /** 20 | * Which metrics to use when generating profile in collapsed stack traces format. 21 | */ 22 | public enum Counter { 23 | SAMPLES, 24 | TOTAL 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/one/profiler/Events.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Andrei Pangin 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package one.profiler; 18 | 19 | /** 20 | * Predefined event names to use in {@link AsyncProfiler#start(String, long)} 21 | */ 22 | public class Events { 23 | public static final String CPU = "cpu"; 24 | public static final String ALLOC = "alloc"; 25 | public static final String LOCK = "lock"; 26 | public static final String WALL = "wall"; 27 | public static final String ITIMER = "itimer"; 28 | } 29 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file core/target/unit-tests.log 19 | insight.root.logger=INFO,CA 20 | insight.file.stdout=/tmp/stdout 21 | log4j.rootLogger=${insight.root.logger} 22 | 23 | #Console Appender 24 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 25 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.CA.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %p %c: %m%n 27 | log4j.appender.CA.Threshold = TRACE 28 | log4j.appender.CA.follow = true 29 | 30 | #File Appender 31 | log4j.appender.FA=org.apache.log4j.FileAppender 32 | log4j.appender.FA.append=false 33 | log4j.appender.FA.file=${insight.file.stdout} 34 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout 35 | log4j.appender.FA.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %p %c: %m%n 36 | 37 | # Set the logger level of File Appender to WARN 38 | log4j.appender.FA.Threshold = TRACE 39 | 40 | # Some packages are noisy for no good reason. 41 | log4j.additivity.parquet.hadoop.ParquetRecordReader=false 42 | log4j.logger.parquet.hadoop.ParquetRecordReader=OFF 43 | 44 | log4j.additivity.parquet.hadoop.ParquetOutputCommitter=false 45 | log4j.logger.parquet.hadoop.ParquetOutputCommitter=OFF 46 | 47 | log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false 48 | log4j.logger.org.apache.hadoop.hive.serde2.lazy.LazyStruct=OFF 49 | 50 | log4j.additivity.org.apache.hadoop.hive.metastore.RetryingHMSHandler=false 51 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=OFF 52 | 53 | log4j.additivity.hive.ql.metadata.Hive=false 54 | log4j.logger.hive.ql.metadata.Hive=OFF 55 | 56 | # Parquet related logging 57 | log4j.logger.parquet.hadoop=WARN 58 | log4j.logger.org.apache.spark.sql.parquet=WARN 59 | 60 | log4j.logger.org.spark_project.jetty=ERROR 61 | log4j.logger.org.apache.spark=WARN 62 | log4j.logger.org.apache.spark.deploy.yarn=INFO 63 | log4j.logger.org.apache.hadoop.hive.ql=INFO 64 | log4j.logger.org.apache.hadoop.hive.metastore=WARN 65 | log4j.logger.org.apache.hadoop.hive.ql.log.PerfLogger=WARN 66 | log4j.logger.org.apache.hadoop.mapreduce.lib=INFO 67 | log4j.logger.org.apache.spark.sql=INFO 68 | 69 | log4j.logger.BlockManagerMasterEndpoint=ERROR 70 | 71 | log4j.logger.org.apache.spark.sql.execution.datasources.FileSourceStrategy=WARN 72 | 73 | # to enable RuleExecutor log in Spark2 74 | #log4j.logger.org.apache.spark.sql.hive=TRACE 75 | #log4j.logger.org.apache.spark.sql.hive.client=INFO 76 | #log4j.logger.org.apache.spark.sql.hive.HiveMetastoreCatalog=DEBUG 77 | #log4j.logger.org.apache.spark.sql.execution.FileSourceScanExec=DEBUG 78 | 79 | # to enable RuleExecutor log in Spark3, set this configuration in spark_default.xml 80 | #spark.sql.optimizer.planChangeLog.level=INFO 81 | -------------------------------------------------------------------------------- /src/main/resources/metrics.properties_template: -------------------------------------------------------------------------------- 1 | # USING : --files metrics.properties 2 | *.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink 3 | 
*.sink.graphite.host=graphite_host 4 | *.sink.graphite.port=2003 5 | *.sink.graphite.prefix=java 6 | master.source.jvm.class=org.apache.spark.metrics.source.JvmSource 7 | worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource 8 | driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource 9 | executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCCatalog.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table} 21 | import org.apache.spark.sql.util.Logging 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-04-08. 26 | */ 27 | class JDBCCatalog extends DelegatingCatalogExtension with Logging { 28 | 29 | override def name(): String = "JDBC" 30 | 31 | override def loadTable(ident: Identifier): Table = JDBCTable(ident) 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCDataWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import java.sql.Connection 21 | 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} 24 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.createConnectionFactory 25 | import org.apache.spark.sql.execution.datasources.jdbc.MyJDBCUtils._ 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.sql.util.Logging 28 | 29 | /** 30 | * @author kun.wan, 31 | * @date 2021-04-07. 32 | */ 33 | class JDBCDataWriter(schema: StructType, options: MyJDBCOptions) 34 | extends DataWriter[InternalRow] with Logging { 35 | 36 | val table = options.tableOrQuery 37 | val uniqueKeys: Set[String] = 38 | options.uniqueKeys.split(",").map(_.trim.toLowerCase).toSet 39 | 40 | val conn: Connection = createConnectionFactory(options)() 41 | conn.setAutoCommit(false) 42 | val (upsertSql, affectColumns, updateColumns) = upsertSqlAndColumns(conn, options) 43 | val stmt = conn.prepareStatement(upsertSql) 44 | 45 | val nameToIndex = schema.names.map(_.toLowerCase).zipWithIndex.toMap 46 | val setters = 47 | (affectColumns ++ updateColumns).zipWithIndex.map { case (column, pos) => 48 | val fieldIndex = nameToIndex(column.toLowerCase) 49 | makeSetter(fieldIndex, pos + 1, schema.fields(fieldIndex).dataType) 50 | } 51 | 52 | var rowCount = 0 53 | val batchSize = options.batchSize 54 | 55 | override def write(row: InternalRow): Unit = { 56 | try { 57 | setters.map(_.apply(stmt, row)) 58 | } catch { 59 | case e: Exception => 60 | logError(s"fail to fill prepare statement params. Row=($row), statement=$stmt") 61 | throw e 62 | } 63 | 64 | stmt.addBatch() 65 | rowCount += 1 66 | if (rowCount % batchSize == 0) { 67 | val updateCounts = stmt.executeBatch().length 68 | // upsertCount.add(updateCounts) 69 | logInfo(s"commit JDBC PreparedStatement,affected rows = ${updateCounts}, " + 70 | s"statement counter = ${rowCount}") 71 | 72 | rowCount = 0 73 | } 74 | } 75 | 76 | override def commit(): WriterCommitMessage = { 77 | val updateCounts = stmt.executeBatch().length 78 | // upsertCount.add(updateCounts) 79 | logInfo(s"commit JDBC PreparedStatement,affected rows = ${updateCounts}, " + 80 | s"statement counter = ${rowCount}") 81 | conn.commit() 82 | new WriterCommitMessage() {} 83 | } 84 | 85 | override def abort(): Unit = { 86 | conn.rollback() 87 | } 88 | 89 | override def close(): Unit = { 90 | stmt.close() 91 | conn.close() 92 | } 93 | 94 | def upsertSqlAndColumns(conn: Connection, 95 | options: JDBCOptions): (String, Array[String], Array[String]) = { 96 | val tableSchema = JdbcUtils.getSchemaOption(conn, options) 97 | assert(tableSchema.isDefined, s"Fail to get $table in db, maybe $table does not exist") 98 | val tableColumnNames = tableSchema.get.fieldNames 99 | val rddSchemaNames = schema.names.map(_.toLowerCase) 100 | val affectColumns = tableColumnNames.filter(col => rddSchemaNames.contains(col.toLowerCase)) 101 | val updateColumns = affectColumns.filter(col => !uniqueKeys.contains(col.toLowerCase)) 102 | tableColumnNames.filterNot(affectColumns.contains) 103 | .foreach(col => logWarning(s"row schema doesn't contains column : {${col} }")) 104 | 105 | val upsertSql = 106 | s""" 107 | |INSERT INTO ${table} (${affectColumns.mkString(", ")}) 108 | |VALUES ( ${affectColumns.map(_ => "?").mkString(", ")} ) 109 | |ON DUPLICATE KEY UPDATE ${updateColumns.map(_ + "= ?").mkString(", ")} 110 | |""".stripMargin 111 | logInfo(s"upsert sql : 
$upsertSql") 112 | (upsertSql, affectColumns, updateColumns) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import java.sql.Connection 21 | import java.util 22 | 23 | import scala.collection.JavaConverters._ 24 | import scala.collection.mutable 25 | 26 | import org.apache.spark.sql.connector.catalog._ 27 | import org.apache.spark.sql.connector.read.ScanBuilder 28 | import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} 29 | import org.apache.spark.sql.runner.container.ConfigContainer 30 | import org.apache.spark.sql.types.StructType 31 | import org.apache.spark.sql.util.{CaseInsensitiveStringMap, Logging} 32 | 33 | /** 34 | * @author kun.wan, 35 | * @date 2021-04-07. 
36 | * 37 | * 一般的实现里,会有一个Source类,继承 RelationProvider 和 TableProvider,负责提供Relation 和 Table对象。 38 | * 然后调用 DataSourceV2Utils.getTableFromProvider() 方法,从Provider 获取table实例的方法,但是我感觉这样 39 | * 还不如直接new 一个Table实例方便,那样做反而更绕了~~ 40 | */ 41 | case class JDBCTable(ident: Identifier) extends Table 42 | with SupportsRead 43 | with SupportsWrite 44 | with Logging { 45 | 46 | import MyJDBCOptions._ 47 | 48 | val namespace = ident.namespace()(0) 49 | val relationName = ident.name() 50 | 51 | val tableOrQuery = 52 | ConfigContainer.getOrElse(s"$namespace.$relationName.query", ident.name()) 53 | 54 | val jdbcOptions = { 55 | val parameters = mutable.Map( 56 | JDBC_URL -> ConfigContainer.get(s"$namespace.url"), 57 | "user" -> ConfigContainer.get(s"$namespace.username"), 58 | "password" -> ConfigContainer.get(s"$namespace.password"), 59 | JDBC_TABLE_NAME -> tableOrQuery 60 | ) 61 | Seq( 62 | JDBC_PARTITION_COLUMN, 63 | JDBC_NUM_PARTITIONS, 64 | JDBC_QUERY_TIMEOUT, 65 | JDBC_BATCH_FETCH_SIZE, 66 | JDBC_PUSHDOWN_PREDICATE, 67 | JDBC_UNIQUE_KEYS 68 | ).map(optionName => optionName -> s"$namespace.$relationName.$optionName") 69 | .filter(option => ConfigContainer.contains(option._2)) 70 | .foreach { option => parameters += (option._1 -> ConfigContainer.get(option._2)) } 71 | 72 | // 读数据使用新的分区算法,JDBC_PARTITION_COLUMN 为必须参数,JDBC_LOWER_BOUND, JDBC_UPPER_BOUND 传入伪参数 73 | if (parameters.contains(JDBC_PARTITION_COLUMN)) { 74 | parameters += (JDBC_LOWER_BOUND -> "0") 75 | parameters += (JDBC_UPPER_BOUND -> "0") 76 | } 77 | 78 | // JDBC 更新数据时需要准备好更新的表的数据主键 79 | new MyJDBCOptions(parameters.toMap) 80 | } 81 | 82 | override def name(): String = ident.toString 83 | 84 | /** 85 | * JDBC表写的时候,schema通过child Plan自动解析生成 86 | * JDBC表读的时候,进行schema自动推测 87 | * @return 88 | */ 89 | override def schema(): StructType = { 90 | if (ConfigContainer.contains(s"${ident.toString}.schemaDDL")) { 91 | StructType.fromDDL(ConfigContainer.get(s"${ident.toString}.schemaDDL")) 92 | } else { 93 | val conn: Connection = MyJDBCUtils.createConnectionFactory(jdbcOptions)() 94 | try { 95 | JdbcUtils.getSchemaOption(conn, jdbcOptions).get 96 | } finally { 97 | conn.close() 98 | } 99 | } 100 | } 101 | 102 | override def capabilities(): util.Set[TableCapability] = 103 | Set(TableCapability.BATCH_READ, 104 | TableCapability.BATCH_WRITE).asJava 105 | 106 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { 107 | Seq( 108 | JDBC_URL, 109 | "user", 110 | "password", 111 | JDBC_TABLE_NAME, 112 | JDBC_PARTITION_COLUMN, 113 | JDBC_NUM_PARTITIONS 114 | ).foreach { option => 115 | require(jdbcOptions.parameters.contains(option), 116 | s"parameter $option is needed in JDBC read") 117 | } 118 | 119 | new JDBCScanBuilder(schema, jdbcOptions) 120 | } 121 | 122 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { 123 | Seq( 124 | JDBC_URL, 125 | "user", 126 | "password", 127 | JDBC_TABLE_NAME, 128 | JDBC_UNIQUE_KEYS 129 | ).foreach { option => 130 | require(jdbcOptions.parameters.contains(option), 131 | s"parameter $option is needed in JDBC write") 132 | } 133 | 134 | new JDBCWriteBuilder(schema, jdbcOptions) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCWriteBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import org.apache.spark.sql.catalyst.InternalRow 21 | import org.apache.spark.sql.connector.write._ 22 | import org.apache.spark.sql.types.StructType 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2021-04-07. 27 | */ 28 | class JDBCWriteBuilder(schema: StructType, options: MyJDBCOptions) extends WriteBuilder { 29 | 30 | override def buildForBatch(): BatchWrite = new JDBCBatchWrite(schema, options) 31 | 32 | } 33 | 34 | class JDBCBatchWrite(schema: StructType, options: MyJDBCOptions) extends BatchWrite { 35 | 36 | override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = 37 | new JDBCDataWriterFactory(schema, options) 38 | 39 | override def commit(messages: Array[WriterCommitMessage]): Unit = {} 40 | 41 | override def abort(messages: Array[WriterCommitMessage]): Unit = {} 42 | } 43 | 44 | class JDBCDataWriterFactory(schema: StructType, options: MyJDBCOptions) extends DataWriterFactory { 45 | 46 | override def createWriter(partitionId: Int, taskId: Long): DataWriter[InternalRow] = 47 | new JDBCDataWriter(schema, options) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/MyJDBCOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.jdbc 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 23 | 24 | /** 25 | * @author kun.wan, 26 | * 27 | * @date 2021-04-08. 
28 | * 29 | * Spark内置的JDBCOptions 不会序列化用户传入的自定义属性,所以直接自己干 30 | */ 31 | case class MyJDBCOptions(@transient override val parameters: CaseInsensitiveMap[String]) 32 | extends JDBCOptions(parameters) { 33 | 34 | import JDBCOptions._ 35 | 36 | def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) 37 | 38 | def this(url: String, table: String, parameters: Map[String, String]) = { 39 | this(CaseInsensitiveMap(parameters ++ Map( 40 | JDBCOptions.JDBC_URL -> url, 41 | JDBCOptions.JDBC_TABLE_NAME -> table))) 42 | } 43 | 44 | require( 45 | parameters.get(JDBC_TABLE_NAME).isDefined, 46 | s"Option '$JDBC_TABLE_NAME' is required. " + 47 | s"Option '$JDBC_QUERY_STRING' is not applicable while writing.") 48 | 49 | val uniqueKeys = parameters.getOrElse(MyJDBCOptions.JDBC_UNIQUE_KEYS, "") 50 | 51 | var filterWhereClause = parameters.getOrElse(MyJDBCOptions.JDBC_FILTER_WHERE_CLAUSE, "") 52 | 53 | } 54 | 55 | object MyJDBCOptions { 56 | 57 | private val jdbcOptionNames = collection.mutable.Set[String]() 58 | 59 | private def newOption(name: String): String = { 60 | jdbcOptionNames += name.toLowerCase(Locale.ROOT) 61 | name 62 | } 63 | 64 | val JDBC_URL = newOption("url") 65 | val JDBC_TABLE_NAME = newOption("dbtable") 66 | val JDBC_QUERY_STRING = newOption("query") 67 | val JDBC_DRIVER_CLASS = newOption("driver") 68 | val JDBC_PARTITION_COLUMN = newOption("partitionColumn") 69 | val JDBC_LOWER_BOUND = newOption("lowerBound") 70 | val JDBC_UPPER_BOUND = newOption("upperBound") 71 | val JDBC_NUM_PARTITIONS = newOption("numPartitions") 72 | val JDBC_QUERY_TIMEOUT = newOption("queryTimeout") 73 | val JDBC_BATCH_FETCH_SIZE = newOption("fetchsize") 74 | val JDBC_TRUNCATE = newOption("truncate") 75 | val JDBC_CASCADE_TRUNCATE = newOption("cascadeTruncate") 76 | val JDBC_CREATE_TABLE_OPTIONS = newOption("createTableOptions") 77 | val JDBC_CREATE_TABLE_COLUMN_TYPES = newOption("createTableColumnTypes") 78 | val JDBC_CUSTOM_DATAFRAME_COLUMN_TYPES = newOption("customSchema") 79 | val JDBC_BATCH_INSERT_SIZE = newOption("batchsize") 80 | val JDBC_TXN_ISOLATION_LEVEL = newOption("isolationLevel") 81 | val JDBC_SESSION_INIT_STATEMENT = newOption("sessionInitStatement") 82 | val JDBC_PUSHDOWN_PREDICATE = newOption("pushDownPredicate") 83 | val JDBC_UNIQUE_KEYS = newOption("uniqueKeys") 84 | val JDBC_FILTER_WHERE_CLAUSE = newOption("filterWhereClause") 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/kafka/KafkaCatalog.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.kafka 19 | 20 | import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table} 21 | import org.apache.spark.sql.util.Logging 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-04-08. 26 | */ 27 | class KafkaCatalog extends DelegatingCatalogExtension with Logging { 28 | 29 | override def name(): String = "KAFKA" 30 | 31 | override def loadTable(ident: Identifier): Table = KafkaTable(ident) 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/kafka/KafkaOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.kafka 19 | 20 | import java.util.Properties 21 | 22 | import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} 23 | import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer} 24 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig} 25 | import org.apache.kafka.common.serialization.StringSerializer 26 | 27 | import scala.reflect.ClassTag 28 | 29 | import scala.collection.JavaConverters._ 30 | 31 | /** 32 | * @author kun.wan, 33 | * @date 2020-07-13. 
34 | */ 35 | case class KafkaOptions(name: String, config: Map[String, String]) extends Serializable { 36 | val bootstrapServers = config(s"kafka.bootstrap.servers") 37 | val schemaRegistryUrl = config.getOrElse(s"kafka.schema.registry.url", "") 38 | 39 | val topic = config(s"kafka.${name}.kafkaTopic") 40 | val recordType: String = config(s"kafka.${name}.recordType") 41 | val avroName = config.getOrElse(s"kafka.${name}.avro.name", "") 42 | val avroNamespace = config.getOrElse(s"kafka.${name}.avro.namespace", "") 43 | val fieldMapping = config.getOrElse(s"kafka.${name}.avro.fieldMapping", "") 44 | val avroForceCreate = config.getOrElse(s"kafka.${name}.avro.forceCreate", "false") 45 | 46 | val maxRatePerPartition = config.getOrElse(s"kafka.${name}.maxRatePerPartition", "10000000").toInt 47 | 48 | lazy val fieldMappingMap = { 49 | val objectMapper = new ObjectMapper 50 | if (fieldMapping != "") { 51 | objectMapper.readTree(fieldMapping) 52 | .asScala 53 | .map(f => f.path("name").textValue() -> f) 54 | .toMap 55 | } else { 56 | Map[String, JsonNode]() 57 | } 58 | } 59 | 60 | lazy val serialClass: Class[_] = recordType match { 61 | case JSON_TYPE => 62 | classOf[StringSerializer] 63 | case AVRO_TYPE => 64 | classOf[KafkaAvroSerializer] 65 | } 66 | 67 | val JSON_TYPE: String = "json" 68 | val AVRO_TYPE: String = "avro" 69 | 70 | def initProducer[T: ClassTag](): KafkaProducer[String, T] = { 71 | val properties = new Properties 72 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers) 73 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer]) 74 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serialClass) 75 | properties.put(ProducerConfig.ACKS_CONFIG, "all") 76 | properties.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl) 77 | new KafkaProducer[String, T](properties) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/kafka/KafkaTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.kafka 19 | 20 | import java.util 21 | 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.connector.catalog.{Identifier, SupportsWrite, Table, TableCapability} 24 | import org.apache.spark.sql.connector.write._ 25 | import org.apache.spark.sql.types.StructType 26 | import scala.collection.JavaConverters._ 27 | 28 | import org.apache.spark.sql.runner.container.ConfigContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2021-04-06. 
33 | */ 34 | case class KafkaTable(ident: Identifier) extends Table with SupportsWrite { 35 | 36 | override def name(): String = ident.toString 37 | 38 | override def schema(): StructType = 39 | StructType.fromDDL(ConfigContainer.get(s"${ident.toString}.schemaDDL")) 40 | 41 | override def capabilities(): util.Set[TableCapability] = 42 | Set(TableCapability.BATCH_WRITE).asJava 43 | 44 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = 45 | new KafkaWriteBuilder(ident.name(), schema()) 46 | } 47 | 48 | class KafkaWriteBuilder(name: String, schema: StructType) extends WriteBuilder { 49 | 50 | override def buildForBatch(): BatchWrite = new KafkaBatchWrite(name, schema) 51 | 52 | } 53 | 54 | class KafkaBatchWrite(name: String, schema: StructType) extends BatchWrite { 55 | 56 | override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = 57 | new KafkaDataWriterFactory(name, schema) 58 | 59 | override def commit(messages: Array[WriterCommitMessage]): Unit = {} 60 | 61 | override def abort(messages: Array[WriterCommitMessage]): Unit = {} 62 | } 63 | 64 | class KafkaDataWriterFactory(name: String, schema: StructType) extends DataWriterFactory { 65 | 66 | val kafkaOption: KafkaOptions = KafkaOptions(name, ConfigContainer.valueMap.get()) 67 | 68 | override def createWriter(partitionId: Int, taskId: Long): DataWriter[InternalRow] = 69 | new KafkaDataWriter(kafkaOption, schema) 70 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hive/SqlRunnerMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.execution.SparkPlan 22 | import org.apache.spark.sql.execution.command.DataWritingCommandExec 23 | import org.apache.spark.sql.execution.metric.SQLMetric 24 | import org.apache.spark.sql.hive.execution.{HiveTableScanExec, InsertIntoHiveTable} 25 | 26 | /** 27 | * @author kun.wan, 28 | * @date 2020-04-29. 
29 | */ 30 | object SqlRunnerMetrics extends Logging { 31 | 32 | def logSparkPlanMetrics(plan: SparkPlan): Unit = plan match { 33 | case HiveTableScanExec(_, relation, _) => 34 | logInfo(s"source ${relation.nodeName}(${relation.tableMeta.identifier}) metrics : ${formatMetrics(plan.metrics)}") 35 | case DataWritingCommandExec(cmd: InsertIntoHiveTable, _) => 36 | logInfo(s"Insert table ${cmd.table.identifier} metrics : ${formatMetrics(plan.metrics)}") 37 | 38 | case _ => 39 | } 40 | 41 | def formatMetrics(metrics: Map[String, SQLMetric]): Map[String, Long] = metrics.map { 42 | case (name: String, metric: SQLMetric) => 43 | name -> metric.value 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/CollectValueRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnresolvedHint} 23 | import org.apache.spark.sql.catalyst.rules.Rule 24 | import org.apache.spark.sql.util.OptimizerUtil.parseHintParameter 25 | 26 | import org.apache.spark.sql.runner.callback.{ArrayValueCollector, DataCallBackFactory, SingleValueCollector} 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2020-09-15. 31 | */ 32 | object CollectValueRule extends Rule[LogicalPlan] { 33 | 34 | override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { 35 | case hint@UnresolvedHint(hintName, parameters, child) => hintName.toUpperCase(Locale.ROOT) match { 36 | case "COLLECT_VALUE" => 37 | val name: String = parseHintParameter(parameters(0)) 38 | val columnName: String = parseHintParameter(parameters(1)) 39 | DataCallBackFactory.registerDataCallBack(SingleValueCollector(name, columnName)) 40 | 41 | child 42 | 43 | case "COLLECT_ARRAY" => 44 | val name: String = parseHintParameter(parameters(0)) 45 | val columnName: String = parseHintParameter(parameters(1)) 46 | DataCallBackFactory.registerDataCallBack(ArrayValueCollector(name, columnName)) 47 | 48 | child 49 | 50 | case _ => hint 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/DataQualityRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.catalyst.expressions.Literal 24 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 25 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, UnresolvedHint} 26 | import org.apache.spark.sql.catalyst.rules.Rule 27 | import org.apache.spark.sql.{Column, SparkSession} 28 | import org.apache.spark.util.IdGenerator 29 | 30 | import org.apache.spark.sql.runner.callback.{DataCallBackFactory, DataCheckCallBack} 31 | 32 | /** 33 | * @author kun.wan, 34 | * @date 2021-02-20. 35 | */ 36 | case class DataQualityRule(spark: SparkSession) extends Rule[LogicalPlan] { 37 | 38 | import DataQualityRule._ 39 | 40 | override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { 41 | case hint @ UnresolvedHint(hintName, parameters, child) => hintName.toUpperCase(Locale.ROOT) match { 42 | case "DATA_CHECK" => 43 | val checkTitle: String = parameters.head.toString 44 | val dataCheckExpressions = 45 | parameters.tail map { case literal: Literal => 46 | val expression = literal.toString() 47 | val checkResultColumn = generateDataCheckColumnName() 48 | val column = Column.apply(CatalystSqlParser.parseExpression(expression)).as(checkResultColumn) 49 | column.named.children.head.children.find { expr => child.output.contains(expr) } match { 50 | case Some(originColumnExpr) => 51 | DataCallBackFactory.registerDataCallBack( 52 | DataCheckCallBack(checkTitle, 53 | child.output.find( p => p == originColumnExpr).get.name, 54 | checkResultColumn, 55 | expression)) 56 | column.named 57 | 58 | case _ => 59 | throw new RuntimeException("Data check column not matched!") 60 | } 61 | } 62 | 63 | Project(child.output ++ dataCheckExpressions, child) 64 | 65 | case _ => hint 66 | } 67 | } 68 | } 69 | 70 | object DataQualityRule extends Logging { 71 | private val ID_GENERATOR = new IdGenerator 72 | 73 | def generateDataCheckColumnName(): String = { 74 | s"__DATA_CHECK_${ID_GENERATOR.next}__" 75 | } 76 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/ExternalSinkRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import java.util.Locale 21 | 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnresolvedHint} 24 | import org.apache.spark.sql.catalyst.rules.Rule 25 | import org.apache.spark.sql.util.OptimizerUtil.parseHintParameter 26 | 27 | import org.apache.spark.sql.runner.callback.{DataCallBackFactory, EmailSink} 28 | import org.apache.spark.sql.runner.container.ConfigContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-09-15. 33 | */ 34 | case class ExternalSinkRule(spark: SparkSession) extends Rule[LogicalPlan] { 35 | 36 | override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { 37 | case hint@UnresolvedHint(hintName, parameters, child) => hintName.toUpperCase(Locale.ROOT) match { 38 | case "EMAIL_SINK" => 39 | val name = parseHintParameter(parameters(0)) 40 | DataCallBackFactory.registerDataCallBack(EmailSink(name, ConfigContainer.valueMap.get())) 41 | child 42 | 43 | case _ => hint 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/ExternalTableRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, UnresolvedRelation} 22 | import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, LogicalPlan, With} 23 | import org.apache.spark.sql.catalyst.rules.Rule 24 | import org.apache.spark.sql.execution.QueryExecution 25 | import org.apache.spark.sql.runner.container.ConfigContainer 26 | 27 | /** 28 | * @author kun.wan, 29 | * @date 2021-04-07. 
30 | */ 31 | case class ExternalTableRule(spark: SparkSession) extends Rule[LogicalPlan] { 32 | 33 | import spark.sessionState.analyzer._ 34 | 35 | // from Analyzer 36 | private def isResolvingView: Boolean = AnalysisContext.get.catalogAndNamespace.nonEmpty 37 | 38 | // If we are resolving relations insides views, we need to expand single-part relation names with 39 | // the current catalog and namespace of when the view was created. 40 | private def expandRelationName(nameParts: Seq[String]): Seq[String] = { 41 | if (!isResolvingView) return nameParts 42 | 43 | if (nameParts.length == 1) { 44 | AnalysisContext.get.catalogAndNamespace :+ nameParts.head 45 | } else if (spark.sessionState.catalogManager.isCatalogRegistered(nameParts.head)) { 46 | nameParts 47 | } else { 48 | AnalysisContext.get.catalogAndNamespace.head +: nameParts 49 | } 50 | } 51 | 52 | def setSchemaDDL(u: UnresolvedRelation, child: LogicalPlan): Unit = { 53 | expandRelationName(u.multipartIdentifier) match { 54 | case NonSessionCatalogAndIdentifier(catalog, ident) => 55 | val schemaDDL = new QueryExecution(spark, child).analyzed.schema.toDDL 56 | ConfigContainer :+ (s"${ident.toString}.schemaDDL" -> schemaDDL) 57 | 58 | case _ => 59 | } 60 | } 61 | 62 | override def apply(plan: LogicalPlan): LogicalPlan = { 63 | plan match { 64 | case InsertIntoStatement(u: UnresolvedRelation, _, _, query: LogicalPlan, _, _) => 65 | setSchemaDDL(u, query) 66 | 67 | case With(InsertIntoStatement(u: UnresolvedRelation, _, _, query: LogicalPlan, _, _), cteRelations) => 68 | setSchemaDDL(u, With(query, cteRelations)) 69 | 70 | case _ => 71 | } 72 | plan 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/InsightExtensions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.SparkSessionExtensions 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-04-17. 26 | */ 27 | class InsightExtensions extends (SparkSessionExtensions => Unit) with Logging { 28 | def apply(e: SparkSessionExtensions): Unit = { 29 | e.injectOptimizerRule(RepartitionRule) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/optimizer/PartitionScanLimitRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation} 22 | import org.apache.spark.sql.catalyst.expressions._ 23 | import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan} 24 | import org.apache.spark.sql.catalyst.rules.Rule 25 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} 26 | import org.apache.spark.sql.{AnalysisException, SparkSession} 27 | 28 | import scala.collection.mutable.ArrayBuffer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-07-28. 33 | */ 34 | case class PartitionScanLimitRule(spark: SparkSession) extends Rule[LogicalPlan] with Logging { 35 | 36 | val partitionScanLimitEnable: Boolean = 37 | spark.conf.get("spark.partition.scan.limit.enable", "true").toBoolean 38 | 39 | def conditionCheck(partitionColNames: Seq[String], 40 | filters: ArrayBuffer[Expression], 41 | tableMeta: CatalogTable): Unit = { 42 | val filteredAttributes = filters.flatMap(_.references.map(_.name.toLowerCase)) 43 | if ((partitionColNames.map(_.toLowerCase) intersect filteredAttributes).isEmpty) { 44 | val table = tableMeta.identifier 45 | throw new AnalysisException( 46 | s"""No partition column filter condition found for table $table 47 | |partitionColNames : ${partitionColNames.mkString(", ")} 48 | |filteredAttributes : $filteredAttributes 49 | |""".stripMargin) 50 | } 51 | } 52 | 53 | def checkRelationFilters(plan: LogicalPlan, filters: ArrayBuffer[Expression]): Unit = 54 | plan match { 55 | case Filter(condition, child) if condition.deterministic => 56 | checkRelationFilters(child, filters :+ condition) 57 | 58 | case HiveTableRelation(catalogTable, _, partitionCols, _, _) 59 | if partitionCols.nonEmpty => 60 | val partitionColNames = partitionCols.map(_.name) 61 | conditionCheck(partitionColNames, filters, catalogTable) 62 | 63 | case LogicalRelation(relation: HadoopFsRelation, _, catalogTableOpt, _) => 64 | relation.partitionSchemaOption.foreach { partitionSchema => 65 | val partitionColNames = partitionSchema.fieldNames 66 | conditionCheck(partitionColNames, filters, catalogTableOpt.get) 67 | } 68 | 69 | case Join(left, right, _, _, _) => 70 | checkRelationFilters(left, ArrayBuffer[Expression]()) 71 | checkRelationFilters(right, ArrayBuffer[Expression]()) 72 | 73 | case _ => 74 | plan.children.foreach(checkRelationFilters(_, filters)) 75 | } 76 | 77 | override def apply(plan: LogicalPlan): LogicalPlan = { 78 | if (partitionScanLimitEnable) { 79 | checkRelationFilters(plan, ArrayBuffer[Expression]()) 80 | } 81 | plan 82 | } 83 | } 84 | -------------------------------------------------------------------------------- 
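In practical terms, when `spark.partition.scan.limit.enable` is left at its default of `true`, any query that scans a partitioned table without a filter on at least one partition column fails analysis with the `AnalysisException` raised in `conditionCheck` above. A minimal SQL sketch of both cases, using a hypothetical table `demo.events` partitioned by `dt` (the table is illustrative, not part of this repository):

```sql
-- Rejected by PartitionScanLimitRule: the scan carries no filter on the partition column dt
SELECT count(1) FROM demo.events;

-- Accepted: the filter references the partition column, so only one partition is scanned
SELECT count(1)
FROM demo.events e
WHERE e.dt = '20210401';
```

Setting `spark.partition.scan.limit.enable` to `false` in the Spark configuration disables the check entirely.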
/src/main/scala/org/apache/spark/sql/optimizer/RepartitionRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics 21 | import org.apache.hadoop.hive.common.StatsSetupConst 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTablePartition} 25 | import org.apache.spark.sql.catalyst.dsl.expressions._ 26 | import org.apache.spark.sql.catalyst.expressions.SortOrder 27 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Repartition, RepartitionByExpression, _} 28 | import org.apache.spark.sql.catalyst.rules.Rule 29 | import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand 30 | import org.apache.spark.sql.hive.execution.InsertIntoHiveTable 31 | import org.apache.spark.sql.util.SystemVariables.INDEX_COLUMN_NAME 32 | 33 | /** 34 | * @author kun.wan, 35 | * @date 2020-04-17. 
36 | */ 37 | case class RepartitionRule(spark: SparkSession) extends Rule[LogicalPlan] with Logging { 38 | 39 | val DEFAULT_PARTITION_SIZE = 64 * 1024 * 1024L 40 | val SAMPLING_PARTITIONS = 10 41 | 42 | val analyzer = spark.sessionState.analyzer 43 | val catalog = SparkSession.active.sessionState.catalog 44 | 45 | override def apply(plan: LogicalPlan): LogicalPlan = { 46 | val newPlan = plan transform { 47 | case InsertIntoHiveTable(table, partition, query, overwrite, partitionExists, outputCols) 48 | if table.partitionColumnNames.size > 0 && checkQueryType(query) => 49 | 50 | val newQuery: LogicalPlan = transformQuery(table, query) 51 | InsertIntoHiveTable(table, partition, newQuery, overwrite, partitionExists, outputCols) 52 | 53 | case InsertIntoHadoopFsRelationCommand(outputPath, staticPartitions, ifPartitionNotExists, 54 | partitionColumns, bucketSpec, fileFormat, options, query, mode, catalogTable, fileIndex, 55 | outputColumnNames) 56 | if catalogTable.isDefined && (staticPartitions.size + partitionColumns.size) > 0 57 | && checkQueryType(query) => 58 | val newQuery = 59 | transformQuery(catalogTable.get, query) 60 | 61 | InsertIntoHadoopFsRelationCommand( 62 | outputPath, 63 | staticPartitions, 64 | ifPartitionNotExists, 65 | partitionColumns, 66 | bucketSpec, 67 | fileFormat, 68 | options, 69 | newQuery, 70 | mode, 71 | catalogTable, 72 | fileIndex, 73 | outputColumnNames) 74 | } 75 | if (!newPlan.fastEquals(plan)) { 76 | logDebug(s"plan after RepartitionRule:\n$newPlan") 77 | } 78 | newPlan 79 | } 80 | 81 | private def checkQueryType(query: LogicalPlan): Boolean = { 82 | !query.isInstanceOf[Sort] && !query.isInstanceOf[Repartition] && 83 | !query.isInstanceOf[RepartitionByExpression] 84 | } 85 | 86 | private def transformQuery(table: CatalogTable, query: LogicalPlan): LogicalPlan = { 87 | val tableName = table.identifier 88 | val sortExprsOpt: Option[Seq[SortOrder]] = 89 | table.properties.get(INDEX_COLUMN_NAME).map(indexColumn => { 90 | val order = Symbol(indexColumn).attr.asc 91 | Seq(analyzer.resolveExpressionBottomUp(order, query).asInstanceOf[SortOrder]) 92 | }) 93 | 94 | val numPartitionsOpt = repartitionNumbers(catalog.listPartitions(tableName)) 95 | (sortExprsOpt, numPartitionsOpt) match { 96 | case (Some(sortExprs), Some(numPartitions)) => 97 | RepartitionByExpression(sortExprs, query, numPartitions) 98 | 99 | case (Some(sortExprs), None) => Sort(sortExprs, true, query) 100 | case (None, Some(numPartitions)) => Repartition(numPartitions, true, query) 101 | case (None, None) => query 102 | } 103 | } 104 | 105 | /** 106 | * 1. 根据分区创建时间倒排序,取最近创建的分区 107 | * 2. 
sample 采样10个分区元数据来计算分区个数,取结果中位数 108 | * @param partitions 109 | * @return 110 | */ 111 | def repartitionNumbers(partitions: Seq[CatalogTablePartition]): Option[Int] = { 112 | 113 | val stats = new DescriptiveStatistics 114 | if (log.isDebugEnabled) { 115 | partitions.foreach(p => logDebug(s"got partition ${p.simpleString}")) 116 | } 117 | partitions.filter(_.parameters.contains(StatsSetupConst.TOTAL_SIZE)) 118 | .sortWith((p1, p2) => p1.createTime > p2.createTime) 119 | .slice(0, SAMPLING_PARTITIONS) 120 | .foreach { p => 121 | stats.addValue(p.parameters.get(StatsSetupConst.TOTAL_SIZE).get.toLong 122 | / DEFAULT_PARTITION_SIZE) 123 | } 124 | if (stats.getPercentile(50).isNaN) { 125 | None 126 | } else { 127 | val number = stats.getPercentile(50).toInt + 1 128 | if (number > 0) { 129 | Some(number) 130 | } else { 131 | None 132 | } 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/plugin/AsyncProfilePlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.plugin 19 | 20 | import java.lang.management.ManagementFactory 21 | 22 | import javax.management.ObjectName 23 | import one.profiler.AsyncProfiler 24 | 25 | /** 26 | * @author kun.wan, 27 | * @date 2020-05-14. 28 | */ 29 | class AsyncProfilePlugin extends ProfilePlugin { 30 | 31 | var profiler: AsyncProfiler = _ 32 | 33 | override def init0(): Unit = { 34 | profileFile = s"${logDir}/${containerId}.${profileType}" 35 | 36 | profiler = AsyncProfiler.getInstance() 37 | ManagementFactory.getPlatformMBeanServer().registerMBean( 38 | profiler, 39 | new ObjectName("one.profiler:type=AsyncProfiler") 40 | ) 41 | if (!manualProfile) { 42 | logInfo(profiler.execute(s"start,${profileType},file=${profileFile}")) 43 | } 44 | } 45 | 46 | override def shutdown0(): Unit = { 47 | logInfo(profiler.execute(s"stop,file=${profileFile}")) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/plugin/ProfilePlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.plugin 19 | 20 | import java.util.{Map => JMap} 21 | 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.util.SignalUtils 27 | import org.apache.spark.SparkConf 28 | import org.apache.spark.api.plugin.{ExecutorPlugin, PluginContext} 29 | 30 | import scala.reflect.io.File 31 | 32 | 33 | /** 34 | * @author kun.wan, 35 | * @date 2020-05-26. 36 | */ 37 | abstract class ProfilePlugin extends ExecutorPlugin with Logging { 38 | 39 | val pluginName = this.getClass.getName.stripSuffix("$") 40 | 41 | var conf: SparkConf = _ 42 | var manualProfile: Boolean = _ 43 | var profileType: String = _ 44 | 45 | val logDir = System.getProperty("spark.yarn.app.container.log.dir") 46 | val containerId = YarnSparkHadoopUtil.getContainerId 47 | val applicationAttemptId = containerId.getApplicationAttemptId 48 | val applicationId = applicationAttemptId.getApplicationId 49 | 50 | var profileFile: String = _ 51 | 52 | val fs = FileSystem.get(new Configuration()) 53 | var shutdownFlag = false 54 | 55 | def init0(): Unit = {} 56 | 57 | def shutdown0(): Unit = {} 58 | 59 | override def init(ctx: PluginContext, extraConf: JMap[String, String]): Unit = { 60 | conf = ctx.conf() 61 | manualProfile = conf.getBoolean("spark.profile.manualprofile", false) 62 | profileType = conf.get("spark.profile.type", "jfr") 63 | 64 | init0() 65 | logInfo(s"init ProfileExecutorPlugin") 66 | 67 | // Handle SIGTERM from NodeManager 68 | Seq("TERM", "HUP", "INT").foreach { sig => 69 | SignalUtils.register(sig) { 70 | log.error("Executor RECEIVED SIGNAL " + sig) 71 | while(!shutdownFlag) { 72 | Thread sleep 100 73 | log.error("Executor shutdown loopback. SIGNAL " + sig) 74 | } 75 | log.error("ProfilePlugin Shutdown loop end. SIGNAL " + sig) 76 | false 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * 1. Shutdown method is already a ShutdownHook. 83 | * 2. Executor may be killed by NodeManager before the shutdown method is finished. 84 | * The default wait time is 250ms defined by sleepDelayBeforeSigKill in ContainerLaunch Service. 
85 | */ 86 | override def shutdown(): Unit = { 87 | if (!manualProfile) { 88 | logInfo(s"shutdown ${pluginName}") 89 | shutdown0() 90 | 91 | logInfo("begin upload executor profile file.") 92 | 93 | val srcPath = new Path(profileFile) 94 | val dstPath = new Path(s"/metadata/logs/profile/${applicationId}/" + 95 | s"${applicationAttemptId.getAttemptId}/${containerId}.${profileType}") 96 | logInfo(s"profileFile :${srcPath} hdfs path : ${dstPath}") 97 | fs.copyFromLocalFile(true, true, srcPath, dstPath) 98 | File(profileFile).delete() 99 | } 100 | logInfo(s"end ${pluginName}") 101 | shutdownFlag = true 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/plugin/YourkitPlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.plugin 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2020-05-14. 23 | */ 24 | class YourkitPlugin extends ProfilePlugin { 25 | 26 | override def shutdown0(): Unit = { 27 | val controllerCls = Class.forName("com.yourkit.api.Controller") 28 | val controller = controllerCls.newInstance() 29 | 30 | val displayNameMethod = controllerCls.getMethod("capturePerformanceSnapshot") 31 | profileFile = displayNameMethod.invoke(controller).asInstanceOf[String] 32 | 33 | val stopCpuProfilingMethod = controllerCls.getMethod("stopCpuProfiling") 34 | stopCpuProfilingMethod.invoke(controller) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/Alert.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.util.{DQUtil, SystemVariables} 22 | import org.apache.spark.sql.runner.container.ConfigContainer 23 | 24 | /** 25 | * Send an alert when a job run fails and is not a test or dry run. 26 | * 27 | * @author kun.wan, 28 | * @date 2020-02-26. 29 | */ 30 | object Alert extends ArgParser with Logging { 31 | def main(args: Array[String]): Unit = { 32 | if (!args.contains("--test") && !args.contains("--dryrun")) { 33 | parseArgument(args) 34 | val env = ConfigContainer.getOrElse(SystemVariables.ENV, SystemVariables.DEFAULT_ENV) 35 | 36 | val alertMessage = s"$env : 程序 ${args(0)} 运行失败,请检查!" 37 | logError(alertMessage) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/ArgParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import java.time.LocalDateTime 21 | 22 | import org.apache.commons.io.FilenameUtils 23 | import org.apache.commons.lang3.StringUtils 24 | 25 | import org.apache.spark.sql.util.SystemVariables 26 | 27 | import scala.collection.mutable.ArrayBuffer 28 | import scala.io.Source 29 | 30 | import org.apache.spark.sql.runner.command.{BaseCommand, BlockCommentCommand, CommandFactory} 31 | import org.apache.spark.sql.runner.config.ApolloClient 32 | import org.apache.spark.sql.runner.container.ConfigContainer 33 | 34 | /** 35 | * @author kun.wan, 36 | * @date 2020-06-03. 
37 | */ 38 | class ArgParser { 39 | 40 | var batchTimesOpt: Option[Seq[LocalDateTime]] = None 41 | var startDate: Option[LocalDateTime] = None 42 | var endDate: Option[LocalDateTime] = None 43 | var dateRangeStep: Int = 1 44 | var jobFile: String = _ 45 | var commands: Array[BaseCommand] = _ 46 | 47 | def parseArgument(args: Array[String]): Unit = { 48 | if (args.length < 1) { 49 | println("job configuration file must be found!") 50 | System.exit(-1) 51 | } 52 | 53 | val leftArgs = new ArrayBuffer[String]() 54 | var argv = args.toList 55 | 56 | 57 | while (!argv.isEmpty) { 58 | argv match { 59 | case "--dateRange" :: startDateStr :: endDateStr :: tail => 60 | startDate = Some(LocalDateTime.parse(startDateStr)) 61 | endDate = Some(LocalDateTime.parse(endDateStr)) 62 | argv = tail 63 | case "--dates" :: dates :: tail => 64 | batchTimesOpt = Some(dates.split(",").map(LocalDateTime.parse(_)).toSeq) 65 | argv = tail 66 | case "--config" :: value :: tail => 67 | val tup = value.split("=") 68 | ConfigContainer :+ (tup(0) -> tup(1)) 69 | argv = tail 70 | case "--profile" :: tail => 71 | ConfigContainer :+ ("spark.profile" -> "true") 72 | argv = tail 73 | case "--dryrun" :: tail => 74 | ConfigContainer :+ ("dryrun" -> "true") 75 | argv = tail 76 | case "--dateRangeStep" :: dateRangeStepStr :: tail => 77 | dateRangeStep = dateRangeStepStr.toInt 78 | argv = tail 79 | case head :: tail if head != null => 80 | leftArgs.append(head) 81 | argv = tail 82 | } 83 | } 84 | 85 | jobFile = leftArgs(0) 86 | 87 | ConfigContainer :+ (SystemVariables.JOB_NAME -> FilenameUtils.getBaseName(jobFile)) 88 | 89 | if (StringUtils.isNotBlank(System.getenv(SystemVariables.APOLLO_META))) { 90 | ConfigContainer :+ (SystemVariables.APOLLO_META -> System.getenv(SystemVariables.APOLLO_META)) 91 | } 92 | 93 | commands = CommandFactory.parseCommands(Source.fromFile(jobFile).mkString) 94 | require(commands.length > 0 && commands(0).isInstanceOf[BlockCommentCommand], 95 | "sql job must start with job description!") 96 | checkHeader(commands(0).asInstanceOf[BlockCommentCommand]) 97 | 98 | // pull variables from apollo 99 | ApolloClient.pollVariablesFromApollo() 100 | } 101 | 102 | def checkHeader(cmd: BlockCommentCommand): Unit = { 103 | val keys = Set("author", "period", "run_env", "describe") 104 | val headerMap: Map[String, String] = 105 | cmd.comment.split('\n') 106 | .filter(_.contains(":")) 107 | .map { line => 108 | val splits = line.split(":") 109 | splits(0).trim -> splits(1).trim 110 | }.toMap 111 | 112 | val notExistsKeys = keys.filterNot(headerMap.contains(_)) 113 | assert(notExistsKeys.isEmpty, s"Header 中缺少 ${notExistsKeys.mkString(", ")} 参数!") 114 | for ((key, value) <- headerMap) { 115 | ConfigContainer :+ (key -> value) 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/JobRunner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import java.time.LocalDateTime 21 | import java.time.temporal.ChronoUnit 22 | 23 | import scala.reflect.io.File 24 | 25 | import org.apache.spark.sql.plugin.{AsyncProfilePlugin, YourkitPlugin} 26 | import org.apache.spark.sql.runner.command.SqlCommand 27 | import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer} 28 | import org.apache.spark.sql.util.SystemVariables._ 29 | import org.apache.spark.sql.util.{Logging, SystemVariables} 30 | 31 | /** 32 | * @author kun.wan, 33 | * @date 2019-12-05. 34 | */ 35 | object JobRunner extends ArgParser with Logging { 36 | def main(args: Array[String]): Unit = { 37 | parseArgument(args) 38 | logInfo(s"submit job for ${jobFile}") 39 | 40 | prepareRuntimeParameter() 41 | 42 | batchTimesOpt.getOrElse(Seq[LocalDateTime]()).map { batchTime => 43 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> batchTime) 44 | logInfo(s"submitting job(batchTime = $batchTime)") 45 | if (ConfigContainer.contains("dryrun")) { 46 | commands.foreach(_.dryrun()) 47 | } else { 48 | commands.foreach(_.run()) 49 | } 50 | } 51 | SqlCommand.stop() 52 | 53 | logInfo(s"end job") 54 | } 55 | 56 | def prepareRuntimeParameter(): Unit = { 57 | // prepare for spark mode 58 | val distJars = Seq(PROJECT_JAR_NAME).map(jar => s"lib/${jar}").mkString(",") 59 | ConfigContainer :+ ("spark.yarn.dist.jars" -> distJars) 60 | if (!ConfigContainer.contains("spark.yarn.queue")) { 61 | ConfigContainer :+ ("spark.yarn.queue" -> s"root.${File(jobFile).parent.name}") 62 | } 63 | 64 | if (ConfigContainer.getOrElse("spark.profile", "false").toBoolean) { 65 | val profileShell = "hdfs:///deploy/config/profile.sh" 66 | val yourkitAgent = "hdfs:///deploy/config/libyjpagent.so" 67 | 68 | ConfigContainer.getOrElse("spark.profile.type", "jfr") match { 69 | case "yourkit" => 70 | ConfigContainer :+ ("spark.profile.type" -> "snapshot") 71 | ConfigContainer :+ ("spark.yarn.dist.files" -> s"${profileShell},${yourkitAgent}") 72 | ConfigContainer :+ ("spark.yarn.dist.jars" -> s"${distJars},hdfs:///deploy/config/yjp-controller-api-redist.jar") 73 | ConfigContainer :+ ("spark.executor.extraJavaOptions" -> "-agentpath:libyjpagent.so=logdir=,async_sampling_cpu") 74 | ConfigContainer :+ ("spark.executor.plugins" -> classOf[YourkitPlugin].getName) 75 | 76 | case _ => 77 | ConfigContainer :+ ("spark.yarn.dist.archives" -> 78 | "hdfs:///deploy/config/async-profiler/async-profiler.zip#async-profiler") 79 | ConfigContainer :+ ("spark.yarn.dist.files" -> profileShell) 80 | ConfigContainer :+ ("spark.executor.extraLibraryPath" -> "./async-profiler/build/") 81 | ConfigContainer :+ ("spark.executor.plugins" -> classOf[AsyncProfilePlugin].getName) 82 | } 83 | } 84 | 85 | // 如果日期参数为空,时间设置为上一个执行周期 86 | if (startDate != None && endDate != None) { 87 | batchTimesOpt = ConfigContainer.get("period") match { 88 | case "minute" => 89 | val rangeSize = ChronoUnit.MINUTES.between(startDate.get, endDate.get) 90 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusMinutes(i))) 91 | case 
"hour" | "hourly" => 92 | val rangeSize = ChronoUnit.HOURS.between(startDate.get, endDate.get) 93 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusHours(i))) 94 | case "day" | "daily" => 95 | val rangeSize = ChronoUnit.DAYS.between(startDate.get, endDate.get) 96 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusDays(i))) 97 | case "month" => 98 | val rangeSize = ChronoUnit.MONTHS.between(startDate.get, endDate.get) 99 | Some(Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusMonths(i))) 100 | } 101 | } 102 | if (batchTimesOpt == None) { 103 | val defaultBatchTime = { 104 | ConfigContainer.get("period") match { 105 | case "minute" => 106 | val dt = LocalDateTime.now.minusMinutes(1) 107 | LocalDateTime.of(dt.getYear, dt.getMonth, dt.getDayOfMonth, 108 | dt.getHour, dt.getMinute, 0) 109 | case "hour" => 110 | val dt = LocalDateTime.now.minusHours(1) 111 | LocalDateTime.of(dt.getYear, dt.getMonth, dt.getDayOfMonth, dt.getHour, 0, 0) 112 | case "day" => 113 | val dt = LocalDateTime.now.minusDays(1) 114 | LocalDateTime.of(dt.getYear, dt.getMonth, dt.getDayOfMonth, 0, 0, 0) 115 | case "month" => 116 | val dt = LocalDateTime.now.minusMonths(1) 117 | LocalDateTime.of(dt.getYear, dt.getMonth, 1, 0, 0, 0) 118 | } 119 | } 120 | batchTimesOpt = Some(Seq(defaultBatchTime)) 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/ArrayValueCollector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.util.Logging 22 | import scala.collection.mutable.ArrayBuffer 23 | 24 | import org.apache.spark.sql.runner.container.CollectorContainer 25 | 26 | /** 27 | * @author kun.wan, 28 | * @date 2021-03-08. 
29 | */ 30 | case class ArrayValueCollector(name: String, columnName: String) 31 | extends DataCallBack with Logging { 32 | 33 | val array = ArrayBuffer[Any]() 34 | 35 | override def next(row: GenericRowWithSchema): Unit = { 36 | array += row.get(row.schema.fieldIndex(columnName)) 37 | } 38 | 39 | override def close(): Unit = { 40 | CollectorContainer :+ (name -> array.toArray) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/DataCallBack.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-02-20. 26 | */ 27 | trait DataCallBack { 28 | 29 | var skipEmpty = true 30 | 31 | def init(schema: StructType): Unit = {} 32 | 33 | def next(row: GenericRowWithSchema): Unit 34 | 35 | def close(): Unit 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/DataCallBackFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.util.Logging 21 | 22 | import scala.collection.mutable.ArrayBuffer 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2021-03-08. 
27 | */ 28 | object DataCallBackFactory extends Logging { 29 | 30 | val callBacks: ThreadLocal[ArrayBuffer[DataCallBack]] = 31 | new ThreadLocal[ArrayBuffer[DataCallBack]] { 32 | override def initialValue(): ArrayBuffer[DataCallBack] = ArrayBuffer[DataCallBack]() 33 | } 34 | 35 | def registerDataCallBack(dataCallBack: DataCallBack): Unit = { 36 | logInfo(s"add new data call back:\n$dataCallBack") 37 | callBacks.get() += dataCallBack 38 | } 39 | 40 | def clearDataCallBack(): Unit = callBacks.get().clear() 41 | 42 | def consumeResult(qr: QueryResult): Unit = { 43 | val iterator = qr.iterator 44 | while (iterator.hasNext) { 45 | iterator.next() 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/DataCheckCallBack.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.util.{DQUtil, Logging} 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-03-08. 26 | */ 27 | case class DataCheckCallBack(title: String, 28 | originColumn: String, 29 | checkResultColumn: String, 30 | expression: String) 31 | extends DataCallBack with Logging { 32 | 33 | override def next(row: GenericRowWithSchema): Unit = { 34 | val value: Any = row.get(row.schema.fieldIndex(originColumn)) 35 | val checkResult: Boolean = row.getAs(checkResultColumn) 36 | val messages = 37 | Seq(title, 38 | s"数据检查${if (checkResult) "正常" else "异常"}", 39 | s"检查条件: $expression", 40 | s"实际值 $value ${if (!checkResult) "不" else ""}满足条件!") 41 | 42 | logInfo(messages.mkString("\n")) 43 | } 44 | 45 | override def close(): Unit = {} 46 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/EmailSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import java.util.Properties 21 | 22 | import javax.activation.DataHandler 23 | import javax.mail.internet.{InternetAddress, MimeBodyPart, MimeMessage, MimeMultipart} 24 | import javax.mail.{Message, Session} 25 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 26 | import org.apache.spark.sql.util.ConfigUtil 27 | 28 | import scala.collection.mutable.ArrayBuffer 29 | 30 | case class EmailSink(name: String, config: Map[String, String]) extends Sink { 31 | 32 | // email邮件服务器参数 33 | val hostName = config.getOrElse( 34 | "email.hostname", 35 | throw new IllegalArgumentException("config email.hostname is needed.") 36 | ) 37 | val userName = config.getOrElse( 38 | "email.username", 39 | throw new IllegalArgumentException("config email.username is needed.") 40 | ) 41 | val password = config.getOrElse( 42 | "email.password", 43 | throw new IllegalArgumentException("config email.password is needed.") 44 | ) 45 | val from = config.getOrElse( 46 | "email.from", 47 | throw new IllegalArgumentException("config email.from is needed.") 48 | ) 49 | 50 | // email内容构建参数 51 | val names = ConfigUtil.trimConfigArray( 52 | config.getOrElse( 53 | s"$name.columns", 54 | throw new IllegalArgumentException(s"config $name.columns is needed.") 55 | ), 56 | "," 57 | ) 58 | val columnNames = ConfigUtil.trimConfigArray( 59 | config.getOrElse( 60 | s"$name.columnNames", 61 | throw new IllegalArgumentException(s"config $name.columnNames is needed.") 62 | ), 63 | "," 64 | ) 65 | val to = config.getOrElse( 66 | s"$name.email-to", 67 | throw new IllegalArgumentException(s"config $name.email-to is needed.") 68 | ) 69 | 70 | val cc = config.getOrElse(s"$name.email-cc", "") 71 | 72 | val emailPattern = EmailSink.generateTitle(names) 73 | val emailColumnName = EmailSink.generateTitle(columnNames) 74 | val emailTemplate = config.getOrElse( 75 | s"$name.email-template", 76 | s"""%s
""" 77 | ) 78 | val csvPattern = columnNames 79 | val subject = envName + "环境:" + config.getOrElse(s"$name.subject", "no subject") 80 | val attachedFileName = config.getOrElse("email-attach-filename", subject) 81 | 82 | val emailContent = new ArrayBuffer[String]() 83 | val csvContentBuffer = new ArrayBuffer[String]() 84 | emailContent.append(emailColumnName) 85 | csvContentBuffer.append(columnNames) 86 | 87 | var i = 0 88 | 89 | override def next(row: GenericRowWithSchema): Unit = { 90 | if (i < rowLimit) { 91 | emailContent.append(parsePattern(emailPattern, row)) 92 | i = i + 1 93 | } 94 | csvContentBuffer.append(parsePattern(names, row)) 95 | 96 | } 97 | 98 | override def close(): Unit = { 99 | val htmlContent = emailTemplate.format(emailContent.mkString("\n")) 100 | val csvContent = csvContentBuffer.mkString("\n") 101 | 102 | // 邮件发送 103 | val properties = new Properties() 104 | properties.put("mail.transport.protocol", "smtp") 105 | properties.put("mail.smtp.host", hostName) 106 | properties.put("mail.smtp.port", "465") 107 | properties.put( 108 | "mail.smtp.socketFactory.class", 109 | "javax.net.ssl.SSLSocketFactory" 110 | ) 111 | properties.put("mail.smtp.auth", "true") 112 | properties.put("mail.smtp.ssl.enable", "true") 113 | 114 | val session = Session.getInstance(properties) 115 | val message = new MimeMessage(session) 116 | message.setFrom(new InternetAddress(from, userName)) 117 | message.addRecipients(Message.RecipientType.TO, to) 118 | message.addRecipients(Message.RecipientType.CC, cc) 119 | message.setSubject(subject) 120 | val multipart = new MimeMultipart() 121 | val contentPart = new MimeBodyPart() 122 | contentPart.setContent(htmlContent, "text/html;charset=UTF-8") 123 | multipart.addBodyPart(contentPart) 124 | val mdp = new MimeBodyPart() 125 | val dh = new DataHandler( 126 | new String(Array[Byte](0xEF.toByte, 0xBB.toByte, 0xBF.toByte)) + csvContent, 127 | "text/plain;charset=UTF-8" 128 | ) 129 | mdp.setFileName(attachedFileName + ".csv") 130 | mdp.setDataHandler(dh) 131 | multipart.addBodyPart(mdp) 132 | message.setContent(multipart) 133 | val transport = session.getTransport 134 | transport.connect(from, password) 135 | transport.sendMessage(message, message.getAllRecipients) 136 | transport.close 137 | logInfo(s"Email sink finished") 138 | } 139 | 140 | override def toString: String = { 141 | s"EmailSink(name = $name, from = $from, to = $to, cc = $cc, " + 142 | s"names = $names, columnNames = $columnNames)" 143 | } 144 | 145 | } 146 | 147 | object EmailSink { 148 | def generateTitle(columnName: String): String = { 149 | val columnTitle = columnName.split(",") 150 | .map(col => s"${ConfigUtil.trimConfigValue(col)}") 151 | .mkString 152 | 153 | s"${columnTitle}" 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/QueryResult.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2019-12-05. 26 | */ 27 | case class QueryResult(schema: StructType, iterator: Iterator[GenericRowWithSchema]) 28 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/SingleValueCollector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.util.Logging 22 | import org.apache.spark.sql.runner.container.CollectorContainer 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2021-03-08. 27 | */ 28 | case class SingleValueCollector(name: String, columnName: String) 29 | extends DataCallBack with Logging { 30 | 31 | var value: Any = _ 32 | 33 | override def next(row: GenericRowWithSchema): Unit = { 34 | value = row.get(row.schema.fieldIndex(columnName)) 35 | } 36 | 37 | override def close(): Unit = { 38 | CollectorContainer :+ (name -> value) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/callback/Sink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
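The EmailSink above is driven entirely by a flat key/value config: global SMTP settings plus keys scoped by the sink's name. A hedged sketch of the minimum configuration it expects — every host, address and credential is a placeholder, and close() (which actually sends the mail) is deliberately not called:

```scala
import org.apache.spark.sql.runner.callback.EmailSink

object EmailSinkConfigExample {
  def main(args: Array[String]): Unit = {
    val config = Map(
      // global SMTP settings (placeholders)
      "email.hostname" -> "smtp.example.com",
      "email.username" -> "reporter",
      "email.password" -> "******",
      "email.from"     -> "reporter@example.com",
      // keys scoped by the sink name, "daily_report" here
      "daily_report.columns"     -> "dt, order_cnt",
      "daily_report.columnNames" -> "Date, Orders",
      "daily_report.email-to"    -> "team@example.com",
      "daily_report.subject"     -> "daily trade report",
      // optional cap on how many rows end up in the mail body
      "rowLimit" -> "100")

    val sink = EmailSink("daily_report", config)
    println(sink)   // EmailSink(name = daily_report, from = reporter@example.com, ...)
  }
}
```

SingleValueCollector is the bridge between a query result and the thread-local CollectorContainer: the last value it sees for a column is published under a chosen name when the collector is closed. A small sketch with a hand-built row:

```scala
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.runner.callback.SingleValueCollector
import org.apache.spark.sql.runner.container.CollectorContainer
import org.apache.spark.sql.types.{LongType, StructField, StructType}

object SingleValueCollectorExample {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("c", LongType)))
    val row = new GenericRowWithSchema(Array[Any](1L), schema)

    // Collect the value of column "c" under the name "row_count".
    val collector = SingleValueCollector(name = "row_count", columnName = "c")
    collector.next(row)
    collector.close()   // publishes the value into CollectorContainer

    println(CollectorContainer.get("row_count"))   // 1
  }
}
```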
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.callback 19 | 20 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 21 | import org.apache.spark.sql.runner.metrics.ReporterTrait 22 | import org.apache.spark.sql.util.{Logging, SystemVariables} 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2019-12-12. 27 | */ 28 | trait Sink extends DataCallBack with ReporterTrait with Logging { 29 | 30 | val config: Map[String, String] 31 | 32 | val envName = config.getOrElse(SystemVariables.ENV, "UNKNOWN") 33 | 34 | var resultRows: Long = 0 35 | 36 | val defaultRowLimit: String = "1000" 37 | 38 | val rowLimit: Int = config.getOrElse("rowLimit", defaultRowLimit).toInt 39 | 40 | def parsePattern(pattern: String, row: GenericRowWithSchema): String = { 41 | val sb = new StringBuilder 42 | var startIdx = -1 43 | for ((c, idx) <- pattern.zipWithIndex) { 44 | if (c == '{' && startIdx < 0) { 45 | startIdx = idx 46 | } else if (c == '}' && startIdx >= 0) { 47 | val variableName = pattern.substring(startIdx + 1, idx) 48 | val fieldValue: AnyRef = row.getAs(variableName) 49 | sb.append(fieldValue) 50 | startIdx = -1 51 | } else if (startIdx < 0) { 52 | sb.append(c) 53 | } 54 | } 55 | 56 | sb.toString 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/BaseCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.util.{Logging, StringUtil} 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2021-02-23. 
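Sink.parsePattern is the small templating helper shared by all sinks: every `{column}` placeholder is replaced with that column's value from the row. The sketch below assumes, as the concrete callbacks in this package suggest, that DataCallBack only requires next and close; the ConsoleSink class and its columns are invented for illustration, and config is passed as a constructor parameter (as EmailSink does) so the trait's envName/rowLimit fields can initialize from it.

```scala
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.runner.callback.Sink
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object ParsePatternExample {

  // Minimal concrete sink used only to exercise parsePattern.
  case class ConsoleSink(config: Map[String, String]) extends Sink {
    override def next(row: GenericRowWithSchema): Unit =
      println(parsePattern("dt={dt}, orders={order_cnt}", row))
    override def close(): Unit = ()
  }

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("dt", StringType),
      StructField("order_cnt", LongType)))
    val row = new GenericRowWithSchema(Array[Any]("20210308", 128L), schema)

    // Every {column} placeholder is replaced by that column's value in the row.
    ConsoleSink(Map.empty).next(row)   // dt=20210308, orders=128
  }
}
```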
25 | */ 26 | abstract class BaseCommand(sourceChars: SourceChars) extends Logging { 27 | 28 | val escapeMapping: Map[Array[Char], Array[Char]] = Map( 29 | Array('\"') -> Array('\"'), 30 | Array(''') -> Array('''), 31 | Array('(') -> Array(')'), 32 | ) 33 | 34 | val chars = sourceChars.chars 35 | 36 | def readTo(char: Char): (String, Int, Int) = readTo(Array(char)) 37 | 38 | def readTo(target: String): (String, Int, Int) = readTo(target.toCharArray) 39 | 40 | private def readTo(target: Array[Char]): (String, Int, Int) = { 41 | val len = target.length 42 | var index = -1 43 | var i = sourceChars.start 44 | while (i < sourceChars.end && index < 0) { 45 | // deal with escape char array 46 | for ((startChars, endChars) <- escapeMapping if startChars.intersect(target).size == 0) { 47 | val slen = startChars.length 48 | if (i > slen && chars(i - slen) != '\\') { 49 | if (chars.slice(i - slen + 1, i + 1) sameElements startChars) { 50 | val elen = endChars.length 51 | i = i + elen 52 | while (i < sourceChars.end && (chars(i - elen) == '\\' || 53 | !(chars.slice(i - elen + 1, i + 1) sameElements endChars))) { 54 | i = i + 1 55 | } 56 | } 57 | } 58 | } 59 | 60 | if (chars.slice(i - len + 1, i + 1) sameElements target) { 61 | index = i + 1 - len 62 | } else { 63 | i = i + 1 64 | } 65 | } 66 | assert(index >= 0, s"Parse Job Error!\n${new String(chars.slice(sourceChars.start, sourceChars.end))}") 67 | val res = 68 | StringUtil.escapeStringValue(new String(chars.slice(sourceChars.start, index))) 69 | val nextStart = i + 1 70 | (res, index, nextStart) 71 | } 72 | 73 | def run(): Unit = { 74 | throw new Exception("Unsupport Command!") 75 | } 76 | 77 | def dryrun(): Unit = run() 78 | } 79 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/BlockCommentCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 
23 | */ 24 | case class BlockCommentCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.blockCommentPrefix.length 32 | 33 | val (comment, _, nextStart) = readTo("*/") 34 | sourceChars.start = nextStart 35 | 36 | override def toString: String = s"/**${comment}*/" 37 | 38 | override def run(): Unit = { 39 | logInfo(s"\n${this.toString}") 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/CommandFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2021-02-24. 
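Each command consumes its own span of the source and advances sourceChars.start so the factory can continue with whatever follows. A sketch using BlockCommentCommand's string constructor, assuming StringUtil.escapeStringValue leaves plain comment text untouched:

```scala
import org.apache.spark.sql.runner.command.BlockCommentCommand

object BlockCommentExample {
  def main(args: Array[String]): Unit = {
    val source = "/** job header: author, period, run_env */ SELECT 1;"

    // The auxiliary constructor wraps the string in SourceChars(chars, 0, length);
    // parsing happens in the constructor, which reads up to the closing "*/".
    val cmd = new BlockCommentCommand(source)

    cmd.run()                       // logs the comment text
    println(cmd)                    // /** job header: author, period, run_env */
    println(cmd.sourceChars.start)  // index just past "*/", where the SQL begins
  }
}
```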
25 | */ 26 | object CommandFactory { 27 | val sqlPrefix = "" 28 | val lineCommentPrefix = "--" 29 | val blockCommentPrefix = "/**" 30 | val setPrefix = "!set" 31 | 32 | val ifPrefix = "!if" 33 | val elsePrefix = "!else" 34 | val fiPrefix = "!fi" 35 | 36 | def skipEmptyChars(sourceChars: SourceChars): Unit = { 37 | while (sourceChars.start < sourceChars.chars.length && 38 | Character.isWhitespace(sourceChars.chars.charAt(sourceChars.start))) { 39 | sourceChars.start = sourceChars.start + 1 40 | } 41 | } 42 | 43 | /** 44 | * 使用探测法,找到下一条Command 45 | * @param sourceChars 46 | */ 47 | def nextCommand(sourceChars: SourceChars): BaseCommand = { 48 | skipEmptyChars(sourceChars) 49 | val commandPrefix: Option[String] = 50 | Seq( 51 | lineCommentPrefix, 52 | blockCommentPrefix, 53 | setPrefix, 54 | ifPrefix, 55 | elsePrefix, 56 | fiPrefix 57 | ) find { prefix => 58 | val len = prefix.length 59 | if (sourceChars.start + len >= sourceChars.end) { 60 | false 61 | } 62 | else { 63 | prefix.equalsIgnoreCase(new String(sourceChars.chars, sourceChars.start, len)) 64 | } 65 | } 66 | 67 | val cmd = 68 | commandPrefix match { 69 | case Some(prefix) if prefix == lineCommentPrefix => LineCommentCommand(sourceChars) 70 | case Some(prefix) if prefix == blockCommentPrefix => BlockCommentCommand(sourceChars) 71 | case Some(prefix) if prefix == setPrefix => SetCommand(sourceChars) 72 | case Some(prefix) if prefix == ifPrefix => IfCommand(sourceChars) 73 | case Some(prefix) if prefix == elsePrefix => ElseCommand(sourceChars) 74 | case Some(prefix) if prefix == fiPrefix => FiCommand(sourceChars) 75 | case None => SqlCommand(sourceChars) 76 | } 77 | skipEmptyChars(sourceChars) 78 | cmd 79 | } 80 | 81 | def parseCommands(source: String): Array[BaseCommand] = { 82 | val commands = ArrayBuffer[BaseCommand]() 83 | val sourceChars = SourceChars(source.toCharArray, 0, source.length) 84 | 85 | while (sourceChars.start < source.length) { 86 | val command = nextCommand(sourceChars) 87 | commands += command 88 | } 89 | commands.toArray 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/ElseCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 
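CommandFactory probes the next non-blank characters against the known prefixes and falls back to SqlCommand when nothing matches, so a whole job file can be split into commands in one pass. A minimal sketch (parsing only; none of the commands are run here):

```scala
import org.apache.spark.sql.runner.command.CommandFactory

object ParseCommandsExample {
  def main(args: Array[String]): Unit = {
    val script =
      """-- daily job
        |!set biz_date = 20210308;
        |SELECT 1;
        |""".stripMargin

    // Expected shape: LineCommentCommand, SetCommand, SqlCommand.
    val commands = CommandFactory.parseCommands(script)
    commands.foreach(cmd => println(s"${cmd.getClass.getSimpleName}: $cmd"))
  }
}
```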
23 | */ 24 | case class ElseCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.elsePrefix.length 32 | } 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/FiCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | case class FiCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.fiPrefix.length 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/IfCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 21 | import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Cast, Literal} 22 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 23 | import org.apache.spark.sql.types.DataType 24 | import org.apache.sql.runner.container.ConfigContainer 25 | import scala.collection.mutable 26 | import scala.collection.mutable.ArrayBuffer 27 | 28 | import org.apache.spark.sql.runner.config.VariableSubstitution 29 | import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer} 30 | 31 | /** 32 | * @author kun.wan, 33 | * @date 2021-02-24. 34 | */ 35 | case class IfCommand(sourceChars: SourceChars) 36 | extends BaseCommand(sourceChars) { 37 | 38 | def this(sourceString: String) { 39 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 40 | } 41 | 42 | sourceChars.start = sourceChars.start + CommandFactory.ifPrefix.length 43 | 44 | val (_, _, nextStart1) = readTo("(") 45 | sourceChars.start = nextStart1 46 | val (ifConditionString, _, nextStart2) = readTo(")") 47 | sourceChars.start = nextStart2 48 | 49 | val ifCommands = new ArrayBuffer[BaseCommand]() 50 | val elseCommands = new ArrayBuffer[BaseCommand]() 51 | 52 | var parseStage = "if" 53 | while (parseStage != "fi") { 54 | val cmd = CommandFactory.nextCommand(sourceChars) 55 | cmd match { 56 | case _: FiCommand => 57 | parseStage = "fi" 58 | 59 | case _: ElseCommand => 60 | parseStage = "else" 61 | 62 | case _ => 63 | parseStage match { 64 | case "if" => 65 | ifCommands += cmd 66 | case "else" => 67 | elseCommands += cmd 68 | } 69 | } 70 | } 71 | 72 | override def toString: String = { 73 | val elseString = 74 | if (elseCommands.size > 0) { 75 | s"""\n!else 76 | |${elseCommands.mkString("\n")} 77 | |""".stripMargin 78 | } else { 79 | "" 80 | } 81 | 82 | s"""!if ($ifConditionString) 83 | |${ifCommands.mkString("\n") + elseString} 84 | |!fi 85 | |""".stripMargin 86 | 87 | } 88 | 89 | override def run(): Unit = { 90 | doRun(isDryRun = false) 91 | } 92 | 93 | override def dryrun(): Unit = { 94 | doRun(isDryRun = true) 95 | } 96 | 97 | def doRun(isDryRun: Boolean): Unit = { 98 | VariableSubstitution.withSubstitution { substitution => 99 | val dataTypeMap = mutable.Map[String, DataType]() 100 | 101 | val originExpr = CatalystSqlParser.parseExpression(substitution.substitute(ifConditionString)) 102 | 103 | var lastMapSize = -1 104 | while (lastMapSize != dataTypeMap.size) { 105 | lastMapSize = dataTypeMap.size 106 | originExpr transform { 107 | case expr: BinaryExpression => 108 | (expr.left, expr.right) match { 109 | case (attr: UnresolvedAttribute, literal: Literal) => 110 | dataTypeMap += (attr.name -> literal.dataType) 111 | 112 | case (literal: Literal, attr: UnresolvedAttribute) => 113 | dataTypeMap += (attr.name -> literal.dataType) 114 | 115 | case (attr1: UnresolvedAttribute, attr2: UnresolvedAttribute) => 116 | if (dataTypeMap.contains(attr1.name)) { 117 | dataTypeMap += (attr2.name -> dataTypeMap(attr1.name)) 118 | } 119 | if (dataTypeMap.contains(attr2.name)) { 120 | dataTypeMap += (attr1.name -> dataTypeMap(attr2.name)) 121 | } 122 | 123 | case (_, _) => 124 | } 125 | expr 126 | 127 | case e => e 128 | } 129 | } 130 | 131 | val ifCondition = 132 | originExpr transform { 133 | case e: UnresolvedAttribute => 134 | val dataType = dataTypeMap(e.name) 135 | val literal = Literal(CollectorContainer.getOrElse(e.name, ConfigContainer.get(e.name))) 136 | if 
(dataType == literal.dataType) { 137 | literal 138 | } else { 139 | Cast(literal, dataType) 140 | } 141 | 142 | case e => e 143 | } 144 | 145 | val ret = ifCondition.eval().asInstanceOf[Boolean] 146 | if (ret) { 147 | ifCommands.foreach(cmd => if (isDryRun) cmd.run() else cmd.dryrun()) 148 | } else { 149 | elseCommands.foreach(cmd => if (isDryRun) cmd.run() else cmd.dryrun()) 150 | } 151 | } 152 | 153 | } 154 | } 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/LineCommentCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | case class LineCommentCommand(sourceChars: SourceChars) 25 | extends BaseCommand(sourceChars) { 26 | 27 | def this(sourceString: String) { 28 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 29 | } 30 | 31 | sourceChars.start = sourceChars.start + CommandFactory.lineCommentPrefix.length 32 | 33 | val (comment, _, nextStart) = readTo('\n') 34 | sourceChars.start = nextStart 35 | 36 | override def toString: String = s"${CommandFactory.lineCommentPrefix} ${comment}" 37 | 38 | override def run(): Unit = { 39 | logInfo(s"\n${this.toString}") 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/SetCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
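IfCommand does all of its parsing in the constructor: it reads the parenthesised condition, then keeps collecting commands into the if branch (and, after !else, the else branch) until it meets !fi. The sketch below only parses; evaluating run()/dryrun() additionally needs the variable-substitution plumbing and values previously published to CollectorContainer or ConfigContainer.

```scala
import org.apache.spark.sql.runner.command.IfCommand

object IfCommandExample {
  def main(args: Array[String]): Unit = {
    val cmd = new IfCommand(
      """!if (row_count = 1)
        |  select 'row count is 1';
        |!else
        |  select 'row count is not 1';
        |!fi
        |""".stripMargin)

    // toString reassembles the parsed condition and both branches.
    println(cmd)
  }
}
```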
16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.runner.config.VariableSubstitution 21 | import org.apache.spark.sql.runner.container.ConfigContainer 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-02-24. 26 | */ 27 | case class SetCommand(sourceChars: SourceChars) extends BaseCommand(sourceChars) { 28 | 29 | def this(sourceString: String) { 30 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 31 | } 32 | 33 | sourceChars.start = sourceChars.start + CommandFactory.setPrefix.length 34 | 35 | val (key, _, valueStart) = readTo('=') 36 | sourceChars.start = valueStart 37 | 38 | val (value, _, nextStart) = readTo(';') 39 | sourceChars.start = nextStart 40 | 41 | override def toString: String = s"${CommandFactory.setPrefix} $key = $value;" 42 | 43 | override def run(): Unit = { 44 | val substitutionValue = 45 | VariableSubstitution.withSubstitution { substitution => 46 | substitution.substitute(value) 47 | } 48 | 49 | ConfigContainer :+ (key -> substitutionValue) 50 | logInfo(s"\n${CommandFactory.setPrefix} $key = $substitutionValue;") 51 | } 52 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/SourceChars.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | case class SourceChars(chars: Array[Char], var start: Int, var end: Int) 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/command/SqlCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
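SetCommand splits `!set key = value;` at '=' and ';' during construction; only run() performs variable substitution on the value and publishes it to ConfigContainer. A small parse-only sketch (quote handling is delegated to StringUtil.escapeStringValue, which lives outside this section):

```scala
import org.apache.spark.sql.runner.command.SetCommand

object SetCommandExample {
  def main(args: Array[String]): Unit = {
    // Parsed but not run, so nothing is written to ConfigContainer here.
    val cmd = new SetCommand("""!set user = "kun.wan";""")
    println(cmd)   // !set <parsed key> = <parsed value>;
  }
}
```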
16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.hive.SparkSqlRunner 22 | import org.apache.spark.sql.util.{Logging, SystemVariables} 23 | import scala.collection.JavaConverters._ 24 | 25 | import org.apache.spark.sql.runner.callback.DataCallBackFactory 26 | import org.apache.spark.sql.runner.config.VariableSubstitution 27 | import org.apache.spark.sql.runner.container.ConfigContainer 28 | 29 | /** 30 | * @author kun.wan, 31 | * @date 2021-02-24. 32 | */ 33 | case class SqlCommand(sourceChars: SourceChars) 34 | extends BaseCommand(sourceChars) { 35 | 36 | def this(sourceString: String) { 37 | this(SourceChars(sourceString.toCharArray, 0, sourceString.length)) 38 | } 39 | 40 | sourceChars.start = sourceChars.start + CommandFactory.sqlPrefix.length 41 | 42 | val (sql, _, nextStart) = readTo(";") 43 | sourceChars.start = nextStart 44 | 45 | override def toString: String = s"$sql;" 46 | 47 | override def run(): Unit = { 48 | doRun(isDryRun = false) 49 | } 50 | 51 | override def dryrun(): Unit = { 52 | doRun(isDryRun = true) 53 | } 54 | 55 | def doRun(isDryRun: Boolean): Unit = { 56 | VariableSubstitution.withSubstitution { substitution => 57 | // 这里需要注意参数的还原 58 | val sqlText = substitution.substitute(sql) 59 | logInfo(s"sql content:\n$sqlText") 60 | if (!isDryRun) { 61 | DataCallBackFactory.consumeResult(SqlCommand.sparkSqlRunner.run(sqlText)) 62 | } 63 | } 64 | } 65 | } 66 | 67 | object SqlCommand extends Logging { 68 | 69 | implicit lazy val sparkSession: SparkSession = 70 | SparkSqlRunner.sparkSession( 71 | Some(ConfigContainer.getOrElse(SystemVariables.JOB_NAME, "Unknown Job Name"))) 72 | 73 | lazy val sparkSqlRunner = new SparkSqlRunner 74 | 75 | // val catalogEventListener = InsightCatalogEventListener() 76 | var sqlContext = sparkSession.sqlContext 77 | 78 | // SparkSession.active.sharedState.externalCatalog.addListener(catalogEventListener) 79 | 80 | /** Cleans up and shuts down the Spark SQL environments. */ 81 | def stop() { 82 | logDebug("Clear SparkSession and SparkContext") 83 | // TODO 84 | // catalogEventListener.stop() 85 | if (sqlContext != null) { 86 | sqlContext = null 87 | } 88 | if (sparkSession != null) { 89 | sparkSession.stop() 90 | } 91 | SparkSession.clearActiveSession 92 | 93 | val clazz = Class.forName("java.lang.ApplicationShutdownHooks") 94 | val field = clazz.getDeclaredField("hooks") 95 | field.setAccessible(true) 96 | val inheritableThreadLocalsField = classOf[Thread].getDeclaredField("inheritableThreadLocals") 97 | inheritableThreadLocalsField.setAccessible(true) 98 | 99 | val hooks = field.get(clazz).asInstanceOf[java.util.IdentityHashMap[Thread, Thread]].asScala 100 | hooks.keys.map(inheritableThreadLocalsField.set(_, null)) 101 | } 102 | 103 | def simpleTypeName(typeName: String): String = { 104 | val i = typeName.indexOf("(") 105 | if (i > 0) { 106 | typeName.substring(0, i) 107 | } else { 108 | typeName 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/config/ApolloClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.config 19 | 20 | import com.ctrip.framework.apollo.{Config, ConfigService} 21 | import org.apache.commons.lang3.StringUtils 22 | 23 | import org.apache.spark.sql.util.{Logging, SystemVariables} 24 | import scala.collection.JavaConverters._ 25 | 26 | import org.apache.spark.sql.runner.container.ConfigContainer 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2020-03-04. 31 | */ 32 | case class ApolloClient(namespace: String) extends Logging { 33 | 34 | lazy val config: Config = ConfigService.getConfig(namespace) 35 | 36 | def getProperty(key: String, defaultValue: String): String = { 37 | config.getProperty(key, defaultValue) 38 | } 39 | } 40 | 41 | object ApolloClient extends Logging { 42 | 43 | /** 44 | * 去Apollo 获取参数太慢了 45 | * 46 | * @return 47 | */ 48 | def pollVariablesFromApollo(): Unit = { 49 | if (StringUtils.isNotBlank(System.getenv(SystemVariables.APOLLO_META))) { 50 | val appId = 51 | ConfigContainer.getOrElse("apollo.app.id", 52 | ConfigContainer.getOrElse("appId", 53 | SystemVariables.DEFAULT_APOLLO_ID)) 54 | System.setProperty("app.id", appId) 55 | 56 | val systemClient = ApolloClient("1.above-board") 57 | 58 | systemClient.config.getPropertyNames 59 | .toArray.map { case key: String => 60 | val value = systemClient.getProperty(key, "") 61 | val encryptedValue = if (key.toLowerCase.contains("password")) "******" else value 62 | logInfo(s"pull variable from apollo, $key = $encryptedValue)") 63 | ConfigContainer :+ (key -> value) 64 | } 65 | 66 | if (ConfigContainer.contains("apollo.namespace")) { 67 | val appClient = ApolloClient(ConfigContainer.get("apollo.namespace")) 68 | appClient.config.getPropertyNames.asScala.map { case key: String => 69 | val value = appClient.getProperty(key, "") 70 | val encryptedValue = if (key.toLowerCase.contains("password")) "******" else value 71 | logInfo(s"pull variable from apollo, $key = $encryptedValue") 72 | ConfigContainer :+ (key -> value) 73 | } 74 | } 75 | } 76 | } 77 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/container/CollectorContainer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
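ApolloClient is consulted only when the APOLLO_META environment variable is set, so the bootstrap below is a safe no-op on a developer machine; with a reachable Apollo meta server it copies every key of the shared `1.above-board` namespace (and, optionally, a team namespace) into ConfigContainer, masking values whose key contains "password" in the logs. The app id and namespace below are placeholders.

```scala
import org.apache.spark.sql.runner.config.ApolloClient
import org.apache.spark.sql.runner.container.ConfigContainer

object ApolloBootstrapExample {
  def main(args: Array[String]): Unit = {
    ConfigContainer :+ ("apollo.app.id" -> "sql-runner")        // placeholder app id
    ConfigContainer :+ ("apollo.namespace" -> "my-team.jobs")   // optional, placeholder

    ApolloClient.pollVariablesFromApollo()

    // Remote keys (if any were pulled) are now visible through ConfigContainer.
    println(ConfigContainer.getOption("some.remote.key"))       // hypothetical key
  }
}
```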
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.container 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-03-08. 23 | */ 24 | object CollectorContainer extends ContainerTrait[String, Any] 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/container/ConfigContainer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.container 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2020-03-06. 23 | */ 24 | object ConfigContainer extends ContainerTrait[String, String] 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/container/ContainerTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.container 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-03-08. 23 | */ 24 | class ContainerTrait[A, B] { 25 | 26 | /** 27 | * 这里设计为 ThreadLocal 变量,用于支持多线程运行多job时,维护各自的配置信息. 
28 | * 其他线程如果要维护自己的配置信息,从valueMap拷贝出去进行自己维护 29 | */ 30 | val valueMap = 31 | new InheritableThreadLocal[Map[A, B]]() { 32 | override def initialValue(): Map[A, B] = Map[A, B]() 33 | } 34 | 35 | /** 36 | * 原有map和新的map合并,如果key冲突,保留新的map值 37 | * 38 | * @param map 39 | */ 40 | def ++(map: Map[A, B]): Unit = { 41 | valueMap.set(valueMap.get() ++ map) 42 | } 43 | 44 | /** 45 | * 向map中加入新值,如果key已经存在,使用新值覆盖 46 | * @param kv 47 | */ 48 | def :+(kv: (A, B)): Unit = { 49 | valueMap.set(valueMap.get() + kv) 50 | } 51 | 52 | def getOrElse(key: A, default: => B): B = valueMap.get().getOrElse(key, default) 53 | 54 | def get(key: A): B = valueMap.get()(key) 55 | 56 | def getOption(key: A): Option[B] = valueMap.get().get(key) 57 | 58 | def contains(key: A): Boolean = valueMap.get().contains(key) 59 | 60 | def -(key: A): Unit = { 61 | if (valueMap.get().contains(key)) { 62 | valueMap.set(valueMap.get() - key) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/metrics/GraphiteReporter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.metrics 19 | 20 | import java.io.PrintWriter 21 | import java.net.Socket 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-02-26. 26 | */ 27 | case class GraphiteReporter(host: String, port: Int) extends AutoCloseable with Serializable { 28 | 29 | @transient val socket: Socket = new Socket(host, port) 30 | @transient val out: PrintWriter = new PrintWriter(socket.getOutputStream, true) 31 | 32 | def reportMetrics(key: String, value: Number): Unit = { 33 | val timestamp = System.currentTimeMillis() / 1000 34 | out.printf(s"${key} ${value} ${timestamp}%n") 35 | } 36 | 37 | override def close(): Unit = { 38 | if (out != null) { 39 | out.close() 40 | } 41 | if (socket != null) { 42 | socket.close() 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/runner/metrics/ReporterTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
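ConfigContainer and CollectorContainer are just two instances of ContainerTrait: immutable maps held in an InheritableThreadLocal, so concurrent jobs in one JVM keep separate configuration and collected values. A quick tour of the operators (keys and values below are arbitrary):

```scala
import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer}

object ContainerExample {
  def main(args: Array[String]): Unit = {
    ConfigContainer :+ ("env" -> "PRD")
    ConfigContainer ++ Map(
      "graphite.host" -> "graphite.example.com",   // placeholder host
      "graphite.port" -> "2003")

    println(ConfigContainer.get("env"))                   // PRD
    println(ConfigContainer.getOrElse("missing", "n/a"))  // n/a
    ConfigContainer - "env"
    println(ConfigContainer.contains("env"))              // false

    // CollectorContainer carries Any-typed values collected from query results.
    CollectorContainer :+ ("row_count" -> 42L)
    println(CollectorContainer.get("row_count"))          // 42
  }
}
```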
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.metrics 19 | 20 | import org.apache.spark.sql.runner.container.ConfigContainer 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-02-26. 25 | */ 26 | trait ReporterTrait { 27 | 28 | lazy val reporter: Option[GraphiteReporter] = { 29 | val enableMetrics = ConfigContainer.getOrElse("metrics.enable", "true").toBoolean 30 | if (enableMetrics && ConfigContainer.contains("graphite.host")) { 31 | val graphiteHost = ConfigContainer.get("graphite.host") 32 | val graphitePort = ConfigContainer.getOrElse("graphite.port", "2003").toInt 33 | Some(GraphiteReporter(graphiteHost, graphitePort)) 34 | } else { 35 | None 36 | } 37 | } 38 | 39 | def reportMetrics(key: String, value: Number): Unit = 40 | reporter.map(_.reportMetrics(key, value)) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/udf/DateFormatUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.udf 19 | 20 | import java.time.format.DateTimeFormatter 21 | import java.time.format.DateTimeFormatter._ 22 | 23 | import org.sparkproject.guava.cache.CacheLoader 24 | import org.sparkproject.guava.cache.CacheBuilder 25 | 26 | /** 27 | * @author kun.wan, 28 | * @date 2020-07-20. 
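ReporterTrait gives any component an optional Graphite reporter that is created lazily, and only when metrics are enabled and graphite.host is configured. The sketch keeps metrics disabled so it runs without a Graphite endpoint; the metric key is made up.

```scala
import org.apache.spark.sql.runner.container.ConfigContainer
import org.apache.spark.sql.runner.metrics.ReporterTrait

object MetricsExample {

  // Any component can mix in ReporterTrait to get an optional Graphite reporter.
  class JobStats extends ReporterTrait

  def main(args: Array[String]): Unit = {
    // With metrics disabled (or no graphite.host configured) reportMetrics is a no-op.
    ConfigContainer :+ ("metrics.enable" -> "false")

    val stats = new JobStats
    stats.reportMetrics("sql_runner.job.rows", Long.box(128L))

    // To actually emit metrics, set metrics.enable = true and graphite.host to a
    // reachable Graphite instance (graphite.port defaults to 2003); each call then
    // writes a "<key> <value> <timestamp>" plaintext line.
  }
}
```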
29 | */ 30 | object DateFormatUDF { 31 | 32 | lazy val cache = CacheBuilder.newBuilder() 33 | .maximumSize(100) 34 | .build(new CacheLoader[String, DateTimeFormatter] { 35 | override def load(key: String): DateTimeFormatter = ofPattern(key) 36 | }) 37 | 38 | implicit def toFormatter(pattern: String): DateTimeFormatter = cache.get(pattern) 39 | 40 | // function name : transform_date 41 | val transform_date_udf: (String, String, String) => String = { 42 | (dt: String, srcPattern: String, dstPattern: String) => 43 | toFormatter(dstPattern).format(srcPattern.parse(dt)) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/udf/UDFFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.udf 19 | 20 | import java.lang.annotation.Annotation 21 | 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.runner.container.ConfigContainer 24 | import org.apache.spark.sql.types.DataType 25 | import org.apache.spark.sql.util.Logging 26 | 27 | /** 28 | * @author kun.wan, 29 | * @date 2020-07-20. 
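transform_date is a plain Scala function before it is ever registered with Spark, so it can be exercised directly; the formatter for each pattern string is created once and cached. For example:

```scala
import org.apache.spark.sql.udf.DateFormatUDF

object TransformDateExample {
  def main(args: Array[String]): Unit = {
    val transform = DateFormatUDF.transform_date_udf
    // Re-renders a date string from a source pattern into a target pattern.
    println(transform("20210308", "yyyyMMdd", "yyyy-MM-dd"))   // 2021-03-08
  }
}
```

Once UDFFactory has registered it, the same function is available in SQL as `transform_date(dt, 'yyyyMMdd', 'yyyy-MM-dd')`.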
30 | */ 31 | object UDFFactory extends Logging { 32 | 33 | val EXTERNAL_UDFS = "spark.sql.externalUdfClasses" 34 | 35 | def registerExternalUDFs(spark: SparkSession): Unit = { 36 | spark.udf.register("transform_date", DateFormatUDF.transform_date_udf) 37 | 38 | ConfigContainer.getOption(EXTERNAL_UDFS).map { 39 | case udfClasses: String => 40 | spark.sessionState.resourceLoader.addJar("hdfs:///deploy/config/biz-udfs-1.0.jar") 41 | 42 | val annotationClazz = 43 | Class.forName("org.apachetech.udfs.annotations.UDFDescription", 44 | true, 45 | spark.sharedState.jarClassLoader) 46 | .asInstanceOf[Class[_ <: Annotation]] 47 | val nameMethod = annotationClazz.getMethod("name") 48 | val returnTypeMethod = annotationClazz.getMethod("returnType") 49 | 50 | udfClasses.split(",").map(_.trim).foreach(udfClass => { 51 | val clazz = Class.forName(udfClass, true, spark.sharedState.jarClassLoader) 52 | val annotation = clazz.getAnnotation(annotationClazz) 53 | val name: String = nameMethod.invoke(annotation).asInstanceOf[String] 54 | val returnType: String = returnTypeMethod.invoke(annotation).asInstanceOf[String] 55 | 56 | logInfo(s"register udf ${name} with class ${udfClass}") 57 | spark.udf.registerJava(name, udfClass, DataType.fromDDL(returnType)) 58 | }) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/ConfigUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.runner.container.ConfigContainer 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-02-17. 
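registerExternalUDFs always registers the built-in transform_date; external UDFs are added only when ConfigContainer carries spark.sql.externalUdfClasses, in which case the annotated classes are loaded from the UDF jar on HDFS. A local sketch that skips the external part (the local-mode session setup is an assumption of this example, not how the runner itself builds its session):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.udf.UDFFactory

object RegisterUdfExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("udf-demo")
      .getOrCreate()

    // No spark.sql.externalUdfClasses configured, so only transform_date is registered.
    UDFFactory.registerExternalUDFs(spark)

    spark.sql("SELECT transform_date('20210308', 'yyyyMMdd', 'yyyy-MM-dd') AS dt").show()
    // prints a single row: 2021-03-08

    spark.stop()
  }
}
```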
26 | */ 27 | object ConfigUtil { 28 | 29 | def ltrim(s: String): String = s.replaceAll("^\\s+", "") 30 | 31 | def rtrim(s: String): String = s.replaceAll("\\s+$", "") 32 | 33 | def trimConfigValue(configValue: String): String = rtrim(ltrim(configValue)) 34 | 35 | 36 | def trimConfigArray(configValue: String, separator: String): String = { 37 | configValue.split(separator) 38 | .map(trimConfigValue(_)) 39 | .mkString(separator) 40 | } 41 | 42 | def withConfigs[T](configs: (String, String)*)(func: => T): T = { 43 | val spark = SparkSession.active 44 | try { 45 | configs.foreach(config => { 46 | ConfigContainer :+ (config._1 -> config._2) 47 | spark.conf.set(config._1, config._2) 48 | }) 49 | 50 | func 51 | } finally { 52 | configs.foreach(config => { 53 | ConfigContainer - config._1 54 | spark.conf.unset(config._1) 55 | }) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/DQUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.spark.sql.runner.container.ConfigContainer 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-02-26. 25 | */ 26 | object DQUtil { 27 | 28 | val serverUrl = ConfigContainer.getOrElse("dataquality.alert", "") 29 | val title = s"${ConfigContainer.getOrElse(SystemVariables.ENV, SystemVariables.DEFAULT_ENV)}数据质量检查告警" 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/GenericAvroSchema.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
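withConfigs scopes a set of settings to a block: each pair is pushed into both ConfigContainer and the active SparkSession's conf, and removed again in the finally clause, which is handy for per-statement overrides. A sketch (again using a local session purely for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.runner.container.ConfigContainer
import org.apache.spark.sql.util.ConfigUtil

object WithConfigsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("cfg-demo").getOrCreate()

    ConfigUtil.withConfigs("spark.sql.shuffle.partitions" -> "8") {
      println(spark.conf.get("spark.sql.shuffle.partitions"))       // 8
      println(ConfigContainer.get("spark.sql.shuffle.partitions"))  // 8
    }
    println(ConfigContainer.contains("spark.sql.shuffle.partitions"))  // false

    // The trim helpers normalize comma-separated config values.
    println(ConfigUtil.trimConfigArray(" a , b ,c ", ","))   // a,b,c

    spark.stop()
  }
}
```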
16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import scala.beans.BeanProperty 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-02-26. 25 | */ 26 | case class GenericAvroSchema(@BeanProperty name: String, 27 | @BeanProperty namespace: String, 28 | @BeanProperty fields: Array[AvroField], 29 | @BeanProperty `type`: String = "record", 30 | @BeanProperty doc: String = "") 31 | 32 | case class AvroField(@BeanProperty name: String, 33 | @BeanProperty `type`: String, 34 | @BeanProperty doc: String = "") 35 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/JdbcConnector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import java.sql.{Connection, PreparedStatement, SQLException} 21 | 22 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 23 | import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} 24 | import org.apache.spark.sql.types._ 25 | 26 | /** 27 | * 1. 提供JDBC相关配置参数 28 | * 2. 提供JDBCOption实例 作为connect参数 29 | * 3. 提供JDBC相关操作util方法 30 | * 31 | * @author kun.wan, 32 | * @date 2019-12-11. 33 | */ 34 | class JdbcConnector(config: Map[String, String]) extends Logging { 35 | 36 | val tag: String = config.getOrElse( 37 | "tag", 38 | throw new IllegalArgumentException("config tag is needed.") 39 | ) 40 | 41 | /** 42 | * 1. Get ${tag.key} value from config map 43 | * 2. Return default value if or defaultValue is not empty 44 | * 3. 
throw parameter should be provided exception 45 | * 46 | * @param key 47 | * @param defaultValue 48 | * @return 49 | */ 50 | def getJdbcConfig(key: String, defaultValue: String = ""): String = { 51 | config.get(s"$tag.$key") match { 52 | case Some(v) => v 53 | case None if defaultValue != "" => defaultValue 54 | case None => throw new Exception(s"parameter $key should be provided!") 55 | } 56 | } 57 | 58 | val url = getJdbcConfig("url") 59 | val username = getJdbcConfig("username") 60 | val password = getJdbcConfig("password") 61 | val queryTimeout = getJdbcConfig("query.timeout", "180").toInt 62 | val tableName: String = config("tableName") 63 | 64 | val jdbcConnectOption: JDBCOptions = 65 | new JDBCOptions(Map( 66 | JDBCOptions.JDBC_URL -> url, 67 | "user" -> username, 68 | "password" -> password, 69 | JDBCOptions.JDBC_TABLE_NAME -> tableName, 70 | JDBCOptions.JDBC_QUERY_TIMEOUT -> queryTimeout.toString 71 | )) 72 | 73 | def getConnection(): Connection = JdbcUtils.createConnectionFactory(jdbcConnectOption)() 74 | 75 | def closeConnection(conn: Connection): Unit = { 76 | try { 77 | if (conn != null) { 78 | conn.close() 79 | } 80 | } catch { 81 | case ex: Exception => logError("close jdbc connection error!", ex) 82 | } 83 | } 84 | 85 | def withConnection[T](body: Connection => T): T = { 86 | val conn: Connection = getConnection() 87 | try { 88 | body(conn) 89 | } catch { 90 | case ex: Exception => 91 | logError("execute jdbc function error!", ex) 92 | throw ex 93 | } finally { 94 | closeConnection(conn) 95 | } 96 | } 97 | 98 | def getTableSchema(): StructType = { 99 | val tableSchemaOption = JdbcUtils.getSchemaOption(getConnection(), jdbcConnectOption) 100 | assert(tableSchemaOption.isDefined, s"Failed to get $tableName schema!") 101 | tableSchemaOption.get 102 | } 103 | 104 | /** 105 | * @param row 准备转换的Row数据 106 | * @param pstmt JDBC PreparedStatement 107 | * @param fields 需要转换的字段列表, pstmt在进行参数转换时的开始下标,默认为1 108 | */ 109 | def rowToPreparedStatement(row: GenericRowWithSchema, 110 | pstmt: PreparedStatement, 111 | fields: Seq[StructField]): Unit = { 112 | fields.zipWithIndex.map { 113 | case (field, fieldIndex) => 114 | field.dataType match { 115 | case _: BooleanType => 116 | pstmt.setBoolean(fieldIndex + 1, row.getAs(field.name)) 117 | case _: DoubleType => 118 | pstmt.setDouble(fieldIndex + 1, row.getAs(field.name)) 119 | case _: DecimalType => 120 | pstmt.setBigDecimal(fieldIndex + 1, row.getAs(field.name)) 121 | case _: FloatType => 122 | pstmt.setFloat(fieldIndex + 1, row.getAs(field.name)) 123 | case _: ByteType => 124 | pstmt.setByte(fieldIndex + 1, row.getAs(field.name)) 125 | case _: ShortType => 126 | pstmt.setShort(fieldIndex + 1, row.getAs(field.name)) 127 | case _: IntegerType => 128 | pstmt.setInt(fieldIndex + 1, row.getAs(field.name)) 129 | case _: LongType => 130 | pstmt.setLong(fieldIndex + 1, row.getAs(field.name)) 131 | case _: StringType => 132 | pstmt.setString(fieldIndex + 1, row.getAs(field.name)) 133 | case _: DateType => 134 | pstmt.setDate(fieldIndex + 1, row.getAs(field.name)) 135 | case _ => 136 | throw new IllegalArgumentException( 137 | s"Unsupported type ${field.dataType}" 138 | ) 139 | } 140 | } 141 | } 142 | 143 | var statementCounter: Long = 0 144 | 145 | def tryStatement[T](pstmt: PreparedStatement, row: Option[GenericRowWithSchema] = None) 146 | (body: PreparedStatement => Unit): Unit = { 147 | try { 148 | statementCounter.synchronized { 149 | if (pstmt != null) { 150 | body(pstmt) 151 | statementCounter = statementCounter + 1 152 | } 153 | if 
(statementCounter % 10000 == 0) { 154 | val updateCounts = pstmt.executeBatch 155 | logInfo(s"commit JDBC PreparedStatement,affected rows = ${updateCounts.length}, " + 156 | s"statement counter = ${statementCounter}") 157 | pstmt.clearParameters() 158 | } 159 | } 160 | } catch { 161 | case e: Exception => 162 | logError(s"debug message for pstmt : ${pstmt}, row : ${row}") 163 | throw e 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/JobIdUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import java.io.File 21 | import java.time.LocalDateTime 22 | import java.time.format.DateTimeFormatter 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2020-03-06. 27 | */ 28 | object JobIdUtil { 29 | 30 | def generatorJobId(jobFile: String): String = { 31 | val ts = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) 32 | val prefix = new File(jobFile).getName.stripSuffix(".xml") 33 | s"${prefix}-${ts}" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/Logging.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.slf4j.{Logger, LoggerFactory} 21 | 22 | /** 23 | * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows 24 | * logging messages at different levels using methods that only evaluate parameters lazily if the 25 | * log level is enabled. 
26 | * 27 | */ 28 | trait Logging { 29 | // Make the log field transient so that objects with Logging can 30 | // be serialized and used on another machine 31 | @transient private var log_ : Logger = null 32 | 33 | implicit def anyToString(any: Any): String = any.toString 34 | 35 | // Method to get the logger name for this object 36 | protected def logName = { 37 | // Ignore trailing $'s in the class names for Scala objects 38 | this.getClass.getName.stripSuffix("$") 39 | } 40 | 41 | // Method to get or create the logger for this object 42 | protected def log: Logger = { 43 | if (log_ == null) { 44 | log_ = LoggerFactory.getLogger(logName) 45 | } 46 | log_ 47 | } 48 | 49 | // Log methods that take only a String 50 | protected def logInfo(msg: => String) { 51 | if (log.isInfoEnabled) log.info(msg) 52 | } 53 | 54 | protected def logDebug(msg: => String) { 55 | if (log.isDebugEnabled) log.debug(msg) 56 | } 57 | 58 | protected def logTrace(msg: => String) { 59 | if (log.isTraceEnabled) log.trace(msg) 60 | } 61 | 62 | protected def logWarning(msg: => String) { 63 | if (log.isWarnEnabled) log.warn(msg) 64 | } 65 | 66 | protected def logError(msg: => String) { 67 | if (log.isErrorEnabled) log.error(msg) 68 | } 69 | 70 | // Log methods that take Throwables (Exceptions/Errors) too 71 | protected def logInfo(msg: => String, throwable: Throwable) { 72 | if (log.isInfoEnabled) log.info(msg, throwable) 73 | } 74 | 75 | protected def logDebug(msg: => String, throwable: Throwable) { 76 | if (log.isDebugEnabled) log.debug(msg, throwable) 77 | } 78 | 79 | protected def logTrace(msg: => String, throwable: Throwable) { 80 | if (log.isTraceEnabled) log.trace(msg, throwable) 81 | } 82 | 83 | protected def logWarning(msg: => String, throwable: Throwable) { 84 | if (log.isWarnEnabled) log.warn(msg, throwable) 85 | } 86 | 87 | protected def logError(msg: => String, throwable: Throwable) { 88 | if (log.isErrorEnabled) log.error(msg, throwable) 89 | } 90 | 91 | protected def isTraceEnabled(): Boolean = { 92 | log.isTraceEnabled 93 | } 94 | 95 | def runWithErrorLog[T](body: => T): T = { 96 | try { 97 | body 98 | } catch { 99 | case e: Exception => 100 | logError(s"find exception: $e") 101 | throw e 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/NextIterator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | /** Provides a basic/boilerplate Iterator implementation. 
*/ 21 | abstract class NextIterator[U] extends Iterator[U] { 22 | 23 | private var gotNext = false 24 | private var nextValue: U = _ 25 | private var closed = false 26 | protected var finished = false 27 | 28 | /** 29 | * Method for subclasses to implement to provide the next element. 30 | * 31 | * If no next element is available, the subclass should set `finished` 32 | * to `true` and may return any value (it will be ignored). 33 | * 34 | * This convention is required because `null` may be a valid value, 35 | * and using `Option` seems like it might create unnecessary Some/None 36 | * instances, given some iterators might be called in a tight loop. 37 | * 38 | * @return U, or set 'finished' when done 39 | */ 40 | def getNext(): U 41 | 42 | /** 43 | * Method for subclasses to implement when all elements have been successfully 44 | * iterated, and the iteration is done. 45 | * 46 | * Note: `NextIterator` cannot guarantee that `close` will be 47 | * called because it has no control over what happens when an exception 48 | * happens in the user code that is calling hasNext/next. 49 | * 50 | * Ideally you should have another try/catch, as in HadoopRDD, that 51 | * ensures any resources are closed should iteration fail. 52 | */ 53 | def close() 54 | 55 | /** 56 | * Calls the subclass-defined close method, but only once. 57 | * 58 | * Usually calling `close` multiple times should be fine, but historically 59 | * there have been issues with some InputFormats throwing exceptions. 60 | */ 61 | def closeIfNeeded() { 62 | if (!closed) { 63 | // Note: it's important that we set closed = true before calling close(), since setting it 64 | // afterwards would permit us to call close() multiple times if close() threw an exception. 65 | closed = true 66 | close() 67 | } 68 | } 69 | 70 | override def hasNext: Boolean = { 71 | if (!finished) { 72 | if (!gotNext) { 73 | nextValue = getNext() 74 | if (finished) { 75 | closeIfNeeded() 76 | } 77 | gotNext = true 78 | } 79 | } 80 | !finished 81 | } 82 | 83 | override def next(): U = { 84 | if (!hasNext) { 85 | throw new NoSuchElementException("End of stream") 86 | } 87 | gotNext = false 88 | nextValue 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/OptimizerUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.spark.sql.AnalysisException 21 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 22 | import org.apache.spark.sql.catalyst.expressions.Literal 23 | import org.apache.spark.sql.types.StringType 24 | 25 | /** 26 | * @author kun.wan, 27 | * @date 2021-03-08. 28 | */ 29 | object OptimizerUtil { 30 | 31 | def parseHintParameter(value: Any): String = { 32 | value match { 33 | case v: String => UnresolvedAttribute.parseAttributeName(v).mkString(".") 34 | case Literal(v, dt: StringType) => v.toString 35 | case v: UnresolvedAttribute => v.nameParts.mkString(".") 36 | case unsupported => throw new AnalysisException(s"Unable to parse : $unsupported") 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/ReflectUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-22. 23 | */ 24 | object ReflectUtils { 25 | 26 | /** 27 | * 通过反射执行private方法 28 | * @param clazz 29 | * @param name private 方法名 30 | * @param instance 方法执行的实例,如果是静态方法,直接传入null 31 | * @param parameterTypes 方法参数类型列表,无参数时传入空Seq() 32 | * @param parameters 方法参数实例列表,无参数时传入空Seq() 33 | */ 34 | def runMethod(clazz: Class[_], 35 | name: String, 36 | instance: Any, 37 | parameterTypes: Seq[Class[_]], 38 | parameters: Seq[Object]): Unit = { 39 | val method = clazz.getDeclaredMethod(name, parameterTypes: _*) 40 | method.setAccessible(true) 41 | method.invoke(instance, parameters: _*) 42 | } 43 | 44 | def setVariable(instance: Any, 45 | fieldName: String, 46 | value: Any): Unit = { 47 | val field = instance.getClass.getDeclaredField(fieldName) 48 | field.setAccessible(true) 49 | field.set(instance, value) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/StringUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.apache.commons.lang3.StringUtils 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2021-03-17. 25 | */ 26 | object StringUtil { 27 | 28 | val escapeMapping: Map[Array[Char], Array[Char]] = Map( 29 | Array('\"') -> Array('\"'), 30 | Array(''') -> Array('''), 31 | Array('(') -> Array(')'), 32 | ) 33 | 34 | def escapeStringValue(text: String): String = { 35 | var res = text.trim 36 | for ((startChars, endChars) <- escapeMapping 37 | if res.startsWith(new String(startChars)) && res.endsWith(new String(endChars))) { 38 | res = StringUtils.removeStart(res, new String(startChars)) 39 | res = StringUtils.removeEnd(res, new String(endChars)).trim 40 | } 41 | res 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/util/SystemVariables.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | /** 21 | * @author kun.wan, 22 | * @date 2021-02-24. 23 | */ 24 | object SystemVariables { 25 | 26 | val BATCH_TIME = "batch_time" 27 | val JOB_NAME = "job_name" 28 | val INDEX_COLUMN_NAME = "index_column" 29 | val PROJECT_JAR_NAME = "sql-runner-3.0.jar" 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/SQLRunnerSuiteUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql 19 | 20 | import java.io.File 21 | 22 | import org.apache.commons.io.FileUtils 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2020-09-16. 27 | */ 28 | object SQLRunnerSuiteUtils { 29 | 30 | def cleanTestHiveData(): Unit = { 31 | val metastoreDB = new File("metastore_db") 32 | if (metastoreDB.exists) { 33 | FileUtils.forceDelete(metastoreDB) 34 | } 35 | val sparkWarehouse = new File("spark-warehouse") 36 | if (sparkWarehouse.exists) { 37 | FileUtils.forceDelete(sparkWarehouse) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/SparkSqlRunnerBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql 19 | 20 | import org.apache.spark.sql.udf.UDFFactory 21 | import org.apache.spark.internal.config.Tests.IS_TESTING 22 | import org.apache.spark.sql.SQLRunnerSuiteUtils._ 23 | import org.apache.spark.sql.hive.SparkSqlRunner 24 | import org.apache.spark.sql.hive.test.TestHiveSingleton 25 | import org.apache.spark.sql.runner.command.SqlCommand 26 | import org.apache.spark.sql.test.SQLTestUtils 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2020-04-15. 31 | */ 32 | class SparkSqlRunnerBase extends QueryTest with SQLTestUtils with TestHiveSingleton { 33 | 34 | implicit val sparkImp: SparkSession = spark 35 | val sc = spark.sparkContext 36 | var runner: SparkSqlRunner = _ 37 | 38 | override def beforeAll(): Unit = { 39 | 40 | super.beforeAll() 41 | System.setProperty(IS_TESTING.key, "true") 42 | cleanTestHiveData() 43 | 44 | SparkSession.active.sharedState.externalCatalog.addListener(SqlRunnerCatalogEventListener()) 45 | UDFFactory.registerExternalUDFs(spark) 46 | 47 | runner = new SparkSqlRunner 48 | } 49 | 50 | 51 | override def afterAll() { 52 | cleanTestHiveData() 53 | SqlCommand.stop() 54 | super.afterAll() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/optimizer/CollectValueRuleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.QueryTest 21 | import org.apache.spark.sql.hive.SparkSqlRunner 22 | import org.apache.spark.sql.test.SQLTestData.TestData 23 | import org.apache.spark.sql.test.SQLTestUtils 24 | import org.scalatest.matchers.should.Matchers._ 25 | 26 | import org.apache.spark.sql.runner.callback.DataCallBackFactory 27 | import org.apache.spark.sql.runner.command.SqlCommand 28 | import org.apache.spark.sql.runner.container.CollectorContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-07-28. 33 | */ 34 | class CollectValueRuleSuite extends QueryTest with SQLTestUtils { 35 | 36 | override val spark = { 37 | System.setProperty("spark.master", "local[1]") 38 | SparkSqlRunner.sparkSession(Some("CollectValueRuleSuite")) 39 | } 40 | 41 | import spark.implicits._ 42 | 43 | override def beforeAll() { 44 | val df = spark.sparkContext.parallelize( 45 | (1 to 100).map(i => TestData(i, i.toString))).toDF() 46 | df.createOrReplaceTempView("testData") 47 | } 48 | 49 | override def afterAll(): Unit = { 50 | spark.close() 51 | } 52 | 53 | 54 | def runPartitionScanLimitRule(testQuery: String): Unit = { 55 | PartitionScanLimitRule(spark).apply( 56 | spark.sql(testQuery).queryExecution.optimizedPlan 57 | ) 58 | } 59 | 60 | def runAndComsume(sql: String): Unit = { 61 | DataCallBackFactory.consumeResult(SqlCommand.sparkSqlRunner.run(sql)) 62 | } 63 | 64 | test("test collect Hint") { 65 | runAndComsume( 66 | s"""SELECT /*+ COLLECT_VALUE('single_value', 'count_column') */ 67 | | /*+ COLLECT_VALUE('max_key', 'keyColumn') */ 68 | | count(1) as count_column, 69 | | concat('prefix_', max(key)) as keyColumn 70 | |from testData 71 | |""".stripMargin) 72 | CollectorContainer.get("single_value") should be(100) 73 | CollectorContainer.get("max_key") should be("prefix_100") 74 | 75 | runAndComsume( 76 | s"""SELECT /*+ COLLECT_ARRAY('intArray', 'key') */ 77 | | /*+ COLLECT_ARRAY('stringArray', 'value') */ 78 | | key, value 79 | |from testData 80 | |""".stripMargin) 81 | CollectorContainer.get("intArray") should be((1 to 100)) 82 | CollectorContainer.get("stringArray") should be((1 to 100).map(_.toString)) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/optimizer/ExternalTableRuleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.SQLRunnerSuiteUtils.cleanTestHiveData 21 | import org.apache.spark.sql.QueryTest 22 | import org.apache.spark.sql.hive.SparkSqlRunner 23 | import org.apache.spark.sql.test.SQLTestUtils 24 | import org.apache.spark.sql.util.ConfigUtil 25 | import org.scalatest.matchers.should.Matchers._ 26 | 27 | import org.apache.spark.sql.runner.command.SqlCommand 28 | import org.apache.spark.sql.runner.container.ConfigContainer 29 | 30 | /** 31 | * @author kun.wan, 32 | * @date 2020-09-15. 33 | */ 34 | class ExternalTableRuleSuite extends QueryTest with SQLTestUtils { 35 | 36 | override val spark = { 37 | System.setProperty("spark.master", "local[1]") 38 | SparkSqlRunner.sparkSession(Some("ExternalTableRuleSuite")) 39 | } 40 | 41 | val testPath = getClass.getResource("/") 42 | 43 | val bootstrapServers = "10.23.177.40:9092" 44 | val schemaRegistryUrl = "http://10.23.177.40:8081" 45 | 46 | override def beforeAll(): Unit = { 47 | cleanTestHiveData() 48 | 49 | ConfigContainer ++ Map( 50 | "mysql.url" -> "jdbc:mysql://localhost:3306/test", 51 | "mysql.username" -> "root", 52 | "mysql.password" -> "password", 53 | ) 54 | 55 | spark.sql(s"CREATE TABLE target(id int, name string) LOCATION '$testPath/target'") 56 | 57 | /** 58 | * mysql> desc stu; 59 | * +-------+------------+------+-----+---------+-------+ 60 | * | Field | Type | Null | Key | Default | Extra | 61 | * +-------+------------+------+-----+---------+-------+ 62 | * | id | int(11) | NO | PRI | NULL | | 63 | * | name | text | YES | | NULL | | 64 | * | sex | varchar(2) | YES | | NULL | | 65 | * | env | char(20) | YES | | NULL | | 66 | * +-------+------------+------+-----+---------+-------+ 67 | */ 68 | } 69 | 70 | override def afterAll() { 71 | cleanTestHiveData() 72 | spark.stop() 73 | super.afterAll() 74 | } 75 | 76 | test("query jdbc table") { 77 | ConfigUtil.withConfigs("mysql.stu.numPartitions" -> "3", "mysql.stu.partitionColumn" -> "id") { 78 | 79 | val df = spark.sql(s"""SELECT id, name 80 | |FROM jdbc.mysql.stu 81 | |where id < 10 82 | |""".stripMargin) 83 | df.rdd.partitions.length should equal(3) 84 | df.explain() 85 | df.show() 86 | } 87 | } 88 | 89 | test("query jdbc view") { 90 | ConfigUtil.withConfigs( 91 | "mysql.stu.query" -> "(select * from stu where name !='wankun') as q", 92 | "mysql.stu.numPartitions" -> "3", 93 | "mysql.stu.partitionColumn" -> "id") { 94 | 95 | val df = spark.sql(s"""SELECT id, name 96 | |FROM jdbc.mysql.stu 97 | |""".stripMargin) 98 | df.rdd.partitions.length should equal(3) 99 | df.show() 100 | } 101 | } 102 | 103 | test("write data frame to mysql table") { 104 | ConfigUtil.withConfigs( 105 | "mysql.stu.queryTimeout" -> 100.toString, 106 | "mysql.stu.uniqueKeys" -> "id") { 107 | new SqlCommand(s"""WITH t as ( 108 | | SELECT 100 as id, "user_100" as name 109 | | UNION ALL 110 | | SELECT 101 as id, "user_101" as name 111 | |) 112 | |INSERT INTO jdbc.mysql.stu 113 | |SELECT * 114 | |FROM t; 115 | |""".stripMargin).run() 116 | } 117 | } 118 | 119 | test("write json data frame to kafka 
table") { 120 | ConfigUtil.withConfigs( 121 | "kafka.bootstrap.servers" -> bootstrapServers, 122 | "kafka.stu.recordType" -> "json", 123 | "kafka.stu.kafkaTopic" -> "test_wankun") { 124 | new SqlCommand(s"""WITH t as ( 125 | | SELECT 100 as id, "user_100" as name 126 | | UNION ALL 127 | | SELECT 101 as id, "user_101" as name 128 | |) 129 | |INSERT INTO kafka.stu 130 | |SELECT * 131 | |FROM t; 132 | |""".stripMargin).run() 133 | } 134 | } 135 | 136 | test("write avro data frame to kafka using KAFKA_SINK") { 137 | ConfigUtil.withConfigs( 138 | "kafka.bootstrap.servers" -> bootstrapServers, 139 | "kafka.schema.registry.url" -> schemaRegistryUrl, 140 | "kafka.stu.recordType" -> "avro", 141 | "kafka.stu.kafkaTopic" -> "test_wankun2", 142 | // 不根据计算结果DDL自动生成Avro Schema,手动测试时,根据需要调整该参数 143 | "kafka.stu.avro.forceCreate" -> "false", 144 | "kafka.stu.avro.name" -> "student", 145 | "kafka.stu.avro.namespace" -> "com.wankun") { 146 | new SqlCommand(s"""INSERT INTO kafka.stu 147 | |SELECT 1 as id1, 'wankun' as name1, 148 | | '男' as sex1, 'PRD' env1, 18 age1; 149 | |""".stripMargin).run() 150 | } 151 | } 152 | 153 | /* 154 | test("send message with EMAIL_SINK") { 155 | ConfigUtil.withConfigs( 156 | // server config 157 | "email.hostname" -> "smtp.exmail.qq.com", 158 | "email.username" -> "test@leyantech.com", 159 | "email.password" -> "", 160 | "email.from" -> "test@leyantech.com", 161 | 162 | // job config 163 | "email.columns" -> "id, name", 164 | "email.columnNames" -> "ID,名称", 165 | "email.subject" -> "测试邮件", 166 | "email.email-to" -> "wankun@apache.org", 167 | "email.email-cc" -> "wankun@apache.org" 168 | ) { 169 | new SqlCommand( 170 | s"""SELECT /*+ EMAIL_SINK(email) */ 171 | | 1 as id, 'wankun' as name; 172 | |""".stripMargin).run() 173 | } 174 | } 175 | 176 | test("send message with DINGDING_SINK") { 177 | ConfigUtil.withConfigs( 178 | "dataquality.alert"-> "https://oapi.dingtalk.com/robot/send?access_token=test_token", 179 | "dataquality.alert.title" -> "测试钉钉告警", 180 | "dataquality.alert.pattern" -> "ID是{id},姓名:{name}" 181 | ) { 182 | new SqlCommand( 183 | s"""SELECT /*+ DINGDING_SINK(dataquality.alert) */ 184 | | 1 as id, 'wankun' as name; 185 | |""".stripMargin).run() 186 | } 187 | } 188 | */ 189 | 190 | } 191 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/optimizer/PartitionScanLimitRuleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.optimizer 19 | 20 | import org.apache.spark.sql.hive.test.TestHiveSingleton 21 | import org.apache.spark.sql.test.SQLTestUtils 22 | import org.apache.spark.sql.{AnalysisException, QueryTest} 23 | 24 | /** 25 | * @author kun.wan, 26 | * @date 2020-07-28. 27 | */ 28 | class PartitionScanLimitRuleSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { 29 | 30 | override def beforeAll(): Unit = { 31 | super.beforeAll() 32 | 33 | Seq("test1", "test2").map { tableName => 34 | sql( 35 | s""" 36 | |CREATE TABLE $tableName(i int) 37 | |PARTITIONED BY (p STRING) 38 | |STORED AS textfile""".stripMargin) 39 | sql(s"INSERT OVERWRITE TABLE $tableName PARTITION (p='1') select * from range(10)") 40 | } 41 | } 42 | 43 | override def afterAll(): Unit = { 44 | Seq("test1", "test2").map { tableName => 45 | sql(s"DROP TABLE IF EXISTS $tableName") 46 | } 47 | super.afterAll() 48 | } 49 | 50 | def runPartitionScanLimitRule(testQuery: String): Unit = { 51 | PartitionScanLimitRule(spark).apply( 52 | spark.sql(testQuery).queryExecution.optimizedPlan 53 | ) 54 | } 55 | 56 | test("no filters on partition table scan") { 57 | intercept[AnalysisException] { 58 | runPartitionScanLimitRule("SELECT i FROM test1") 59 | } 60 | 61 | runPartitionScanLimitRule("SELECT i FROM test1 where p='1'") 62 | runPartitionScanLimitRule( 63 | s""" 64 | |WITH t as ( 65 | | SELECT count(1) as c 66 | | FROM test1 67 | | WHERE p='1' 68 | |) 69 | |SELECT * FROM t 70 | |""".stripMargin) 71 | } 72 | 73 | test("no filters on partition table join") { 74 | intercept[AnalysisException] { 75 | runPartitionScanLimitRule( 76 | s""" 77 | |SELECT * 78 | |FROM (SELECT i FROM test1 where p='1') t1 79 | |JOIN test2 t2 80 | |ON t1.i > t2.i 81 | |""".stripMargin) 82 | } 83 | 84 | runPartitionScanLimitRule( 85 | s""" 86 | |SELECT * 87 | |FROM (SELECT i FROM test1 where p='1') t1 88 | |JOIN test2 t2 89 | |ON t1.i > t2.i 90 | |AND t2.p = '1' 91 | |""".stripMargin) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/runner/ArgParserSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner 19 | 20 | import org.apache.spark.sql.util.Logging 21 | import JobRunner.dateRangeStep 22 | import org.scalatest.funsuite.AnyFunSuite 23 | import org.scalatest.matchers.should.Matchers._ 24 | 25 | import java.time.LocalDateTime 26 | import java.time.temporal.ChronoUnit 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2021-02-04. 
31 | */ 32 | class ArgParserSuite extends AnyFunSuite with Logging { 33 | 34 | test("test time range option") { 35 | val startDate = Some(LocalDateTime.parse("2021-01-01T00:00:00")) 36 | val endDate = Some(LocalDateTime.parse("2021-01-06T00:00:00")) 37 | 38 | val rangeSize = ChronoUnit.DAYS.between(startDate.get, endDate.get) 39 | Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusDays(i)) should 40 | be(Seq(LocalDateTime.parse("2021-01-01T00:00:00"), 41 | LocalDateTime.parse("2021-01-02T00:00:00"), 42 | LocalDateTime.parse("2021-01-03T00:00:00"), 43 | LocalDateTime.parse("2021-01-04T00:00:00"), 44 | LocalDateTime.parse("2021-01-05T00:00:00"), 45 | LocalDateTime.parse("2021-01-06T00:00:00"))) 46 | 47 | 48 | dateRangeStep = 2 49 | Range.inclusive(0, rangeSize.toInt, dateRangeStep).map(i => startDate.get.plusDays(i)) should 50 | be(Seq(LocalDateTime.parse("2021-01-01T00:00:00"), 51 | LocalDateTime.parse("2021-01-03T00:00:00"), 52 | LocalDateTime.parse("2021-01-05T00:00:00"))) 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/runner/command/CommandSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.command 19 | 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2021-02-24. 
26 | */ 27 | class CommandSuite extends AnyFunSuite { 28 | 29 | val textHeader = 30 | s"""/************************************************ 31 | | 32 | | author: kun.wan 33 | | period: day 34 | | run_env: PRD & PRE 35 | | describe: policy_store_config 店铺数据量检查 36 | | app.id: 303 37 | | 38 | |************************************************/ 39 | |""".stripMargin 40 | 41 | test("test parse job text") { 42 | val text = 43 | s"""$textHeader 44 | |-- 测试一下单行注释 45 | | 46 | |!set mykey=myvalue; 47 | |!set longKey = \"( 48 | |select * 49 | |from tab 50 | |WHERE dates = '{date | yyyy - MM - dd}' 51 | |) as q\"; 52 | | 53 | |SELECT id, name 54 | |FROM test_db.test_name 55 | |WHERE id in ('001', '002'); 56 | | 57 | |-- 测试SQL中包含引号 58 | |SELECT 'a;b' as a, "abc;hhh" as b,'a\\'b' as c; 59 | |""".stripMargin 60 | 61 | val commands = CommandFactory.parseCommands(text) 62 | 63 | commands.length should be(7) 64 | } 65 | 66 | test("test parse if command") { 67 | Seq("kun.wan", "King").map { username => 68 | val text = 69 | s"""$textHeader 70 | |!set user = $username; 71 | |!if (user = 'kun.wan') 72 | | select 'if command'; 73 | |!else 74 | | select 'else command'; 75 | |!fi 76 | |""".stripMargin 77 | 78 | val commands = CommandFactory.parseCommands(text) 79 | 80 | commands.length should be(3) 81 | 82 | commands.foreach(_.run()) 83 | } 84 | 85 | val text = 86 | s"""$textHeader 87 | | 88 | |SELECT /*+ COLLECT_VALUE('row_count', 'c') */ count(1) as c; 89 | |SELECT /*+ COLLECT_VALUE('row_count2', 'd') */ count(1) as d; 90 | | 91 | |!if (row_count = row_count2 and row_count = 1) 92 | | select 'row count is 1'; 93 | |!else 94 | | select 'row count is not 1'; 95 | |!fi 96 | |""".stripMargin 97 | 98 | val commands = CommandFactory.parseCommands(text) 99 | 100 | commands.length should be(4) 101 | 102 | commands.foreach(_.run()) 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/runner/config/VariableSubstitutionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.runner.config 19 | 20 | import org.apache.spark.sql.util.SystemVariables 21 | import org.apache.sql.runner.container.ConfigContainer 22 | import org.scalatest.funsuite.AnyFunSuite 23 | import org.scalatest.matchers.should.Matchers._ 24 | import java.time.LocalDateTime 25 | 26 | import org.apache.spark.sql.runner.container.{CollectorContainer, ConfigContainer} 27 | 28 | /** 29 | * @author kun.wan, 30 | * @date 2019-12-10. 
31 | */ 32 | class VariableSubstitutionSuite extends AnyFunSuite { 33 | 34 | test("test time variable") { 35 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> LocalDateTime.parse("2019-08-07T13:25:41")) 36 | val substitution = new VariableSubstitution() 37 | 38 | substitution.dateParameter("${date}") should be("20190807") 39 | substitution.dateParameter("${date + 2d}") should be("20190809") 40 | substitution.dateParameter("${date + 2d |yyyyMMddHH}") should be("2019080913") 41 | substitution.dateParameter("${date + 2d |yyyyMM00}") should be("20190800") 42 | substitution.dateParameter("${date + 2d |yyyy-MM-dd}") should be("2019-08-09") 43 | substitution.dateParameter("${date + 2d |yyyy_MM_dd}") should be("2019_08_09") 44 | substitution.dateParameter("${date-2m|yyyy-MM-dd HH:mm:ss}") should be("2019-08-07 13:23:41") 45 | 46 | substitution.dateParameter("${date+2d}") should be("20190809") 47 | substitution.dateParameter("${date+4y}") should be("20230807") 48 | 49 | substitution.dateParameter("${date+2D}") should be("20190809") 50 | substitution.dateParameter("${date+3M}") should be("20191107") 51 | substitution.dateParameter("${date+4Y}") should be("20230807") 52 | 53 | substitution.dateParameter("${date-2d}") should be("20190805") 54 | substitution.dateParameter("${date-4y}") should be("20150807") 55 | 56 | substitution.dateParameter("${date-2D}") should be("20190805") 57 | substitution.dateParameter("${date-3M}") should be("20190507") 58 | 59 | substitution.dt should be("20190807") 60 | substitution.yesterday should be("20190806") 61 | substitution.tomorrow should be("20190808") 62 | substitution.hour should be("2019080713") 63 | substitution.lastHour should be("2019080712") 64 | substitution.nextHour should be("2019080714") 65 | } 66 | 67 | test("test variable substitution in sql") { 68 | ConfigContainer :+ ("ab_target" -> "after_trade") 69 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> LocalDateTime.parse("2019-08-07T13:25:41")) 70 | val substitution = new VariableSubstitution() 71 | 72 | substitution.substitute( 73 | """ 74 | |SELECT count(1) 75 | |FROM tab 76 | |WHERE start_date = '${yesterday}' 77 | |AND end_date = '${dt}' 78 | |AND start_hour = '${date-23H|hh}' 79 | |AND end_hour = '${date - 24h|hh}' 80 | |AND month = '${date - 24h|MM}' 81 | |AND ab_target = '${ab_target}' 82 | |""".stripMargin) should equal( 83 | s""" 84 | |SELECT count(1) 85 | |FROM tab 86 | |WHERE start_date = '20190806' 87 | |AND end_date = '20190807' 88 | |AND start_hour = '02' 89 | |AND end_hour = '01' 90 | |AND month = '08' 91 | |AND ab_target = 'after_trade' 92 | |""".stripMargin) 93 | } 94 | 95 | test("test nested variable substitution in sql") { 96 | ConfigContainer :+ ("report_days" -> "3") 97 | CollectorContainer :+ (SystemVariables.BATCH_TIME -> LocalDateTime.parse("2019-08-07T13:25:41")) 98 | val substitution = new VariableSubstitution() 99 | substitution.substitute("SELECT * FROM tab WHERE dt = ${date-${report_days}d|yyyyMMdd}") should 100 | equal("SELECT * FROM tab WHERE dt = 20190804") 101 | } 102 | 103 | test("test parameters with default value") { 104 | val substitution = new VariableSubstitution() 105 | substitution.substitute("!set key1 = ${key1, 'DEFAULT_VALUE1'};") should 106 | equal("!set key1 = DEFAULT_VALUE1;") 107 | 108 | substitution.substitute("!set key1 = ${key1, \"DEFAULT_VALUE1\"};") should 109 | equal("!set key1 = DEFAULT_VALUE1;") 110 | 111 | ConfigContainer :+ ("key1" -> "value1") 112 | 113 | substitution.substitute("!set key1 = ${key1, 'DEFAULT_VALUE1'};") should 114 | 
equal("!set key1 = value1;") 115 | 116 | substitution.substitute("!set key1 = ${key1, \"DEFAULT_VALUE1\"};") should 117 | equal("!set key1 = value1;") 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/udf/DateFormatUDFSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.udf 19 | 20 | import org.apache.spark.sql.{Row, SparkSqlRunnerBase} 21 | 22 | /** 23 | * @author kun.wan, 24 | * @date 2020-07-20. 25 | */ 26 | class DateFormatUDFSuite extends SparkSqlRunnerBase { 27 | 28 | test("test date_format function") { 29 | val df = spark.sql("select transform_date('20200710','yyyyMMdd','yyyy-MM-dd')") 30 | checkAnswer(df, Seq(Row("2020-07-10"))) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/util/ConfigUtilSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-02-17. 
26 | */ 27 | class ConfigUtilSuite extends AnyFunSuite { 28 | 29 | 30 | test("trim config array") { 31 | 32 | val columnName = "\n 日期,店铺id,店铺名,买家付款\n " 33 | val dbColumnName = "\n {dt},{store_id},{store_name},{buyer_payment}," + 34 | "{buyer_prepaid}," + 35 | "{inquiry_tailing},{no_order_try},\n {size_query_succeeded},{applicable_season}," + 36 | "{enable_filter_applicable_season},{chat_expires_at},{r2_expires_at},{audit_expires_at}\n " + 37 | " "; 38 | ConfigUtil.trimConfigValue(columnName) should be("日期,店铺id,店铺名,买家付款") 39 | 40 | ConfigUtil.trimConfigArray(dbColumnName, ",") should be( 41 | "{dt},{store_id},{store_name},{buyer_payment},{buyer_prepaid},{inquiry_tailing}," + 42 | "{no_order_try},{size_query_succeeded},{applicable_season}," + 43 | "{enable_filter_applicable_season},{chat_expires_at},{r2_expires_at},{audit_expires_at}") 44 | 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/util/JobIdUtilSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.util 19 | 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | /** 24 | * @author kun.wan, 25 | * @date 2020-03-06. 26 | */ 27 | class JobIdUtilSuite extends AnyFunSuite { 28 | 29 | test("test generatorJobId") { 30 | val jobId = JobIdUtil.generatorJobId("conf/marketing/pdd/dwd_payment_reminder_detail.xml") 31 | jobId should fullyMatch regex ("""dwd_payment_reminder_detail-\d{8}_\d{6}""") 32 | } 33 | 34 | } 35 | --------------------------------------------------------------------------------
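For reference, a minimal usage sketch of `JdbcConnector`'s `tag`-prefixed configuration convention (`$tag.url`, `$tag.username`, `$tag.password`, optional `$tag.query.timeout`, plus the separate `tableName` key). The connection values mirror the `mysql.stu` settings registered in `ExternalTableRuleSuite`; the `JdbcConnectorSketch` wrapper object and the count query are illustrative assumptions, not code from the repository:

```scala
import java.sql.ResultSet

import org.apache.spark.sql.util.JdbcConnector

// Hypothetical driver object, for illustration only.
object JdbcConnectorSketch {
  def main(args: Array[String]): Unit = {
    // getJdbcConfig resolves "url", "username", "password" (and "query.timeout",
    // default 180s) under the "$tag." prefix; "tableName" is read directly.
    val connector = new JdbcConnector(Map(
      "tag"            -> "mysql",
      "tableName"      -> "stu",
      "mysql.url"      -> "jdbc:mysql://localhost:3306/test",
      "mysql.username" -> "root",
      "mysql.password" -> "password"
    ))

    // withConnection opens a connection, runs the body, logs and rethrows on
    // failure, and always closes the connection in the finally block.
    val rowCount: Int = connector.withConnection { conn =>
      val rs: ResultSet = conn
        .prepareStatement(s"SELECT count(1) FROM ${connector.tableName}")
        .executeQuery()
      rs.next()
      rs.getInt(1)
    }

    println(s"rows in ${connector.tableName}: $rowCount")
    println(connector.getTableSchema()) // StructType via JdbcUtils.getSchemaOption
  }
}
```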